
Merged branch develop into master

Mark Sibly 8 years ago
parent
commit
e37b2442fc
100 changed files with 31,983 additions and 138 deletions
  1. Binary
      bananas/theoratest/konqi.ogv
  2. 105 0
      bananas/theoratest/theoratest.monkey2
  3. Binary
      bananas/zipfiletest/blah.zip
  4. 24 0
      bananas/zipfiletest/zipfiletest.monkey2
  5. Binary
      bin/mx2cc_linux
  6. Binary
      bin/mx2cc_macos
  7. Binary
      bin/mx2cc_windows.exe
  8. 8 0
      modules/mojo/app/window.monkey2
  9. 9 0
      modules/mojo/graphics/image.monkey2
  10. 1 1
      modules/monkey/native/bbarray.h
  11. 0 2
      modules/std/geom/affinemat3.monkey2
  12. 64 27
      modules/std/geom/affinemat4.monkey2
  13. 21 0
      modules/std/geom/box.monkey2
  14. 121 41
      modules/std/geom/mat3.monkey2
  15. 119 14
      modules/std/geom/mat4.monkey2
  16. 32 52
      modules/std/geom/quat.monkey2
  17. 6 0
      modules/std/geom/vec3.monkey2
  18. 20 0
      modules/std/geom/vec4.monkey2
  19. 36 1
      modules/std/misc/zipfile.monkey2
  20. 154 0
      modules/theoraplayer/makefile.monkey2
  21. 8 0
      modules/theoraplayer/module.json
  22. 27 0
      modules/theoraplayer/native/LICENSE
  23. 235 0
      modules/theoraplayer/native/OpenAL_AudioInterface.cpp
  24. 77 0
      modules/theoraplayer/native/OpenAL_AudioInterface.h
  25. 33 0
      modules/theoraplayer/native/monkey2_glue.cpp
  26. 21 0
      modules/theoraplayer/native/monkey2_glue.h
  27. 28 0
      modules/theoraplayer/native/ogg/COPYING
  28. 11 0
      modules/theoraplayer/native/ogg/include/ogg/config_types.h
  29. 25 0
      modules/theoraplayer/native/ogg/include/ogg/config_types.h.in
  30. 210 0
      modules/theoraplayer/native/ogg/include/ogg/ogg.h
  31. 147 0
      modules/theoraplayer/native/ogg/include/ogg/os_types.h
  32. 15 0
      modules/theoraplayer/native/ogg/libtheoraplayer-readme.txt
  33. 857 0
      modules/theoraplayer/native/ogg/src/bitwise.c
  34. 2111 0
      modules/theoraplayer/native/ogg/src/framing.c
  35. 28 0
      modules/theoraplayer/native/theora/COPYING
  36. 606 0
      modules/theoraplayer/native/theora/include/theora/codec.h
  37. 786 0
      modules/theoraplayer/native/theora/include/theora/theora.h
  38. 329 0
      modules/theoraplayer/native/theora/include/theora/theoradec.h
  39. 548 0
      modules/theoraplayer/native/theora/include/theora/theoraenc.h
  40. 53 0
      modules/theoraplayer/native/theora/lib/Version_script
  41. 82 0
      modules/theoraplayer/native/theora/lib/Version_script-dec
  42. 43 0
      modules/theoraplayer/native/theora/lib/Version_script-enc
  43. 2712 0
      modules/theoraplayer/native/theora/lib/analyze.c
  44. 166 0
      modules/theoraplayer/native/theora/lib/apiwrapper.c
  45. 54 0
      modules/theoraplayer/native/theora/lib/apiwrapper.h
  46. 304 0
      modules/theoraplayer/native/theora/lib/arm/arm2gnu.pl
  47. 231 0
      modules/theoraplayer/native/theora/lib/arm/armbits.asm
  48. 32 0
      modules/theoraplayer/native/theora/lib/arm/armbits.h
  49. 230 0
      modules/theoraplayer/native/theora/lib/arm/armbits.s
  50. 116 0
      modules/theoraplayer/native/theora/lib/arm/armcpu.c
  51. 29 0
      modules/theoraplayer/native/theora/lib/arm/armcpu.h
  52. 57 0
      modules/theoraplayer/native/theora/lib/arm/armenc.c
  53. 51 0
      modules/theoraplayer/native/theora/lib/arm/armenc.h
  54. 220 0
      modules/theoraplayer/native/theora/lib/arm/armencfrag.s
  55. 162 0
      modules/theoraplayer/native/theora/lib/arm/armenquant.s
  56. 656 0
      modules/theoraplayer/native/theora/lib/arm/armfrag.asm
  57. 655 0
      modules/theoraplayer/native/theora/lib/arm/armfrag.s
  58. 1854 0
      modules/theoraplayer/native/theora/lib/arm/armidct.asm
  59. 1853 0
      modules/theoraplayer/native/theora/lib/arm/armidct.s
  60. 126 0
      modules/theoraplayer/native/theora/lib/arm/armint.h
  61. 677 0
      modules/theoraplayer/native/theora/lib/arm/armloop.asm
  62. 676 0
      modules/theoraplayer/native/theora/lib/arm/armloop.s
  63. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts-gnu.s
  64. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts.s
  65. 39 0
      modules/theoraplayer/native/theora/lib/arm/armopts.s.in
  66. 219 0
      modules/theoraplayer/native/theora/lib/arm/armstate.c
  67. 236 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armbits.asm
  68. 32 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armbits.h
  69. 127 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.c
  70. 29 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.h
  71. 57 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armenc.c
  72. 51 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armenc.h
  73. 668 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armfrag.asm
  74. 1886 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armidct.asm
  75. 126 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armint.h
  76. 691 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armloop.asm
  77. 219 0
      modules/theoraplayer/native/theora/lib/arm_llvm/armstate.c
  78. 114 0
      modules/theoraplayer/native/theora/lib/bitpack.c
  79. 76 0
      modules/theoraplayer/native/theora/lib/bitpack.h
  80. 153 0
      modules/theoraplayer/native/theora/lib/c64x/c64xdec.c
  81. 33 0
      modules/theoraplayer/native/theora/lib/c64x/c64xdec.h
  82. 447 0
      modules/theoraplayer/native/theora/lib/c64x/c64xfrag.c
  83. 415 0
      modules/theoraplayer/native/theora/lib/c64x/c64xidct.c
  84. 67 0
      modules/theoraplayer/native/theora/lib/c64x/c64xint.h
  85. 39 0
      modules/theoraplayer/native/theora/lib/c64x/c64xstate.c
  86. 974 0
      modules/theoraplayer/native/theora/lib/collect.c
  87. 109 0
      modules/theoraplayer/native/theora/lib/collect.h
  88. 31 0
      modules/theoraplayer/native/theora/lib/dct.h
  89. 193 0
      modules/theoraplayer/native/theora/lib/decapiwrapper.c
  90. 250 0
      modules/theoraplayer/native/theora/lib/decinfo.c
  91. 186 0
      modules/theoraplayer/native/theora/lib/decint.h
  92. 2992 0
      modules/theoraplayer/native/theora/lib/decode.c
  93. 27 0
      modules/theoraplayer/native/theora/lib/defexp.awk
  94. 182 0
      modules/theoraplayer/native/theora/lib/dequant.c
  95. 27 0
      modules/theoraplayer/native/theora/lib/dequant.h
  96. 168 0
      modules/theoraplayer/native/theora/lib/encapiwrapper.c
  97. 379 0
      modules/theoraplayer/native/theora/lib/encfrag.c
  98. 121 0
      modules/theoraplayer/native/theora/lib/encinfo.c
  99. 845 0
      modules/theoraplayer/native/theora/lib/encint.h
  100. 1836 0
      modules/theoraplayer/native/theora/lib/encode.c

Binary
bananas/theoratest/konqi.ogv


+ 105 - 0
bananas/theoratest/theoratest.monkey2

@@ -0,0 +1,105 @@
+Namespace myapp
+
+#Import "<std>"
+#Import "<mojo>"
+#Import "<theoraplayer>"
+
+#Import "konqi.ogv"
+
+Using std..
+Using mojo..
+Using theoraplayer..
+
+Class MyWindow Extends Window
+
+	Field audiofactory:AudioInterfaceFactory
+	
+	Field vidman:VideoManager
+	
+	Field data:DataBuffer
+	
+	Field vidclip:VideoClip
+	
+	Field image:Image
+	
+	Field time:Double
+	
+	Field gain:float=1
+
+	Method New( title:String="Simple mojo app",width:Int=640,height:Int=480,flags:WindowFlags=WindowFlags.Resizable )
+
+		Super.New( title,width,height,flags )
+		
+		vidman=VideoManager.getInstance()
+		
+		audiofactory=New OpenAL_AudioInterfaceFactory
+		
+		vidman.setAudioInterfaceFactory( audiofactory )
+		
+		data=DataBuffer.Load( "asset::konqi.ogv" )
+		
+		vidclip=vidman.createVideoClip( data.Data,data.Length )
+'		vidclip=vidman.createVideoClip( AssetsDir()+"konqi.ogv" )
+		If Not vidclip Print "Can't load vidclip!"
+		
+		image=New Image( vidclip.getWidth(),vidclip.getHeight(),PixelFormat.RGB24,TextureFlags.Dynamic )
+		
+		vidclip.play()
+		
+		time=Now()
+		
+	End
+
+	Method OnRender( canvas:Canvas ) Override
+	
+		App.RequestRender()
+		
+		Local now:=Now()
+		
+		Local elapsed:=now-time
+		
+		time=now
+		
+		vidman.update( elapsed )
+	
+'		vidclip.updateTimerToNextFrame()	'play full speed...
+		
+		Local frame:=vidclip.fetchNextFrame()
+		
+		If frame
+		
+			Local pixmap:=New Pixmap( vidclip.getWidth(),vidclip.getHeight(),PixelFormat.RGB24,frame.getBuffer(),vidclip.getWidth()*3 )
+			
+			image.Texture.PastePixmap( pixmap,0,0 )
+			
+			vidclip.popFrame()
+		
+		Endif
+		
+		If Keyboard.KeyPressed( Key.Up )
+			gain=Min( gain+.125,1.0 )
+			vidclip.setAudioGain( gain )
+		Else If Keyboard.KeyPressed( Key.Down )
+			gain=Max( gain-.125,0.0 )
+			vidclip.setAudioGain( gain )
+		Endif
+		
+		canvas.BlendMode=BlendMode.Opaque
+		
+		canvas.DrawRect( 0,0,Width,Height,image )
+		
+		canvas.DrawText( "Time position="+vidclip.getTimePosition()+", duration="+vidclip.getDuration(),0,0 )
+		
+		canvas.DrawText( "Audio gain="+vidclip.getAudioGain(),0,16 )
+	End
+	
+End
+
+Function Main()
+
+	New AppInstance
+	
+	New MyWindow
+	
+	App.Run()
+End

Binary
bananas/zipfiletest/blah.zip


+ 24 - 0
bananas/zipfiletest/zipfiletest.monkey2

@@ -0,0 +1,24 @@
+
+'Note: ZipFile is VERY WIP!
+'
+#Import "<std>"
+
+#Import "blah.zip@/"
+
+Using std..
+
+Function Main()
+
+	Local zip:=ZipFile.Open( "asset::blah.zip" )
+	
+	For Local file:=Eachin zip.Files
+		Print file
+	Next
+	
+	Local str:=zip.ExtractData( "geom/affinemat3.monkey2" ).PeekString( 0 )
+	
+	Print str
+	
+	zip.Close()
+
+End

Binary
bin/mx2cc_linux


Binary
bin/mx2cc_macos


Binary
bin/mx2cc_windows.exe


+ 8 - 0
modules/mojo/app/window.monkey2

@@ -89,10 +89,18 @@ Class Window Extends View
 	End
 	
 	#rem monkeydoc Window fullscreen state.
+	
+	Note: The setter for this property is deprecated! Please use BeginFullscreen/EndFullscreen instead.
+	
 	#end
 	Property Fullscreen:Bool()
 	
 		Return Cast<SDL_WindowFlags>( SDL_GetWindowFlags( _sdlWindow ) ) & SDL_WINDOW_FULLSCREEN
+		
+	Setter( fullscreen:Bool )
+		If fullscreen=Fullscreen Return
+	
+		If fullscreen BeginFullscreen() Else EndFullscreen()
 	End
 
 	#rem monkeydoc Window maximized state.

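A minimal usage sketch (not part of this commit) for the calls the new note recommends, written as it might appear inside a Window subclass such as the MyWindow banana above; Keyboard.KeyHit and Key.F11 are assumed here and are not shown in this diff:

	' Toggle fullscreen with the preferred BeginFullscreen/EndFullscreen calls
	' instead of assigning to the deprecated Fullscreen setter.
	If Keyboard.KeyHit( Key.F11 )	'Keyboard.KeyHit and Key.F11 assumed
		If Fullscreen EndFullscreen() Else BeginFullscreen()
	Endif
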
+ 9 - 0
modules/mojo/graphics/image.monkey2

@@ -72,6 +72,15 @@ Class Image Extends Resource
 		AddDependancy( texture )
 	End
 
+	Method New( width:Int,height:Int,format:PixelFormat,textureFlags:TextureFlags=Null,shader:Shader=Null )
+	
+		Local texture:=New Texture( width,height,format,textureFlags )
+		
+		Init( texture,texture.Rect,shader )
+		
+		AddDependancy( texture )
+	End
+
 	Method New( image:Image )
 	
 		Init( image._textures[0],image._rect,image._shader )

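A short sketch (not part of this commit) exercising the new width/height Image constructor, following the same streaming pattern as the theoratest banana above; the 320x240 size and the empty DataBuffer stand in for real pixel data:

	' Create a dynamic RGB image and upload raw pixels into its texture.
	Local image:=New Image( 320,240,PixelFormat.RGB24,TextureFlags.Dynamic )

	Local buf:=New DataBuffer( 320*240*3 )	'raw RGB24 pixels would go here
	Local pixmap:=New Pixmap( 320,240,PixelFormat.RGB24,buf.Data,320*3 )

	image.Texture.PastePixmap( pixmap,0,0 )
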
+ 1 - 1
modules/monkey/native/bbarray.h

@@ -190,7 +190,7 @@ template<class T,int D> bbString bbDBType( bbArray<T,D> *p ){
 
 template<class T,int D> bbString bbDBValue( bbArray<T,D> *p ){
 	char buf[64];
-	sprintf( buf,"@%p",p->_rep );
+	sprintf( buf,"@%p",*(void**)(&p->_rep) );
 	return buf;
 }
 

+ 0 - 2
modules/std/geom/affinemat3.monkey2

@@ -144,8 +144,6 @@ Struct AffineMat3<T>
 		Return New AffineMat3( sv.x,0,0,sv.y,0,0 )
 	End
 
-	#rem monkeydoc @hidden
-	#end
 	Function Ortho:AffineMat3( left:T,right:T,bottom:T,top:T )
 
 		Local w:=right-left,h:=top-bottom

+ 64 - 27
modules/std/geom/affinemat4.monkey2

@@ -15,33 +15,33 @@ Affine 4x4 matrices are often used for 3d transformations such as scaling, rotat
 Struct AffineMat4<T>
 
 	Field m:Mat3<T>
-	Field v:Vec3<T>
+	Field t:Vec3<T>
 	
 	Method New()
 		m.i.x=1; m.j.y=1; m.k.z=1
 	End
 	
-	Method New( m:Mat3<T>,v:Vec3<T> )
-		Self.m=m; Self.v=v
+	Method New( m:Mat3<T>,t:Vec3<T> )
+		Self.m=m; Self.t=t
 	End
 	
 	Method New( m:Mat3<T> )
 		Self.m=m
 	End
 	
-	Method New( v:Vec3<T> )
-		m.i.x=1; m.j.y=1; m.k.z=1 ; Self.v=v
+	Method New( t:Vec3<T> )
+		m.i.x=1; m.j.y=1; m.k.z=1 ; Self.t=t
 	End
 	
-	Method New( i:Vec3<T>,j:Vec3<T>,k:Vec3<T>,v:Vec3<T> )
-		m.i=i; m.j=j; m.k=k; Self.v=v
+	Method New( i:Vec3<T>,j:Vec3<T>,k:Vec3<T>,t:Vec3<T> )
+		m.i=i; m.j=j; m.k=k; Self.t=t
 	End
 
 	Method New( ix:T,iy:T,iz:T,jx:T,jy:T,jz:T,kx:T,ky:T,kz:T,vx:T,vy:T,vz:T )
 		m.i.x=ix; m.i.y=iy; m.i.z=iz
 		m.j.x=jx; m.j.y=jy; m.j.z=jz
 		m.k.x=kx; m.k.y=ky; m.k.z=kz
-		v.x=vx; v.y=vy; v.z=vz
+		t.x=vx; t.y=vy; t.z=vz
 	End
 	
 	#rem monkeydoc Converts the matrix to a matrix of a different type.
@@ -53,72 +53,109 @@ Struct AffineMat4<T>
 	#rem monkeydoc Converts the matrix to a printable string.
 	#end
 	Operator To:String()
-		Return "AffineMat4("+m+","+v+")"
+		Return "AffineMat4("+m+","+t+")"
 	End
 	
 	#rem monkeydoc Returns the transpose of the matrix.
 	#End
 	Operator~:AffineMat4()
 		Local i:=~m
-		Return New AffineMat4( i,i*-v )
+		Return New AffineMat4( i,i*-t )
 	End
 	
 	#rem monkeydoc Returns the inverse of the matrix.
 	#end
 	Operator-:AffineMat4()
 		Local i:=-m
-		Return New AffineMat4( i,i*-v )
+		Return New AffineMat4( i,i*-t )
 	End
 	
 	#rem monkeydoc Multiplies the matrix by another matrix and returns the result.
 	#end
 	Operator*:AffineMat4( q:AffineMat4 )
-		Return New AffineMat4( m*q.m,m*q.v+v )
+		Return New AffineMat4( m*q.m,m*q.t+t )
 	End
 	
 	#rem monkeydoc Multiplies a vector by the matrix and returns the result.
 	#end
 	Operator*:Vec3<T>( v:Vec3<T> )
 		Return New Vec3<T>( 
-			m.i.x*v.x+m.j.x*v.y+m.k.x*v.z+v.x,
-			m.i.y*v.x+m.j.y*v.y+m.k.y*v.z+v.y,
-			m.i.z*v.x+m.j.z*v.y+m.k.z*v.z+v.z )
+			m.i.x*v.x+m.j.x*v.y+m.k.x*v.z+t.x,
+			m.i.y*v.x+m.j.y*v.y+m.k.y*v.z+t.y,
+			m.i.z*v.x+m.j.z*v.y+m.k.z*v.z+t.z )
 	End
 
 	#rem monkeydoc Applies a translation transformation to the matrix and returns the result.
 	#end
-	Method Translate:AffineMat4( tv:Vec3<T> )
-		Return Self * TranslationMatrix( tv )
+	Method Translate:AffineMat4( tx:T,ty:T,tz:T )
+		Return Self * Translation( tx,ty,tz )
 	End
 	
+	Method Translate:AffineMat4( tv:Vec3<T> )
+		Return Self * Translation( tv )
+	End
+
 	#rem monkeydoc Applies a rotation transformation to the matrix and returns the result.
 	#end
-	Method Rotate:AffineMat4( rv:Vec3<T> )
-		Return Self * RotationMatrix( rv )
+	Method Rotate:AffineMat4( rx:Double,ry:Double,rz:Double )
+		Return Self * Rotation( rx,ry,rz )
+	End
+
+	Method Rotate:AffineMat4( rv:Vec3<Double> )
+		Return Self * Rotation( rv )
 	End
 	
 	#rem monkeydoc Applies a scaling transformation to the matrix and returns the result.
 	#end
-	Method Scale:AffineMat4( rv:Vec3<T> )
-		Return Self * ScalingMatrix( rv )
+	Method Scale:AffineMat4( sx:T,sy:T,sz:T )
+		Return Self * Scaling( sx,sy,sz )
+	End
+	
+	Method Scale:AffineMat4( sv:Vec3<T> )
+		Return Self * Scaling( sv )
 	End
 	
 	#rem monkeydoc Creates a translation matrix.
 	#end
-	Function TranslationMatrix:AffineMat4( tv:Vec3<T> )
+	Function Translation:AffineMat4( tv:Vec3<T> )
 		Return New AffineMat4( tv )
 	End
 	
-	#rem monkeydoc Creates a rotation matrix.
+	Function Translation:AffineMat4( tx:T,ty:T,tz:T )
+		Return New AffineMat4( New Vec3<T>( tx,ty,tz ) )
+	End
+
+	#rem monkeydoc Creates a rotation matrix from a quaternion.
 	#end
-	Function RotationMatrix:AffineMat4( rv:Vec3<T> )
-		Return New AffineMat4( Mat3<T>.RotationMatrix( rv ) )
+	Function Rotation:AffineMat4( quat:Quat<T> )
+		Return New AffineMat4( Mat3<T>.Rotation( quat ) )
+	End
+	
+	#rem monkeydoc Creates a rotation matrix from euler angles.
+	
+	Order of rotation is Yaw * Pitch * Roll.
+	
+	#end
+	Function Rotation:AffineMat4( rv:Vec3<Double> )
+		Return New AffineMat4( Mat3<T>.Rotation( rv ) )
+	End
+	
+	Function Rotation:AffineMat4( rx:Double,ry:Double,rz:Double )
+		Return New AffineMat4( Mat3<T>.Rotation( rx,ry,rz ) )
 	End
 	
 	#rem monkeydoc Creates a scaling matrix.
 	#end
-	Function ScalingMatrix:AffineMat4( sv:Vec3<T> )
-		Return New AffineMat4( Mat3<T>.ScalingMatrix( sv ) )
+	Function Scaling:AffineMat4( sv:Vec3<T> )
+		Return New AffineMat4( Mat3<T>.Scaling( sv ) )
+	End
+	
+	Function Scaling:AffineMat4( sx:T,sy:T,sz:T )
+		Return New AffineMat4( Mat3<T>.Scaling( sx,sy,sz ) )
+	End
+	
+	Function Scaling:AffineMat4( t:T )
+		Return Scaling( t,t,t )
 	End
 	
 End

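A small sketch (not part of this commit) composing a transform with the renamed factory functions and applying it to a point; the Float instantiation and the values are arbitrary:

	' Translate, then yaw by Pi/2, then scale uniformly by 2.
	Local m:=AffineMat4<Float>.Translation( 10,0,0 ) * AffineMat4<Float>.Rotation( 0,Pi/2,0 ) * AffineMat4<Float>.Scaling( 2 )

	Local p:=m * New Vec3<Float>( 1,0,0 )	'transform a point
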
+ 21 - 0
modules/std/geom/box.monkey2

@@ -9,12 +9,21 @@ Alias Boxf:Box<Float>
 #end
 Struct Box<T>
 
+	Const FullBounds:=New Box( -1000000,-1000000,-1000000,1000000,1000000,1000000 )
+
+	Const EmptyBounds:=New Box( 1000000,1000000,1000000,-1000000,-1000000,-1000000 )
+	
 	Field min:Vec3<T>
 	Field max:Vec3<T>
 	
 	Method New()
 	End
 	
+	Method New( min:T,max:T )
+		Self.min=New Vec3<T>( min )
+		Self.max=New Vec3<T>( max )
+	End
+	
 	Method New( min:Vec3<T>,max:Vec3<T> )
 		Self.min=min
 		Self.max=max
@@ -25,10 +34,22 @@ Struct Box<T>
 		max.x=x1;max.y=y1;max.z=z1
 	End
 	
+	Operator To:String()
+		Return "Box("+min+","+max+")"
+	End
+	
+	Operator To<C>:Box<C>()
+		Return New Box<C>( min,max )
+	End
+	
 	Property Empty:Bool()
 		Return max.x<=min.x Or max.y<=min.y Or max.z<=min.z
 	End
 	
+	Property Center:Vec3<T>()
+		Return (min+max)/2
+	End
+	
 	Property Size:Vec3<T>()
 		Return max-min
 	End

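A tiny sketch (not part of this commit) of the new single-value constructor and the Center property; the bounds are arbitrary:

	Local b:=New Boxf( -1,1 )	'box from (-1,-1,-1) to (1,1,1)

	Print "center="+b.Center+", size="+b.Size	'center is the origin, size is 2 per axis
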
+ 121 - 41
modules/std/geom/mat3.monkey2

@@ -22,12 +22,6 @@ Struct Mat3<T>
 	End
 	
 	Method New( q:Quat<T> )
-		Local xx:=q.v.x*q.v.x , yy:=q.v.y*q.v.y , zz:=q.v.z*q.v.z
-		Local xy:=q.v.x*q.v.y , xz:=q.v.x*q.v.z , yz:=q.v.y*q.v.z
-		Local wx:=q.w*q.v.x   , wy:=q.w*q.v.y   , wz:=q.w*q.v.z
-		i.x=1-2*(yy+zz) ; i.y=  2*(xy-wz) ; i.z=  2*(xz+wy)
-		j.x=  2*(xy+wz) ; j.y=1-2*(xx+zz) ; j.z=  2*(yz-wx)
-		k.x=  2*(xz-wy) ; k.y=  2*(yz+wx) ; k.z=1-2*(xx+yy)
 	End
 	
 	Method New( ix:Float,jy:Float,kz:Float )
@@ -40,39 +34,16 @@ Struct Mat3<T>
 		k.x=kx; k.y=ky; k.z=kz
 	End
 	
-	Method To<C>:Mat3<C>()
+	Operator To<C>:Mat3<C>()
 		Return New Mat3<C>( i,j,k )
 	End
 	
-	Method To:String()
+	Operator To:String()
 		Return "Mat3("+i+","+j+","+k+")"
 	End
 	
-	Method To:Quat<T>()
-		Return New Quat<T>( Self )
-	End
-	
-	Property Determinant:Double()
-		return i.x*(j.y*k.z-j.z*k.y )-i.y*(j.x*k.z-j.z*k.x )+i.z*(j.x*k.y-j.y*k.x )
-	End
-	
-	Property Cofactor:Mat3()
-		Return New Mat3(
-			 (j.y*k.z-j.z*k.y),-(j.x*k.z-j.z*k.x), (j.x*k.y-j.y*k.x),
-			-(i.y*k.z-i.z*k.y), (i.x*k.z-i.z*k.x),-(i.x*k.y-i.y*k.x),
-			 (i.y*j.z-i.z*j.y),-(i.x*j.z-i.z*j.x), (i.x*j.y-i.y*j.x) )
-	End
-	
-	Property Pitch:Double()
-		Return k.Pitch
-	End
-	
-	Property Yaw:Double()
-		Return k.Yaw
-	End
-	
-	Property Roll:Double()
-		Return ATan2( i.y,j.y )
+	Property Determinant:T()
+		Return i.x*(j.y*k.z-j.z*k.y )-i.y*(j.x*k.z-j.z*k.x )+i.z*(j.x*k.y-j.y*k.x )
 	End
 	
 	Operator~:Mat3()
@@ -102,12 +73,83 @@ Struct Mat3<T>
 		Return New Vec3<T>( i.x*v.x+j.x*v.y+k.x*v.z,i.y*v.x+j.y*v.y+k.y*v.z,i.z*v.x+j.z*v.y+k.z*v.z )
 	End
 	
+	Method GetCofactor:Mat3()
+		Return New Mat3(
+			 (j.y*k.z-j.z*k.y),-(j.x*k.z-j.z*k.x), (j.x*k.y-j.y*k.x),
+			-(i.y*k.z-i.z*k.y), (i.x*k.z-i.z*k.x),-(i.x*k.y-i.y*k.x),
+			 (i.y*j.z-i.z*j.y),-(i.x*j.z-i.z*j.x), (i.x*j.y-i.y*j.x) )
+	End
+	
+	Method GetPitch:Double()
+		Return k.Pitch
+	End
+	
+	Method GetYaw:Double()
+		Return k.Yaw
+	End
+	
+	Method GetRoll:Double()
+		Return ATan2( i.y,j.y )
+	End
+	
+	Method GetRotation:Vec3<T>()
+		Return New Vec3<T>( GetPitch(),GetYaw(),GetRoll() )
+	End
+	
+	Method GetQuat:Quat<T>()
+		Local r:Quat<T>
+		Local m:=Orthogonalize()
+		Local t:=m.i.x+m.j.y+m.k.z
+		If t>EPSILON
+			t=Sqrt( t+1 )*2
+			r.v.x=(m.k.y-m.j.z)/t
+			r.v.y=(m.i.z-m.k.x)/t
+			r.v.z=(m.j.x-m.i.y)/t
+			r.w=t/4
+		Else If m.i.x>m.j.y And m.i.x>m.k.z
+			t=Sqrt( m.i.x-m.j.y-m.k.z+1 )*2
+			r.v.x=t/4
+			r.v.y=(m.j.x+m.i.y)/t
+			r.v.z=(m.i.z+m.k.x)/t
+			r.w=(m.k.y-m.j.z)/t
+		Else If m.j.y>m.k.z
+			t=Sqrt( m.j.y-m.k.z-m.i.x+1 )*2
+			r.v.x=(m.j.x+m.i.y)/t
+			r.v.y=t/4
+			r.v.z=(m.k.y+m.j.z)/t
+			r.w=(m.i.z-m.k.x)/t
+		Else
+			t=Sqrt( m.k.z-m.j.y-m.i.x+1 )*2
+			r.v.x=(m.i.z+m.k.x)/t
+			r.v.y=(m.k.y+m.j.z)/t
+			r.v.z=t/4
+			r.w=(m.j.x-m.i.y)/t
+		Endif
+		Return r
+	End
+	
+	Method GetScaling:Vec3<T>()
+		Return New Vec3<T>( i.Length,j.Length,k.Length )
+	End
+	
 	Method Rotate:Mat3( rv:Vec3<T> )
-		Return Self * RotationMatrix( rv )
+		Return Self * Rotation( rv )
+	End
+	
+	Method Rotate:Mat3( rx:Double,ry:Double,rz:Double )
+		Return Self * Rotation( rx,ry,rz )
 	End
 	
 	Method Scale:Mat3( rv:Vec3<T> )
-		Return Self * ScalingMatrix( rv )
+		Return Self * Scaling( rv )
+	End
+
+	Method Scale:Mat3( sx:T,sy:T,sz:T )
+		Return Self * Scaling( sx,sy,sz )
+	End
+	
+	Method Scale:Mat3( t:T )
+		Return Self * Scaling( t )
 	End
 
 	Method Orthogonalize:Mat3()
@@ -115,27 +157,65 @@ Struct Mat3<T>
 		Return New Mat3( j.Cross( k ).Normalize(),k.Cross( i ).Normalize(),k )
 	End
 	
-	Function YawMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a yaw (y axis) rotation matrix.
+	#end
+	Function Yaw:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		Return New Mat3( cos,0,sin, 0,1,0, -sin,0,cos )
 	End
 	
-	Function PitchMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a pitch (x axis) rotation matrix.
+	#end
+	Function Pitch:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		return New Mat3( 1,0,0, 0,cos,sin, 0,-sin,cos )
 	End
 	
-	Function RollMatrix:Mat3( an:Double )
+	#rem monkeydoc Creates a roll (z axis) rotation matrix.
+	#end
+	Function Roll:Mat3( an:Double )
 		Local sin:=Sin(an),cos:=Cos(an)
 		Return New Mat3( cos,sin,0, -sin,cos,0, 0,0,1 )
 	End
 	
-	Function RotationMatrix:Mat3( rv:Vec3<T> )
-		Return YawMatrix( rv.y ) * PitchMatrix( rv.x ) * RollMatrix( rv.z )
+	#rem monkeydoc Creates a rotation matrix from a quaternion.
+	#end
+	Function Rotation:Mat3( quat:Quat<T> )
+		Local xx:=quat.v.x*quat.v.x , yy:=quat.v.y*quat.v.y , zz:=quat.v.z*quat.v.z
+		Local xy:=quat.v.x*quat.v.y , xz:=quat.v.x*quat.v.z , yz:=quat.v.y*quat.v.z
+		Local wx:=quat.w*quat.v.x   , wy:=quat.w*quat.v.y   , wz:=quat.w*quat.v.z
+		Local r:Mat3
+		r.i.x=1-2*(yy+zz) ; r.i.y=  2*(xy-wz) ; r.i.z=  2*(xz+wy)
+		r.j.x=  2*(xy+wz) ; r.j.y=1-2*(xx+zz) ; r.j.z=  2*(yz-wx)
+		r.k.x=  2*(xz-wy) ; r.k.y=  2*(yz+wx) ; r.k.z=1-2*(xx+yy)
+		Return r
+	End
+	
+	#rem monkeydoc Creates a rotation matrix from euler angles.
+	
+	Order of rotation is Yaw * Pitch * Roll.
+	
+	#end
+	Function Rotation:Mat3( rv:Vec3<Double> )
+		Return Yaw( rv.y ) * Pitch( rv.x ) * Roll( rv.z )
 	End
 	
-	Function ScalingMatrix:Mat3( sv:Vec3<T> )
+	Function Rotation:Mat3( rx:Double,ry:Double,rz:Double )
+		Return Yaw( ry ) * Pitch( rx ) * Roll( rz )
+	End
+
+	#rem monkeydoc Creates a scaling matrix.
+	#end
+	Function Scaling:Mat3( sv:Vec3<T> )
 		Return New Mat3( sv.x,sv.y,sv.z )
 	End
 
+	Function Scaling:Mat3( sx:T,sy:T,sz:T )
+		Return New Mat3( sx,sy,sz )
+	End
+	
+	Function Scaling:Mat3( t:T )
+		Return New Mat3( t,t,t )
+	End
+	
 End

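A short round-trip sketch (not part of this commit) using the new Rotation factories together with GetQuat; angles are radians and the values are arbitrary:

	Local m:=Mat3<Float>.Rotation( 0,Pi/4,0 )	'yaw by Pi/4

	Local q:=m.GetQuat()				'extract the rotation as a quaternion

	Local m2:=Mat3<Float>.Rotation( q )		'rebuild an equivalent matrix from it
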
+ 119 - 14
modules/std/geom/mat4.monkey2

@@ -26,11 +26,16 @@ Struct Mat4<T>
 		Self.i=i;Self.j=j;Self.k=k;Self.t=t
 	End
 	
+	Method New( m:Mat3<T> )
+		i.XYZ=m.i ; j.XYZ=m.j ; k.XYZ=m.k ; t.w=1
+	End
+	
 	Method New( m:AffineMat3<T> )
-		i.x=m.i.x;i.y=m.i.y
-		j.x=m.j.x;j.y=m.j.y
-		k.z=1
-		t.x=m.t.x;t.y=m.t.y;t.w=1
+		i.XY=m.i ; j.XY=m.j ; k.z=1 ; t.XY=m.t ; t.w=1
+	End
+	
+	Method New( m:AffineMat4<T> )
+		i.XYZ=m.m.i ; j.XYZ=m.m.j ; k.XYZ=m.m.k ; t.XYZ=m.t ; t.w=1
 	End
 	
 	Operator*:Mat4( m:Mat4 )
@@ -59,25 +64,125 @@ Struct Mat4<T>
 		Return r
 	End
 	
-	Function Translation:Mat4( tx:T,ty:T,tz:T )
-		Local r:=New Mat4
-		r.t.x=tx;r.t.y=ty;r.t.z=tz
+	Operator*:Mat4( m:AffineMat4<T> )
+
+		Local r:Mat4
+		
+		r.i.x=i.x*m.m.i.x + j.x*m.m.i.y + k.x*m.m.i.z
+		r.i.y=i.y*m.m.i.x + j.y*m.m.i.y + k.y*m.m.i.z
+		r.i.z=i.z*m.m.i.x + j.z*m.m.i.y + k.z*m.m.i.z
+		r.i.w=i.w*m.m.i.x + j.w*m.m.i.y + k.w*m.m.i.z
+		
+		r.j.x=i.x*m.m.j.x + j.x*m.m.j.y + k.x*m.m.j.z
+		r.j.y=i.y*m.m.j.x + j.y*m.m.j.y + k.y*m.m.j.z
+		r.j.z=i.z*m.m.j.x + j.z*m.m.j.y + k.z*m.m.j.z
+		r.j.w=i.w*m.m.j.x + j.w*m.m.j.y + k.w*m.m.j.z
+		
+		r.k.x=i.x*m.m.k.x + j.x*m.m.k.y + k.x*m.m.k.z
+		r.k.y=i.y*m.m.k.x + j.y*m.m.k.y + k.y*m.m.k.z
+		r.k.z=i.z*m.m.k.x + j.z*m.m.k.y + k.z*m.m.k.z
+		r.k.w=i.w*m.m.k.x + j.w*m.m.k.y + k.w*m.m.k.z
+		
+		r.t.x=i.x*m.t.x   + j.x*m.t.y   + k.x*m.t.z + t.x
+		r.t.y=i.y*m.t.x   + j.y*m.t.y   + k.y*m.t.z + t.y
+		r.t.z=i.z*m.t.x   + j.z*m.t.y   + k.z*m.t.z + t.z
+		r.t.w=i.w*m.t.x   + j.w*m.t.y   + k.w*m.t.z + t.w
+		
 		Return r
 	End
 	
-	Function Scale:Mat4( sx:Float,sy:Float,sz:Float )
+	Operator*:Mat4( m:Mat3<T> )
+
 		Local r:Mat4
-		r.i.x=sx;r.j.y=sy;r.k.z=sz;r.t.w=1
+		
+		r.i.x=i.x*m.i.x + j.x*m.i.y + k.x*m.i.z
+		r.i.y=i.y*m.i.x + j.y*m.i.y + k.y*m.i.z
+		r.i.z=i.z*m.i.x + j.z*m.i.y + k.z*m.i.z
+		r.i.w=i.w*m.i.x + j.w*m.i.y + k.w*m.i.z
+		
+		r.j.x=i.x*m.j.x + j.x*m.j.y + k.x*m.j.z
+		r.j.y=i.y*m.j.x + j.y*m.j.y + k.y*m.j.z
+		r.j.z=i.z*m.j.x + j.z*m.j.y + k.z*m.j.z
+		r.j.w=i.w*m.j.x + j.w*m.j.y + k.w*m.j.z
+		
+		r.k.x=i.x*m.k.x + j.x*m.k.y + k.x*m.k.z
+		r.k.y=i.y*m.k.x + j.y*m.k.y + k.y*m.k.z
+		r.k.z=i.z*m.k.x + j.z*m.k.y + k.z*m.k.z
+		r.k.w=i.w*m.k.x + j.w*m.k.y + k.w*m.k.z
+		
+		r.t.x=t.x
+		r.t.y=t.y
+		r.t.z=t.z
+		r.t.w=t.w
+		
+		Return r
+	End
+	
+	#rem monkeydoc Creates a translation matrix.
+	#end
+	Function Translation:Mat4( tv:Vec3<T> )
+		Return Translation( tv.x,tv.y,tv.z )
+	End
+	
+	Function Translation:Mat4( tx:T,ty:T,tz:T )
+		Local r:=New Mat4
+		r.t.x=tx;r.t.y=ty;r.t.z=tz;r.t.w=1
 		Return r
 	End
+
+	#rem monkeydoc Creates a rotation matrix.
+	#end
+	Function Rotation:Mat4( rv:Vec3<Double> )
+		Return Rotation( rv.x,rv.y,rv.z )
+	End
+	
+	Function Rotation:Mat4( rx:Double,ry:Double,rz:Double )
+		Return New Mat4( Mat3<T>.Rotation( rx,ry,rz ) )
+	End
+	
+	#rem monkeydoc Creates a scaling matrix.
+	#end
+	Function Scaling:Mat4( sx:T,sy:T,sz:T )
+		Return New Mat4( sx,sy,sz,1 )
+	End
+	
+	Function Scaling:Mat4( sv:Vec3<T> )
+		Return Scaling( sv.x,sv.y,sv.z )
+	End
 	
-	Function Ortho:Mat4( left:Float,right:Float,bottom:Float,top:Float,near:Float,far:Float )
+	Function Scaling:Mat4( t:T )
+		Return Scaling( t,t,t )
+	End
 
-		Local w:=right-left,h:=top-bottom,d:=far-near
+	#rem monkeydoc Creates an orthographic projection matrix.
+	#End	
+	Function Ortho:Mat4( left:Double,right:Double,bottom:Double,top:Double,near:Double,far:Double )
 
-		Local r:Mat4
-		r.i.x=2/w ; r.j.y=2/h ; r.k.z=2/d
-		r.t=New Vec4<T>( -(right+left)/w,-(top+bottom)/h,-(far+near)/d,1 )
+		Local w:=right-left,h:=top-bottom,d:=far-near,r:Mat4
+
+		r.i.x=2/w
+		r.j.y=2/h
+		r.k.z=2/d
+		r.t.x=-(right+left)/w
+		r.t.y=-(top+bottom)/h
+		r.t.z=-(far+near)/d
+		r.t.w=1
+
+		Return r
+	End
+	
+	Function Frustum:Mat4( left:Double,right:Double,bottom:Double,top:Double,near:Double,far:Double )
+	
+		Local w:=right-left,h:=top-bottom,d:=far-near,near2:=near*2,r:Mat4
+
+		r.i.x=near2/w
+		r.j.y=near2/h
+		r.k.x=(right+left)/w
+		r.k.y=(top+bottom)/h
+		r.k.z=(far+near)/d
+		r.k.w=1
+		r.t.z=-(far*near2)/d
+		
 		Return r
 	End
 	

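The new Frustum function is the natural building block for a perspective projection. A hypothetical helper (not part of this commit; the name, signature and Float instantiation are assumed) sketching that use:

	Function Perspective:Mat4<Float>( fovy:Double,aspect:Double,near:Double,far:Double )

		'half extents of the near plane from the vertical field of view (radians)
		Local h:=Tan( fovy/2 )*near
		Local w:=h*aspect

		Return Mat4<Float>.Frustum( -w,w,-h,h,near,far )
	End
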
+ 32 - 52
modules/std/geom/quat.monkey2

@@ -28,36 +28,6 @@ Class Quat<T>
 		v.x=vx ; v.y=vy ; v.z=vz ; Self.w=w
 	End
 	
-	Method New( m:Mat3<T> )
-		m=m.Orthogonalize()
-		Local t:=m.i.x+m.j.y+m.k.z
-		If t>EPSILON
-			t=Sqrt( t+1 )*2
-			v.x=(m.k.y-m.j.z)/t
-			v.y=(m.i.z-m.k.x)/t
-			v.z=(m.j.x-m.i.y)/t
-			w=t/4
-		Else If m.i.x>m.j.y And m.i.x>m.k.z
-			t=Sqrt( m.i.x-m.j.y-m.k.z+1 )*2
-			v.x=t/4
-			v.y=(m.j.x+m.i.y)/t
-			v.z=(m.i.z+m.k.x)/t
-			w=(m.k.y-m.j.z)/t
-		Else If m.j.y>m.k.z
-			t=Sqrt( m.j.y-m.k.z-m.i.x+1 )*2
-			v.x=(m.j.x+m.i.y)/t
-			v.y=t/4
-			v.z=(m.k.y+m.j.z)/t
-			w=(m.i.z-m.k.x)/t
-		Else
-			t=Sqrt( m.k.z-m.j.y-m.i.x+1 )*2
-			v.x=(m.i.z+m.k.x)/t
-			v.y=(m.k.y+m.j.z)/t
-			v.z=t/4
-			w=(m.j.x-m.i.y)/t
-		Endif
-	End
-	
 	Operator To<C>:Quat<C>()
 		Return New Quat<C>( v,w )
 	End
@@ -66,10 +36,6 @@ Class Quat<T>
 		Return "Quat("+v+","+w+")"
 	End
 	
-	Operator To:Mat3<T>()
-		Return New Mat3<T>( Self )
-	End
-	
 	Property Length:Double()
 		Return Sqrt( v.Dot(v) + w*w )
 	End
@@ -85,26 +51,14 @@ Class Quat<T>
 		Local yz:=v.y*v.z , wx:=w*v.x
 		Local xy:=v.x*v.y , wz:=w*v.z
 		Local xx:=v.x*v.x , zz:=v.z*v.z
-		return New Vec3<T>( 2*(xy+wz),1-2*(xx+zz),2*(yz-wx) )
+		Return New Vec3<T>( 2*(xy+wz),1-2*(xx+zz),2*(yz-wx) )
 	End
 	
 	Property K:Vec3<T>()
 		Local xz:=v.x*v.z , wy:=w*v.y
 		Local yz:=v.y*v.z , wx:=w*v.x
 		Local xx:=v.x*v.x , yy:=v.y*v.y
-		return New Vec3<T>( 2*(xz-wy),2*(yz+wx),1-2*(xx+yy) )
-	End
-	
-	Property Yaw:Double()
-		Return K.Yaw
-	End
-	
-	Property Pitch:Double()
-		Return K.Pitch
-	End
-	
-	Property Roll:Double()
-		Return ATan2( I.y,J.y )
+		Return New Vec3<T>( 2*(xz-wy),2*(yz+wx),1-2*(xx+yy) )
 	End
 	
 	Operator-:Quat()
@@ -135,6 +89,18 @@ Class Quat<T>
 		Return New Quat( v/t,w/t )
 	End
 	
+	Method GetYaw:Double()
+		Return K.Yaw
+	End
+	
+	Method GetPitch:Double()
+		Return K.Pitch
+	End
+	
+	Method GetRoll:Double()
+		Return ATan2( I.y,J.y )
+	End
+	
 	Method Dot:Double( q:Quat )
 		Return v.x*q.v.x + v.y*q.v.y + v.z*q.v.z + w*q.w
 	End
@@ -161,10 +127,24 @@ Class Quat<T>
 		Return Self*b + t*a
 	End
 	
-'	Function RotationQuat:Quat( rv:Vec3<T> )
-'	End
+	Function Pitch:Quat( r:Double )
+		Return New Quat( Sin( r/2 ),0,0,Cos( r/2 ) )
+	End
+
+	Function Yaw:Quat( r:Double )
+		Return New Quat( 0,Sin( r/2 ),0,Cos( r/2 ) )
+	End
+
+	Function Roll:Quat( r:Double )
+		Return New Quat( 0,0,Sin( r/2 ),Cos( r/2 ) )
+	End
+
+	Function Rotation:Quat( rv:Vec3<Double> )
+		Return Yaw( rv.y ) * Pitch( rv.x ) * Roll( rv.z )
+	End
 	
-'	Function AxisAngleQuat:Quat( axis:Vec3<T>,angle:Double )
-'	End
+	Function Rotation:Quat( rx:Double,ry:Double,rz:Double )
+		Return Yaw( ry ) * Pitch( rx ) * Roll( rz )
+	End
 
 End

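A brief sketch (not part of this commit) of the new quaternion factories alongside the matching Mat3 ones; angles are radians and the values are arbitrary:

	Local q:=Quat<Float>.Rotation( 0,Pi/2,0 )	'yaw by Pi/2

	Local m:=Mat3<Float>.Rotation( q )		'the same rotation as a 3x3 matrix

	Print "yaw="+q.GetYaw()
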
+ 6 - 0
modules/std/geom/vec3.monkey2

@@ -54,6 +54,12 @@ Struct Vec3<T>
 		Self.z=z
 	End
 	
+	Property XY:Vec2<T>()
+		Return New Vec2<T>( x,y )
+	Setter( xy:Vec2<T> )
+		x=xy.x;y=xy.y
+	End
+	
 	Operator-:Vec3()
 		Return New Vec3( -x,-y,-z )
 	End

+ 20 - 0
modules/std/geom/vec4.monkey2

@@ -57,6 +57,26 @@ Struct Vec4<T>
 		Self.w=w
 	End
 	
+	Property XY:Vec2<T>()
+		Return New Vec2<T>( x,y )
+	Setter( xy:Vec2<T> )
+		x=xy.x;y=xy.y
+	End
+	
+	Property XYZ:Vec3<T>()
+		Return New Vec3<T>( x,y,z )
+	Setter( xyz:Vec3<T> )
+		x=xyz.x;y=xyz.y;z=xyz.z
+	End
+	
+	Operator To:String()
+		Return "Vec4("+x+","+y+","+z+","+w+")"
+	End
+	
+	Operator To<C>:Vec4<C>()
+		Return New Vec4<C>( x,y,z,w )
+	End
+	
 	Operator-:Vec4()
 		Return New Vec4( -x,-y,-z,-w )
 	End

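A quick sketch (not part of this commit) of the new swizzle properties; the four-component Vec4 constructor and Vec2<Float> are assumed to exist as elsewhere in std.geom:

	Local v:=New Vec4<Float>( 1,2,3,1 )

	Local p:=v.XYZ				'drop w

	v.XY=New Vec2<Float>( 5,6 )		'write through the swizzle setter
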
+ 36 - 1
modules/std/misc/zipfile.monkey2

@@ -3,21 +3,54 @@ Namespace std.zipfile
 
 Using miniz
 
-#rem monkeydoc @hidden
+#rem monkeydoc The ZipFile class.
 #end
 Class ZipFile
 
+	#rem monkeydoc Array of all files contained in the zip.
+	#end
 	Property Files:String[]()
 		Return _files
 	End
 	
+	#rem monkeydoc Closes the zip.
+	#end
 	Method Close()
 		If Not _data Return
 		libc.free( _zip )
+		_files=Null
+		_sizes=Null
 		_data.Discard()
 		_data=Null
 	End
 	
+	#rem monkeydoc Checks if a file is contained in the zip.
+	#end
+	Method Contains:Bool( file:String )
+	
+		Return FindFile( file )<>-1
+	End
+	
+	#rem monkeydoc Extracts a file from the zip into a DataBuffer.
+	#end
+	Method ExtractData:DataBuffer( file:String )
+	
+		Local i:=FindFile( file )
+		If i=-1 Return Null
+
+		Local size:=_sizes[i]
+
+		Local buf:=New DataBuffer( size )
+		
+		If mz_zip_reader_extract_to_mem( _zip,i,buf.Data,size,0 ) Return buf
+		
+		buf.Discard()
+		
+		Return Null
+	End
+
+	#rem monkeydoc @hidden
+	#end	
 	Method FindFile:Int( file:String )
 
 		For Local i:=0 Until _files.Length
@@ -27,6 +60,8 @@ Class ZipFile
 		Return -1
 	End
 	
+	#rem monkeydoc @hidden
+	#end	
 	Method Extract:Bool( dir:String,prefix:String="" )
 	
 		If Not dir.EndsWith( "/" ) dir+="/"

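A small sketch (not part of this commit) of the new Contains and ExtractData methods, using the same archive and entry as the zipfiletest banana above:

	Local zip:=ZipFile.Open( "asset::blah.zip" )

	If zip.Contains( "geom/affinemat3.monkey2" )

		Local data:=zip.ExtractData( "geom/affinemat3.monkey2" )

		Print "Extracted "+data.Length+" bytes"

		data.Discard()
	Endif

	zip.Close()
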
+ 154 - 0
modules/theoraplayer/makefile.monkey2

@@ -0,0 +1,154 @@
+
+Namespace theoraplayer
+
+'***** ogg *****
+'
+#Import "native/ogg/src/bitwise.c"
+#Import "native/ogg/src/framing.c"
+
+'***** vorbis *****
+'
+#Import "native/vorbis/lib/analysis.c"
+'#Import "native/vorbis/lib/barkmel.c"
+#Import "native/vorbis/lib/bitrate.c"
+#Import "native/vorbis/lib/block.c"
+#Import "native/vorbis/lib/codebook.c"
+#Import "native/vorbis/lib/envelope.c"
+#Import "native/vorbis/lib/floor0.c"
+#Import "native/vorbis/lib/floor1.c"
+#Import "native/vorbis/lib/info.c"
+#Import "native/vorbis/lib/lookup.c"
+#Import "native/vorbis/lib/lpc.c"
+#Import "native/vorbis/lib/lsp.c"
+#Import "native/vorbis/lib/mapping0.c"
+#Import "native/vorbis/lib/mdct.c"
+#Import "native/vorbis/lib/psy.c"
+'#Import "native/vorbis/lib/psytune.c"
+#Import "native/vorbis/lib/registry.c"
+#Import "native/vorbis/lib/res0.c"
+#Import "native/vorbis/lib/sharedbook.c"
+#Import "native/vorbis/lib/smallft.c"
+#Import "native/vorbis/lib/synthesis.c"
+'#Import "native/vorbis/lib/tone.c"
+#Import "native/vorbis/lib/vorbisenc.c"
+#Import "native/vorbis/lib/vorbisfile.c"
+#Import "native/vorbis/lib/window.c"
+
+'***** theora *****
+'
+'#Import "native/theora/lib/analyze.c"
+#Import "native/theora/lib/apiwrapper.c"
+#Import "native/theora/lib/bitpack.c"
+#Import "native/theora/lib/collect.c"
+#Import "native/theora/lib/decapiwrapper.c"
+#Import "native/theora/lib/decinfo.c"
+#Import "native/theora/lib/decode.c"
+#Import "native/theora/lib/dequant.c"
+'#Import "native/theora/lib/encapiwrapper.c"
+'#Import "native/theora/lib/encfrag.c"
+'#Import "native/theora/lib/encinfo.c"
+'#Import "native/theora/lib/encode.c"
+#Import "native/theora/lib/encoder_disabled.c"
+#Import "native/theora/lib/enquant.c"
+#Import "native/theora/lib/fdct.c"
+#Import "native/theora/lib/fragment.c"
+#Import "native/theora/lib/huffdec.c"
+#Import "native/theora/lib/huffenc.c"
+#Import "native/theora/lib/idct.c"
+#Import "native/theora/lib/info.c"
+#Import "native/theora/lib/internal.c"
+#Import "native/theora/lib/mathops.c"
+'#Import "native/theora/lib/mcenc.c"
+#Import "native/theora/lib/quant.c"
+#Import "native/theora/lib/rate.c"
+#Import "native/theora/lib/state.c"
+#Import "native/theora/lib/tokenize.c"
+
+'#Import "native/theora/lib/x86/mmxencfrag.c
+'#Import "native/theora/lib/x86/mmxfdct.c"
+'#Import "native/theora/lib/x86/mmxfrag.c"
+'#Import "native/theora/lib/x86/mmxidct.c"
+'#Import "native/theora/lib/x86/mmxstate.c"
+'#Import "native/theora/lib/x86/sse2encfrag.c"
+'#Import "native/theora/lib/x86/sse2fdct.c"
+'#Import "native/theora/lib/x86/sse2idct.c"
+'#Import "native/theora/lib/x86/x86cpu.c"
+'#Import "native/theora/lib/x86/x86enc.c"
+'#Import "native/theora/lib/x86/x86enquant.c"
+'#Import "native/theora/lib/x86/x86state.c"
+
+'***** theoraplayer *****
+'
+#Import "native/theoraplayer/src/AudioInterface.cpp"
+#Import "native/theoraplayer/src/AudioInterfaceFactory.cpp"
+#Import "native/theoraplayer/src/AudioPacketQueue.cpp"
+#Import "native/theoraplayer/src/DataSource.cpp"
+#Import "native/theoraplayer/src/Exception.cpp"
+#Import "native/theoraplayer/src/FileDataSource.cpp"
+#Import "native/theoraplayer/src/FrameQueue.cpp"
+#Import "native/theoraplayer/src/Manager.cpp"
+#Import "native/theoraplayer/src/MemoryDataSource.cpp"
+#Import "native/theoraplayer/src/Mutex.cpp"
+#Import "native/theoraplayer/src/theoraplayer.cpp"
+#Import "native/theoraplayer/src/Thread.cpp"
+#Import "native/theoraplayer/src/Timer.cpp"
+#Import "native/theoraplayer/src/Utility.cpp"
+#Import "native/theoraplayer/src/VideoClip.cpp"
+#Import "native/theoraplayer/src/VideoFrame.cpp"
+#Import "native/theoraplayer/src/WorkerThread.cpp"
+
+#Import "native/theoraplayer/src/YUV/yuv_util.c"
+
+#Import "native/theoraplayer/src/formats/Theora/VideoClip_Theora.cpp"
+
+#Import "native/theoraplayer/src/YUV/C/yuv420_grey_c.c"
+#Import "native/theoraplayer/src/YUV/C/yuv420_rgb_c.c"
+#Import "native/theoraplayer/src/YUV/C/yuv420_yuv_c.c"
+
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/compare_win.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_from.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_from_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_jpeg.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_to_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/convert_to_i420.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/cpu_id.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/mjpeg_decoder.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/mjpeg_validate.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/planar_functions.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/rotate_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_any.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/row_win.cc"
+'#Import "native/theoraplayer/src/YUV/libyuv/src/row_x86.asm"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_any.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_argb.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_argb_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_common.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_gcc.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_mips.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_neon.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_neon64.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_posix.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/scale_win.cc"
+#Import "native/theoraplayer/src/YUV/libyuv/src/video_common.cc"
+'#Import "native/theoraplayer/src/YUV/libyuv/src/x86inc.asm"
+
+#Import "native/theoraplayer/src/YUV/libyuv/yuv_libyuv.c"

+ 8 - 0
modules/theoraplayer/module.json

@@ -0,0 +1,8 @@
+{
+	"module":"theoraplayer",
+	"about":"Minimal theora player",
+	"author":"Kresimir Spes",
+	"version":"1.0.0",
+	"support":"http://monkey2.monkey-x.com",
+	"depends":["openal","libc"]
+}

+ 27 - 0
modules/theoraplayer/native/LICENSE

@@ -0,0 +1,27 @@
+Copyright (c) Kresimir Spes ([email protected])
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1.	Redistributions of source code must retain the above copyright notice,
+	this list of conditions and the following disclaimer.
+
+2.	Redistributions in binary form must reproduce the above copyright notice,
+	this list of conditions and the following disclaimer in the documentation
+	and/or other materials provided with the distribution.
+
+3.	Neither the name of the copyright holder nor the names of its contributors
+	may be used to endorse or promote products derived from this software
+	without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.

+ 235 - 0
modules/theoraplayer/native/OpenAL_AudioInterface.cpp

@@ -0,0 +1,235 @@
+/// @file
+/// @version 2.0
+/// 
+/// @section LICENSE
+/// 
+/// This program is free software; you can redistribute it and/or modify it under
+/// the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
+
+#include <stdio.h>
+#include <string.h>
+
+#include "OpenAL_AudioInterface.h"
+
+ALCdevice* gDevice = 0;
+ALCcontext* gContext = 0;
+
+short float2short(float f)
+{
+	if (f > 1.0f)
+	{
+		f = 1.0f;
+	}
+	else if (f < -1.0f)
+	{
+		f = -1.0f;
+	}
+	return (short)(f * 32767);
+}
+
+OpenAL_AudioInterface::OpenAL_AudioInterface(theoraplayer::VideoClip* clip, int channelsCount, int frequency) :
+	theoraplayer::AudioInterface(clip, channelsCount, frequency), theoraplayer::Timer()
+{
+	this->sourceNumChannels = this->channelsCount;
+	if (this->channelsCount > 2)
+	{
+		// ignore audio with more than 2 channels, use only the stereo channels
+		this->channelsCount = 2;
+	}
+	this->maxBuffSize = frequency * this->channelsCount * 2;
+	this->buffSize = 0;
+	this->numProcessedSamples = 0;
+	this->currentTimer = 0;
+	this->tempBuffer = new short[this->maxBuffSize];
+	alGenSources(1, &this->source);
+	clip->setTimer(this);
+	this->numPlayedSamples = 0;
+}
+
+OpenAL_AudioInterface::~OpenAL_AudioInterface()
+{
+	if (this->tempBuffer != NULL)
+	{
+		delete[] this->tempBuffer;
+	}
+	if (this->source != 0)
+	{
+		alSourcei(this->source, AL_BUFFER, 0);
+		alDeleteSources(1, &this->source);
+	}
+	while (this->bufferQueue.size() > 0)
+	{
+		alDeleteBuffers(1, &this->bufferQueue.front().id);
+		this->bufferQueue.pop();
+	}
+}
+
+float OpenAL_AudioInterface::getQueuedAudioSize()
+{
+	return ((float)(this->numProcessedSamples - this->numPlayedSamples)) / this->frequency;
+}
+
+void OpenAL_AudioInterface::insertData(float* data, int samplesCount)
+{
+	float* tempData = NULL;
+	if (this->sourceNumChannels > 2)
+	{
+		tempData = new float[samplesCount * 2 / this->sourceNumChannels + 16]; // 16 padding just in case
+		int i = 0;
+		int n = 0;
+		for (n = 0, i = 0; i < samplesCount; i += this->sourceNumChannels, n += 2)
+		{
+			tempData[n] = data[i];
+			tempData[n + 1] = data[i + 1];
+		}
+		data = tempData;
+		samplesCount = n;
+	}
+	//printf("got %d bytes, %d buffers queued\n",samplesCount,(int)this->bufferQueue.size());
+	int state = 0;
+	OpenAL_Buffer buff;
+	ALuint format;
+	for (int i = 0; i < samplesCount; ++i)
+	{
+		if (this->buffSize < this->maxBuffSize)
+		{
+			this->tempBuffer[this->buffSize] = float2short(data[i]);
+			++this->buffSize;
+		}
+		if (this->buffSize == this->frequency * this->channelsCount / 10)
+		{
+			memset(&buff, 0, sizeof(OpenAL_Buffer));
+			alGenBuffers(1, &buff.id);
+			format = (this->channelsCount == 1) ? AL_FORMAT_MONO16 : AL_FORMAT_STEREO16;
+			alBufferData(buff.id, format, this->tempBuffer, this->buffSize * 2, this->frequency);
+			alSourceQueueBuffers(this->source, 1, &buff.id);
+			buff.samplesCount = this->buffSize / this->channelsCount;
+			this->numProcessedSamples += this->buffSize / this->channelsCount;
+			this->bufferQueue.push(buff);
+			this->buffSize = 0;
+			state = 0;
+			alGetSourcei(this->source, AL_SOURCE_STATE, &state);
+			if (state != AL_PLAYING)
+			{
+				//alSourcef(this->source,AL_PITCH,0.5); // debug
+				//alSourcef(this->source,AL_SAMPLE_OFFSET,(float) this->numProcessedSamples-mFreq/4);
+				alSourcePlay(this->source);
+			}
+		}
+	}
+	if (tempData != NULL)
+	{
+		delete[] tempData;
+	}
+}
+
+void OpenAL_AudioInterface::update(float timeDelta)
+{
+	int i = 0;
+	int nProcessed = 0;
+	OpenAL_Buffer buff;
+	// process played buffers
+	alGetSourcei(this->source, AL_BUFFERS_PROCESSED, &nProcessed);
+	for (i = 0; i < nProcessed; ++i)
+	{
+		buff = this->bufferQueue.front();
+		this->bufferQueue.pop();
+		this->numPlayedSamples += buff.samplesCount;
+		alSourceUnqueueBuffers(this->source, 1, &buff.id);
+		alDeleteBuffers(1, &buff.id);
+	}
+	if (nProcessed != 0)
+	{
+		// update offset
+		alGetSourcef(this->source, AL_SEC_OFFSET, &this->currentTimer);
+	}
+	// control playback and return time position
+	//alGetSourcei(this->source,AL_SOURCE_STATE,&state);
+	//if (state == AL_PLAYING)
+	this->currentTimer += timeDelta;
+	this->time = this->currentTimer + (float) this->numPlayedSamples / this->frequency;
+	float duration = this->clip->getDuration();
+	if (this->time > duration)
+	{
+		this->time = duration;
+	}
+}
+
+void OpenAL_AudioInterface::pause()
+{
+	alSourcePause(this->source);
+	Timer::pause();
+}
+
+void OpenAL_AudioInterface::play()
+{
+	alSourcePlay(this->source);
+	Timer::play();
+}
+
+void OpenAL_AudioInterface::seek(float time)
+{
+	OpenAL_Buffer buff;
+	alSourceStop(this->source);
+	while (!this->bufferQueue.empty())
+	{
+		buff = this->bufferQueue.front();
+		this->bufferQueue.pop();
+		alSourceUnqueueBuffers(this->source, 1, &buff.id);
+		alDeleteBuffers(1, &buff.id);
+	}
+	//		int nProcessed;
+	//		alGetSourcei(this->source,AL_BUFFERS_PROCESSED,&nProcessed);
+	//		if (nProcessed != 0)
+	//			nProcessed=nProcessed;
+	this->buffSize = 0;
+	this->currentTimer = 0;
+	this->numPlayedSamples = this->numProcessedSamples = (int)(time * this->frequency);
+	this->time = time;
+}
+
+OpenAL_AudioInterfaceFactory::OpenAL_AudioInterfaceFactory()
+{
+	return;
+	
+	// openal init is here used only to simplify samples for this plugin
+	// if you want to use this interface in your own program, you'll
+	// probably want to remove the openal init/destroy lines
+	gDevice = alcOpenDevice(NULL);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		return;
+	}
+	gContext = alcCreateContext(gDevice, NULL);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		alcCloseDevice(gDevice);
+		gDevice = NULL;
+		return;
+	}
+	alcMakeContextCurrent(gContext);
+	if (alcGetError(gDevice) != ALC_NO_ERROR)
+	{
+		alcDestroyContext(gContext);
+		gContext = NULL;
+		alcCloseDevice(gDevice);
+		gDevice = NULL;
+	}
+}
+
+OpenAL_AudioInterfaceFactory::~OpenAL_AudioInterfaceFactory()
+{
+	return;
+	
+	if (gDevice != NULL)
+	{
+		alcMakeContextCurrent(NULL);
+		alcDestroyContext(gContext);
+		alcCloseDevice(gDevice);
+	}
+}
+
+OpenAL_AudioInterface* OpenAL_AudioInterfaceFactory::createInstance(theoraplayer::VideoClip* clip, int channelsCount, int frequency)
+{
+	return new OpenAL_AudioInterface(clip, channelsCount, frequency);
+}

+ 77 - 0
modules/theoraplayer/native/OpenAL_AudioInterface.h

@@ -0,0 +1,77 @@
+/// @file
+/// @version 2.0
+/// 
+/// @section LICENSE
+/// 
+/// This program is free software; you can redistribute it and/or modify it under
+/// the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
+/// 
+/// @section DESCRIPTION
+/// 
+/// Defines an audio interface for OpenAL.
+
+#ifndef THEORAPLAYER_DEMOS_OPENAL_AUDIO_INTERFACE_H
+#define THEORAPLAYER_DEMOS_OPENAL_AUDIO_INTERFACE_H
+
+#ifndef __APPLE__
+#include <AL/al.h>
+#include <AL/alc.h>
+#else
+#include <OpenAL/al.h>
+#include <OpenAL/alc.h>
+#endif
+#include <queue>
+
+#include <theoraplayer/AudioInterface.h>
+#include <theoraplayer/AudioInterfaceFactory.h>
+#include <theoraplayer/Timer.h>
+#include <theoraplayer/VideoClip.h>
+
+class OpenAL_AudioInterface : public theoraplayer::AudioInterface, theoraplayer::Timer
+{	
+public:
+	OpenAL_AudioInterface(theoraplayer::VideoClip* clip, int channelsCount, int frequency);
+	~OpenAL_AudioInterface();
+
+	//! queued audio buffers, expressed in seconds
+	float getQueuedAudioSize();
+
+	void insertData(float* data, int samplesCount);	
+
+	void update(float timeDelta);
+
+	void pause();
+	void play();
+	void seek(float time);
+
+private:
+	int sourceNumChannels;
+	int maxBuffSize;
+	int buffSize;
+	short* tempBuffer;
+	float currentTimer;
+
+	struct OpenAL_Buffer
+	{
+		ALuint id;
+		int samplesCount;
+	};
+	std::queue<OpenAL_Buffer> bufferQueue;
+
+	ALuint source;
+	int numProcessedSamples;
+	int numPlayedSamples;
+
+};
+
+class OpenAL_AudioInterfaceFactory : public theoraplayer::AudioInterfaceFactory
+{
+public:
+	OpenAL_AudioInterfaceFactory();
+	~OpenAL_AudioInterfaceFactory();
+
+	OpenAL_AudioInterface* createInstance(theoraplayer::VideoClip* clip, int channelsCount, int frequency);
+
+};
+
+#endif

+ 33 - 0
modules/theoraplayer/native/monkey2_glue.cpp

@@ -0,0 +1,33 @@
+
+#include "monkey2_glue.h"
+
+#include "theoraplayer.h"
+#include "Manager.h"
+#include "MemoryDataSource.h"
+
+theoraplayer::Manager *bb_theoraplayer_getManager(){
+
+	if( !theoraplayer::manager ) theoraplayer::init();
+	
+	return theoraplayer::manager;
+}
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const char *filename ){
+
+	return self->createVideoClip( filename );
+}
+
+//FIXME - leaks MemoryDataSource!
+//
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const void *data,int length ){
+
+	theoraplayer::MemoryDataSource *src=new theoraplayer::MemoryDataSource( (unsigned char*)data,length,"Theora" );
+	
+	return self->createVideoClip( src );
+}
+
+theoraplayer::MemoryDataSource *bb_theoraplayer_createMemoryDataSource( const void *data,int length,const char *formatName ){
+
+	return new theoraplayer::MemoryDataSource( (unsigned char*)data,length,formatName );
+}
+

+ 21 - 0
modules/theoraplayer/native/monkey2_glue.h

@@ -0,0 +1,21 @@
+
+#ifndef BB_THEORAPLAYER_GLUE_H
+#define BB_THEORAPLAYER_GLUE_H
+
+#include <bbmonkey.h>
+
+namespace theoraplayer{
+	class Manager;
+	class VideoClip;
+	class MemoryDataSource;
+}
+
+theoraplayer::Manager *bb_theoraplayer_getManager();
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const char *filename );
+
+theoraplayer::VideoClip *bb_theoraplayer_createVideoClip( theoraplayer::Manager *self,const void *data,int length );
+
+theoraplayer::MemoryDataSource *bb_theoraplayer_createMemoryDataSource( const void *data,int length,const char *formatName );
+
+#endif

+ 28 - 0
modules/theoraplayer/native/ogg/COPYING

@@ -0,0 +1,28 @@
+Copyright (c) 2002, Xiph.org Foundation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 11 - 0
modules/theoraplayer/native/ogg/include/ogg/config_types.h

@@ -0,0 +1,11 @@
+#ifndef __CONFIG_TYPES_H__
+#define __CONFIG_TYPES_H__
+
+/* these are filled in by configure */
+typedef short ogg_int16_t;
+typedef unsigned short ogg_uint16_t;
+typedef int ogg_int32_t;
+typedef unsigned int ogg_uint32_t;
+typedef long long ogg_int64_t;
+
+#endif

+ 25 - 0
modules/theoraplayer/native/ogg/include/ogg/config_types.h.in

@@ -0,0 +1,25 @@
+#ifndef __CONFIG_TYPES_H__
+#define __CONFIG_TYPES_H__
+
+/* these are filled in by configure */
+#define INCLUDE_INTTYPES_H @INCLUDE_INTTYPES_H@
+#define INCLUDE_STDINT_H @INCLUDE_STDINT_H@
+#define INCLUDE_SYS_TYPES_H @INCLUDE_SYS_TYPES_H@
+
+#if INCLUDE_INTTYPES_H
+#  include <inttypes.h>
+#endif
+#if INCLUDE_STDINT_H
+#  include <stdint.h>
+#endif
+#if INCLUDE_SYS_TYPES_H
+#  include <sys/types.h>
+#endif
+
+typedef @SIZE16@ ogg_int16_t;
+typedef @USIZE16@ ogg_uint16_t;
+typedef @SIZE32@ ogg_int32_t;
+typedef @USIZE32@ ogg_uint32_t;
+typedef @SIZE64@ ogg_int64_t;
+
+#endif

+ 210 - 0
modules/theoraplayer/native/ogg/include/ogg/ogg.h

@@ -0,0 +1,210 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2007             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: toplevel libogg include
+ last mod: $Id: ogg.h 18044 2011-08-01 17:55:20Z gmaxwell $
+
+ ********************************************************************/
+#ifndef _OGG_H
+#define _OGG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+#include <ogg/os_types.h>
+
+typedef struct {
+  void *iov_base;
+  size_t iov_len;
+} ogg_iovec_t;
+
+typedef struct {
+  long endbyte;
+  int  endbit;
+
+  unsigned char *buffer;
+  unsigned char *ptr;
+  long storage;
+} oggpack_buffer;
+
+/* ogg_page is used to encapsulate the data in one Ogg bitstream page *****/
+
+typedef struct {
+  unsigned char *header;
+  long header_len;
+  unsigned char *body;
+  long body_len;
+} ogg_page;
+
+/* ogg_stream_state contains the current encode/decode state of a logical
+   Ogg bitstream **********************************************************/
+
+typedef struct {
+  unsigned char   *body_data;    /* bytes from packet bodies */
+  long    body_storage;          /* storage elements allocated */
+  long    body_fill;             /* elements stored; fill mark */
+  long    body_returned;         /* elements of fill returned */
+
+
+  int     *lacing_vals;      /* The values that will go to the segment table */
+  ogg_int64_t *granule_vals; /* granulepos values for headers. Not compact
+                                this way, but it is simple coupled to the
+                                lacing fifo */
+  long    lacing_storage;
+  long    lacing_fill;
+  long    lacing_packet;
+  long    lacing_returned;
+
+  unsigned char    header[282];      /* working space for header encode */
+  int              header_fill;
+
+  int     e_o_s;          /* set when we have buffered the last packet in the
+                             logical bitstream */
+  int     b_o_s;          /* set after we've written the initial page
+                             of a logical bitstream */
+  long    serialno;
+  long    pageno;
+  ogg_int64_t  packetno;  /* sequence number for decode; the framing
+                             knows where there's a hole in the data,
+                             but we need coupling so that the codec
+                             (which is in a separate abstraction
+                             layer) also knows about the gap */
+  ogg_int64_t   granulepos;
+
+} ogg_stream_state;
+
+/* ogg_packet is used to encapsulate the data and metadata belonging
+   to a single raw Ogg/Vorbis packet *************************************/
+
+typedef struct {
+  unsigned char *packet;
+  long  bytes;
+  long  b_o_s;
+  long  e_o_s;
+
+  ogg_int64_t  granulepos;
+
+  ogg_int64_t  packetno;     /* sequence number for decode; the framing
+                                knows where there's a hole in the data,
+                                but we need coupling so that the codec
+                                (which is in a separate abstraction
+                                layer) also knows about the gap */
+} ogg_packet;
+
+typedef struct {
+  unsigned char *data;
+  int storage;
+  int fill;
+  int returned;
+
+  int unsynced;
+  int headerbytes;
+  int bodybytes;
+} ogg_sync_state;
+
+/* Ogg BITSTREAM PRIMITIVES: bitstream ************************/
+
+extern void  oggpack_writeinit(oggpack_buffer *b);
+extern int   oggpack_writecheck(oggpack_buffer *b);
+extern void  oggpack_writetrunc(oggpack_buffer *b,long bits);
+extern void  oggpack_writealign(oggpack_buffer *b);
+extern void  oggpack_writecopy(oggpack_buffer *b,void *source,long bits);
+extern void  oggpack_reset(oggpack_buffer *b);
+extern void  oggpack_writeclear(oggpack_buffer *b);
+extern void  oggpack_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
+extern void  oggpack_write(oggpack_buffer *b,unsigned long value,int bits);
+extern long  oggpack_look(oggpack_buffer *b,int bits);
+extern long  oggpack_look1(oggpack_buffer *b);
+extern void  oggpack_adv(oggpack_buffer *b,int bits);
+extern void  oggpack_adv1(oggpack_buffer *b);
+extern long  oggpack_read(oggpack_buffer *b,int bits);
+extern long  oggpack_read1(oggpack_buffer *b);
+extern long  oggpack_bytes(oggpack_buffer *b);
+extern long  oggpack_bits(oggpack_buffer *b);
+extern unsigned char *oggpack_get_buffer(oggpack_buffer *b);
+
+extern void  oggpackB_writeinit(oggpack_buffer *b);
+extern int   oggpackB_writecheck(oggpack_buffer *b);
+extern void  oggpackB_writetrunc(oggpack_buffer *b,long bits);
+extern void  oggpackB_writealign(oggpack_buffer *b);
+extern void  oggpackB_writecopy(oggpack_buffer *b,void *source,long bits);
+extern void  oggpackB_reset(oggpack_buffer *b);
+extern void  oggpackB_writeclear(oggpack_buffer *b);
+extern void  oggpackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes);
+extern void  oggpackB_write(oggpack_buffer *b,unsigned long value,int bits);
+extern long  oggpackB_look(oggpack_buffer *b,int bits);
+extern long  oggpackB_look1(oggpack_buffer *b);
+extern void  oggpackB_adv(oggpack_buffer *b,int bits);
+extern void  oggpackB_adv1(oggpack_buffer *b);
+extern long  oggpackB_read(oggpack_buffer *b,int bits);
+extern long  oggpackB_read1(oggpack_buffer *b);
+extern long  oggpackB_bytes(oggpack_buffer *b);
+extern long  oggpackB_bits(oggpack_buffer *b);
+extern unsigned char *oggpackB_get_buffer(oggpack_buffer *b);
+
+/* Ogg BITSTREAM PRIMITIVES: encoding **************************/
+
+extern int      ogg_stream_packetin(ogg_stream_state *os, ogg_packet *op);
+extern int      ogg_stream_iovecin(ogg_stream_state *os, ogg_iovec_t *iov,
+                                   int count, long e_o_s, ogg_int64_t granulepos);
+extern int      ogg_stream_pageout(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_pageout_fill(ogg_stream_state *os, ogg_page *og, int nfill);
+extern int      ogg_stream_flush(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_flush_fill(ogg_stream_state *os, ogg_page *og, int nfill);
+
+/* Ogg BITSTREAM PRIMITIVES: decoding **************************/
+
+extern int      ogg_sync_init(ogg_sync_state *oy);
+extern int      ogg_sync_clear(ogg_sync_state *oy);
+extern int      ogg_sync_reset(ogg_sync_state *oy);
+extern int      ogg_sync_destroy(ogg_sync_state *oy);
+extern int      ogg_sync_check(ogg_sync_state *oy);
+
+extern char    *ogg_sync_buffer(ogg_sync_state *oy, long size);
+extern int      ogg_sync_wrote(ogg_sync_state *oy, long bytes);
+extern long     ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og);
+extern int      ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og);
+extern int      ogg_stream_pagein(ogg_stream_state *os, ogg_page *og);
+extern int      ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op);
+extern int      ogg_stream_packetpeek(ogg_stream_state *os,ogg_packet *op);
+
+/* Ogg BITSTREAM PRIMITIVES: general ***************************/
+
+extern int      ogg_stream_init(ogg_stream_state *os,int serialno);
+extern int      ogg_stream_clear(ogg_stream_state *os);
+extern int      ogg_stream_reset(ogg_stream_state *os);
+extern int      ogg_stream_reset_serialno(ogg_stream_state *os,int serialno);
+extern int      ogg_stream_destroy(ogg_stream_state *os);
+extern int      ogg_stream_check(ogg_stream_state *os);
+extern int      ogg_stream_eos(ogg_stream_state *os);
+
+extern void     ogg_page_checksum_set(ogg_page *og);
+
+extern int      ogg_page_version(const ogg_page *og);
+extern int      ogg_page_continued(const ogg_page *og);
+extern int      ogg_page_bos(const ogg_page *og);
+extern int      ogg_page_eos(const ogg_page *og);
+extern ogg_int64_t  ogg_page_granulepos(const ogg_page *og);
+extern int      ogg_page_serialno(const ogg_page *og);
+extern long     ogg_page_pageno(const ogg_page *og);
+extern int      ogg_page_packets(const ogg_page *og);
+
+extern void     ogg_packet_clear(ogg_packet *op);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* _OGG_H */
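
For reference, the decode-side API declared above (ogg_sync_* to assemble pages from raw
bytes, ogg_stream_* to turn pages back into packets) is typically driven as in the
following minimal sketch. This is illustrative only and not part of the imported libogg
sources; it reads from stdin purely for demonstration and latches onto the first logical
stream it sees, which a real player would generalise.

/* Minimal decode loop sketch: bytes in via ogg_sync_*, pages out,
   pages routed to a logical stream, raw packets out of that.
   Illustrative only -- not part of the imported sources. */
#include <stdio.h>
#include <ogg/ogg.h>

int main(void){
  ogg_sync_state   oy;
  ogg_stream_state os;
  ogg_page         og;
  ogg_packet       op;
  int              have_stream=0;

  ogg_sync_init(&oy);
  for(;;){
    char  *buf=ogg_sync_buffer(&oy,4096);        /* expose 4096 bytes of buffer space */
    size_t n  =fread(buf,1,4096,stdin);          /* the application supplies the bytes */
    if(n==0) break;
    ogg_sync_wrote(&oy,(long)n);                 /* commit what was written            */
    while(ogg_sync_pageout(&oy,&og)==1){         /* 1 == a whole page is ready         */
      if(!have_stream){                          /* latch onto the first serialno      */
        ogg_stream_init(&os,ogg_page_serialno(&og));
        have_stream=1;
      }
      ogg_stream_pagein(&os,&og);                /* hand the page to its stream        */
      while(ogg_stream_packetout(&os,&op)==1){
        /* op.packet / op.bytes now hold one raw packet for the codec layer */
      }
    }
  }
  if(have_stream) ogg_stream_clear(&os);
  ogg_sync_clear(&oy);
  return 0;
}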

+ 147 - 0
modules/theoraplayer/native/ogg/include/ogg/os_types.h

@@ -0,0 +1,147 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2002             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: #ifdef jail to whip a few platforms into the UNIX ideal.
+ last mod: $Id: os_types.h 17712 2010-12-03 17:10:02Z xiphmont $
+
+ ********************************************************************/
+#ifndef _OS_TYPES_H
+#define _OS_TYPES_H
+
+/* make it easy on the folks that want to compile the libs with a
+   different malloc than stdlib */
+#define _ogg_malloc  malloc
+#define _ogg_calloc  calloc
+#define _ogg_realloc realloc
+#define _ogg_free    free
+
+#if defined(_WIN32) 
+
+#  if defined(__CYGWIN__)
+#    include <stdint.h>
+     typedef int16_t ogg_int16_t;
+     typedef uint16_t ogg_uint16_t;
+     typedef int32_t ogg_int32_t;
+     typedef uint32_t ogg_uint32_t;
+     typedef int64_t ogg_int64_t;
+     typedef uint64_t ogg_uint64_t;
+#  elif defined(__MINGW32__)
+#    include <sys/types.h>
+     typedef short ogg_int16_t;
+     typedef unsigned short ogg_uint16_t;
+     typedef int ogg_int32_t;
+     typedef unsigned int ogg_uint32_t;
+     typedef long long ogg_int64_t;
+     typedef unsigned long long ogg_uint64_t;
+#  elif defined(__MWERKS__)
+     typedef long long ogg_int64_t;
+     typedef int ogg_int32_t;
+     typedef unsigned int ogg_uint32_t;
+     typedef short ogg_int16_t;
+     typedef unsigned short ogg_uint16_t;
+#  else
+     /* MSVC/Borland */
+     typedef __int64 ogg_int64_t;
+     typedef __int32 ogg_int32_t;
+     typedef unsigned __int32 ogg_uint32_t;
+     typedef __int16 ogg_int16_t;
+     typedef unsigned __int16 ogg_uint16_t;
+#  endif
+
+#elif defined(__MACOS__)
+
+#  include <sys/types.h>
+   typedef SInt16 ogg_int16_t;
+   typedef UInt16 ogg_uint16_t;
+   typedef SInt32 ogg_int32_t;
+   typedef UInt32 ogg_uint32_t;
+   typedef SInt64 ogg_int64_t;
+
+#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
+
+#  include <inttypes.h>
+   typedef int16_t ogg_int16_t;
+   typedef uint16_t ogg_uint16_t;
+   typedef int32_t ogg_int32_t;
+   typedef uint32_t ogg_uint32_t;
+   typedef int64_t ogg_int64_t;
+
+#elif defined(__HAIKU__)
+
+  /* Haiku */
+#  include <sys/types.h>
+   typedef short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined(__BEOS__)
+
+   /* Be */
+#  include <inttypes.h>
+   typedef int16_t ogg_int16_t;
+   typedef uint16_t ogg_uint16_t;
+   typedef int32_t ogg_int32_t;
+   typedef uint32_t ogg_uint32_t;
+   typedef int64_t ogg_int64_t;
+
+#elif defined (__EMX__)
+
+   /* OS/2 GCC */
+   typedef short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined (DJGPP)
+
+   /* DJGPP */
+   typedef short ogg_int16_t;
+   typedef int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long ogg_int64_t;
+
+#elif defined(R5900)
+
+   /* PS2 EE */
+   typedef long ogg_int64_t;
+   typedef int ogg_int32_t;
+   typedef unsigned ogg_uint32_t;
+   typedef short ogg_int16_t;
+
+#elif defined(__SYMBIAN32__)
+
+   /* Symbian GCC */
+   typedef signed short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef signed int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long int ogg_int64_t;
+
+#elif defined(__TMS320C6X__)
+
+   /* TI C64x compiler */
+   typedef signed short ogg_int16_t;
+   typedef unsigned short ogg_uint16_t;
+   typedef signed int ogg_int32_t;
+   typedef unsigned int ogg_uint32_t;
+   typedef long long int ogg_int64_t;
+
+#else
+
+#  include <ogg/config_types.h>
+
+#endif
+
+#endif  /* _OS_TYPES_H */
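
The whole point of the #ifdef jail above is that the ogg_int*_t typedefs end up with
fixed widths on every platform. A quick compile-time sanity check of that intent could
look like the sketch below; it is illustrative only (not part of the imported sources)
and uses the classic negative-array-size trick so it stays valid C89 like the rest of
the code.

/* Illustrative only: fails to compile if the platform branch above
   picked types of unexpected widths. */
#include <ogg/os_types.h>

typedef char ogg_int16_is_2_bytes[sizeof(ogg_int16_t)==2 ? 1 : -1];
typedef char ogg_int32_is_4_bytes[sizeof(ogg_int32_t)==4 ? 1 : -1];
typedef char ogg_int64_is_8_bytes[sizeof(ogg_int64_t)==8 ? 1 : -1];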

+ 15 - 0
modules/theoraplayer/native/ogg/libtheoraplayer-readme.txt

@@ -0,0 +1,15 @@
+libogg's source code is provided here in a minimalist distribution format,
+with all source files not needed for compiling libtheoraplayer removed.
+
+- The project files were modified to fit libtheoraplayer's binary output
+  folder structure.
+- Some project files missing from the original source distribution were added
+  to support compiling libtheoraplayer on those platforms.
+- Some code may have been changed to address certain compiler/platform-specific
+  problems; such changes are indicated in the source code.
+
+libogg is owned and maintained by the Xiph.Org Foundation; this distribution is
+included here only for convenience and easier compilation of libtheoraplayer.
+
+If you want to use libogg outside of libtheoraplayer, you are encouraged to use
+the original source distribution from Xiph: http://xiph.org/

+ 857 - 0
modules/theoraplayer/native/ogg/src/bitwise.c

@@ -0,0 +1,857 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE.              *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 18051 2011-08-04 17:56:39Z giles $
+
+ ********************************************************************/
+
+/* We're 'LSb' endian; if we write a word but read individual bits,
+   then we'll read the lsb first */
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <ogg/ogg.h>
+
+#define BUFFER_INCREMENT 256
+
+static const unsigned long mask[]=
+{0x00000000,0x00000001,0x00000003,0x00000007,0x0000000f,
+ 0x0000001f,0x0000003f,0x0000007f,0x000000ff,0x000001ff,
+ 0x000003ff,0x000007ff,0x00000fff,0x00001fff,0x00003fff,
+ 0x00007fff,0x0000ffff,0x0001ffff,0x0003ffff,0x0007ffff,
+ 0x000fffff,0x001fffff,0x003fffff,0x007fffff,0x00ffffff,
+ 0x01ffffff,0x03ffffff,0x07ffffff,0x0fffffff,0x1fffffff,
+ 0x3fffffff,0x7fffffff,0xffffffff };
+
+static const unsigned int mask8B[]=
+{0x00,0x80,0xc0,0xe0,0xf0,0xf8,0xfc,0xfe,0xff};
+
+void oggpack_writeinit(oggpack_buffer *b){
+  memset(b,0,sizeof(*b));
+  b->ptr=b->buffer=_ogg_malloc(BUFFER_INCREMENT);
+  b->buffer[0]='\0';
+  b->storage=BUFFER_INCREMENT;
+}
+
+void oggpackB_writeinit(oggpack_buffer *b){
+  oggpack_writeinit(b);
+}
+
+int oggpack_writecheck(oggpack_buffer *b){
+  if(!b->ptr || !b->storage)return -1;
+  return 0;
+}
+
+int oggpackB_writecheck(oggpack_buffer *b){
+  return oggpack_writecheck(b);
+}
+
+void oggpack_writetrunc(oggpack_buffer *b,long bits){
+  long bytes=bits>>3;
+  if(b->ptr){
+    bits-=bytes*8;
+    b->ptr=b->buffer+bytes;
+    b->endbit=bits;
+    b->endbyte=bytes;
+    *b->ptr&=mask[bits];
+  }
+}
+
+void oggpackB_writetrunc(oggpack_buffer *b,long bits){
+  long bytes=bits>>3;
+  if(b->ptr){
+    bits-=bytes*8;
+    b->ptr=b->buffer+bytes;
+    b->endbit=bits;
+    b->endbyte=bytes;
+    *b->ptr&=mask8B[bits];
+  }
+}
+
+/* Takes only up to 32 bits. */
+void oggpack_write(oggpack_buffer *b,unsigned long value,int bits){
+  if(bits<0 || bits>32) goto err;
+  if(b->endbyte>=b->storage-4){
+    void *ret;
+    if(!b->ptr)return;
+    if(b->storage>LONG_MAX-BUFFER_INCREMENT) goto err;
+    ret=_ogg_realloc(b->buffer,b->storage+BUFFER_INCREMENT);
+    if(!ret) goto err;
+    b->buffer=ret;
+    b->storage+=BUFFER_INCREMENT;
+    b->ptr=b->buffer+b->endbyte;
+  }
+
+  value&=mask[bits];
+  bits+=b->endbit;
+
+  b->ptr[0]|=value<<b->endbit;
+
+  if(bits>=8){
+    b->ptr[1]=(unsigned char)(value>>(8-b->endbit));
+    if(bits>=16){
+      b->ptr[2]=(unsigned char)(value>>(16-b->endbit));
+      if(bits>=24){
+        b->ptr[3]=(unsigned char)(value>>(24-b->endbit));
+        if(bits>=32){
+          if(b->endbit)
+            b->ptr[4]=(unsigned char)(value>>(32-b->endbit));
+          else
+            b->ptr[4]=0;
+        }
+      }
+    }
+  }
+
+  b->endbyte+=bits/8;
+  b->ptr+=bits/8;
+  b->endbit=bits&7;
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+/* Takes only up to 32 bits. */
+void oggpackB_write(oggpack_buffer *b,unsigned long value,int bits){
+  if(bits<0 || bits>32) goto err;
+  if(b->endbyte>=b->storage-4){
+    void *ret;
+    if(!b->ptr)return;
+    if(b->storage>LONG_MAX-BUFFER_INCREMENT) goto err;
+    ret=_ogg_realloc(b->buffer,b->storage+BUFFER_INCREMENT);
+    if(!ret) goto err;
+    b->buffer=ret;
+    b->storage+=BUFFER_INCREMENT;
+    b->ptr=b->buffer+b->endbyte;
+  }
+
+  value=(value&mask[bits])<<(32-bits);
+  bits+=b->endbit;
+
+  b->ptr[0]|=value>>(24+b->endbit);
+
+  if(bits>=8){
+    b->ptr[1]=(unsigned char)(value>>(16+b->endbit));
+    if(bits>=16){
+      b->ptr[2]=(unsigned char)(value>>(8+b->endbit));
+      if(bits>=24){
+        b->ptr[3]=(unsigned char)(value>>(b->endbit));
+        if(bits>=32){
+          if(b->endbit)
+            b->ptr[4]=(unsigned char)(value<<(8-b->endbit));
+          else
+            b->ptr[4]=0;
+        }
+      }
+    }
+  }
+
+  b->endbyte+=bits/8;
+  b->ptr+=bits/8;
+  b->endbit=bits&7;
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+void oggpack_writealign(oggpack_buffer *b){
+  int bits=8-b->endbit;
+  if(bits<8)
+    oggpack_write(b,0,bits);
+}
+
+void oggpackB_writealign(oggpack_buffer *b){
+  int bits=8-b->endbit;
+  if(bits<8)
+    oggpackB_write(b,0,bits);
+}
+
+static void oggpack_writecopy_helper(oggpack_buffer *b,
+                                     void *source,
+                                     long bits,
+                                     void (*w)(oggpack_buffer *,
+                                               unsigned long,
+                                               int),
+                                     int msb){
+  unsigned char *ptr=(unsigned char *)source;
+
+  long bytes=bits/8;
+  bits-=bytes*8;
+
+  if(b->endbit){
+    int i;
+    /* unaligned copy.  Do it the hard way. */
+    for(i=0;i<bytes;i++)
+      w(b,(unsigned long)(ptr[i]),8);
+  }else{
+    /* aligned block copy */
+    if(b->endbyte+bytes+1>=b->storage){
+      void *ret;
+      if(!b->ptr) goto err;
+      if(b->endbyte+bytes+BUFFER_INCREMENT>b->storage) goto err;
+      b->storage=b->endbyte+bytes+BUFFER_INCREMENT;
+      ret=_ogg_realloc(b->buffer,b->storage);
+      if(!ret) goto err;
+      b->buffer=ret;
+      b->ptr=b->buffer+b->endbyte;
+    }
+
+    memmove(b->ptr,source,bytes);
+    b->ptr+=bytes;
+    b->endbyte+=bytes;
+    *b->ptr=0;
+
+  }
+  if(bits){
+    if(msb)
+      w(b,(unsigned long)(ptr[bytes]>>(8-bits)),bits);
+    else
+      w(b,(unsigned long)(ptr[bytes]),bits);
+  }
+  return;
+ err:
+  oggpack_writeclear(b);
+}
+
+void oggpack_writecopy(oggpack_buffer *b,void *source,long bits){
+  oggpack_writecopy_helper(b,source,bits,oggpack_write,0);
+}
+
+void oggpackB_writecopy(oggpack_buffer *b,void *source,long bits){
+  oggpack_writecopy_helper(b,source,bits,oggpackB_write,1);
+}
+
+void oggpack_reset(oggpack_buffer *b){
+  if(!b->ptr)return;
+  b->ptr=b->buffer;
+  b->buffer[0]=0;
+  b->endbit=b->endbyte=0;
+}
+
+void oggpackB_reset(oggpack_buffer *b){
+  oggpack_reset(b);
+}
+
+void oggpack_writeclear(oggpack_buffer *b){
+  if(b->buffer)_ogg_free(b->buffer);
+  memset(b,0,sizeof(*b));
+}
+
+void oggpackB_writeclear(oggpack_buffer *b){
+  oggpack_writeclear(b);
+}
+
+void oggpack_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
+  memset(b,0,sizeof(*b));
+  b->buffer=b->ptr=buf;
+  b->storage=bytes;
+}
+
+void oggpackB_readinit(oggpack_buffer *b,unsigned char *buf,int bytes){
+  oggpack_readinit(b,buf,bytes);
+}
+
+/* Read in bits without advancing the bitptr; bits <= 32 */
+long oggpack_look(oggpack_buffer *b,int bits){
+  unsigned long ret;
+  unsigned long m;
+
+  if(bits<0 || bits>32) return -1;
+  m=mask[bits];
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) return -1;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]>>b->endbit;
+  if(bits>8){
+    ret|=b->ptr[1]<<(8-b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(16-b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(24-b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]<<(32-b->endbit);
+      }
+    }
+  }
+  return(m&ret);
+}
+
+/* Read in bits without advancing the bitptr; bits <= 32 */
+long oggpackB_look(oggpack_buffer *b,int bits){
+  unsigned long ret;
+  int m=32-bits;
+
+  if(m<0 || m>32) return -1;
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) return -1;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]<<(24+b->endbit);
+  if(bits>8){
+    ret|=b->ptr[1]<<(16+b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(8+b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]>>(8-b->endbit);
+      }
+    }
+  }
+  return ((ret&0xffffffff)>>(m>>1))>>((m+1)>>1);
+}
+
+long oggpack_look1(oggpack_buffer *b){
+  if(b->endbyte>=b->storage)return(-1);
+  return((b->ptr[0]>>b->endbit)&1);
+}
+
+long oggpackB_look1(oggpack_buffer *b){
+  if(b->endbyte>=b->storage)return(-1);
+  return((b->ptr[0]>>(7-b->endbit))&1);
+}
+
+void oggpack_adv(oggpack_buffer *b,int bits){
+  bits+=b->endbit;
+
+  if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+}
+
+void oggpackB_adv(oggpack_buffer *b,int bits){
+  oggpack_adv(b,bits);
+}
+
+void oggpack_adv1(oggpack_buffer *b){
+  if(++(b->endbit)>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+}
+
+void oggpackB_adv1(oggpack_buffer *b){
+  oggpack_adv1(b);
+}
+
+/* bits <= 32 */
+long oggpack_read(oggpack_buffer *b,int bits){
+  long ret;
+  unsigned long m;
+
+  if(bits<0 || bits>32) goto err;
+  m=mask[bits];
+  bits+=b->endbit;
+
+  if(b->endbyte >= b->storage-4){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]>>b->endbit;
+  if(bits>8){
+    ret|=b->ptr[1]<<(8-b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(16-b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(24-b->endbit);
+        if(bits>32 && b->endbit){
+          ret|=b->ptr[4]<<(32-b->endbit);
+        }
+      }
+    }
+  }
+  ret&=m;
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return ret;
+
+ overflow:
+ err:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+/* bits <= 32 */
+long oggpackB_read(oggpack_buffer *b,int bits){
+  long ret;
+  long m=32-bits;
+
+  if(m<0 || m>32) goto err;
+  bits+=b->endbit;
+
+  if(b->endbyte+4>=b->storage){
+    /* not the main path */
+    if(b->endbyte > b->storage-((bits+7)>>3)) goto overflow;
+    /* special case to avoid reading b->ptr[0], which might be past the end of
+        the buffer; also skips some useless accounting */
+    else if(!bits)return(0L);
+  }
+
+  ret=b->ptr[0]<<(24+b->endbit);
+  if(bits>8){
+    ret|=b->ptr[1]<<(16+b->endbit);
+    if(bits>16){
+      ret|=b->ptr[2]<<(8+b->endbit);
+      if(bits>24){
+        ret|=b->ptr[3]<<(b->endbit);
+        if(bits>32 && b->endbit)
+          ret|=b->ptr[4]>>(8-b->endbit);
+      }
+    }
+  }
+  ret=((ret&0xffffffffUL)>>(m>>1))>>((m+1)>>1);
+
+  b->ptr+=bits/8;
+  b->endbyte+=bits/8;
+  b->endbit=bits&7;
+  return ret;
+
+ overflow:
+ err:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpack_read1(oggpack_buffer *b){
+  long ret;
+
+  if(b->endbyte >= b->storage) goto overflow;
+  ret=(b->ptr[0]>>b->endbit)&1;
+
+  b->endbit++;
+  if(b->endbit>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+  return ret;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpackB_read1(oggpack_buffer *b){
+  long ret;
+
+  if(b->endbyte >= b->storage) goto overflow;
+  ret=(b->ptr[0]>>(7-b->endbit))&1;
+
+  b->endbit++;
+  if(b->endbit>7){
+    b->endbit=0;
+    b->ptr++;
+    b->endbyte++;
+  }
+  return ret;
+
+ overflow:
+  b->ptr=NULL;
+  b->endbyte=b->storage;
+  b->endbit=1;
+  return -1L;
+}
+
+long oggpack_bytes(oggpack_buffer *b){
+  return(b->endbyte+(b->endbit+7)/8);
+}
+
+long oggpack_bits(oggpack_buffer *b){
+  return(b->endbyte*8+b->endbit);
+}
+
+long oggpackB_bytes(oggpack_buffer *b){
+  return oggpack_bytes(b);
+}
+
+long oggpackB_bits(oggpack_buffer *b){
+  return oggpack_bits(b);
+}
+
+unsigned char *oggpack_get_buffer(oggpack_buffer *b){
+  return(b->buffer);
+}
+
+unsigned char *oggpackB_get_buffer(oggpack_buffer *b){
+  return oggpack_get_buffer(b);
+}
+
+/* Self test of the bitwise routines; everything else is based on
+   them, so they damned well better be solid. */
+
+#ifdef _V_SELFTEST
+#include <stdio.h>
+
+static int ilog(unsigned int v){
+  int ret=0;
+  while(v){
+    ret++;
+    v>>=1;
+  }
+  return(ret);
+}
+
+oggpack_buffer o;
+oggpack_buffer r;
+
+void report(char *in){
+  fprintf(stderr,"%s",in);
+  exit(1);
+}
+
+void cliptest(unsigned long *b,int vals,int bits,int *comp,int compsize){
+  long bytes,i;
+  unsigned char *buffer;
+
+  oggpack_reset(&o);
+  for(i=0;i<vals;i++)
+    oggpack_write(&o,b[i],bits?bits:ilog(b[i]));
+  buffer=oggpack_get_buffer(&o);
+  bytes=oggpack_bytes(&o);
+  if(bytes!=compsize)report("wrong number of bytes!\n");
+  for(i=0;i<bytes;i++)if(buffer[i]!=comp[i]){
+    for(i=0;i<bytes;i++)fprintf(stderr,"%x %x\n",(int)buffer[i],(int)comp[i]);
+    report("wrote incorrect value!\n");
+  }
+  oggpack_readinit(&r,buffer,bytes);
+  for(i=0;i<vals;i++){
+    int tbit=bits?bits:ilog(b[i]);
+    if(oggpack_look(&r,tbit)==-1)
+      report("out of data!\n");
+    if(oggpack_look(&r,tbit)!=(b[i]&mask[tbit]))
+      report("looked at incorrect value!\n");
+    if(tbit==1)
+      if(oggpack_look1(&r)!=(b[i]&mask[tbit]))
+        report("looked at single bit incorrect value!\n");
+    if(tbit==1){
+      if(oggpack_read1(&r)!=(b[i]&mask[tbit]))
+        report("read incorrect single bit value!\n");
+    }else{
+    if(oggpack_read(&r,tbit)!=(b[i]&mask[tbit]))
+      report("read incorrect value!\n");
+    }
+  }
+  if(oggpack_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+}
+
+void cliptestB(unsigned long *b,int vals,int bits,int *comp,int compsize){
+  long bytes,i;
+  unsigned char *buffer;
+
+  oggpackB_reset(&o);
+  for(i=0;i<vals;i++)
+    oggpackB_write(&o,b[i],bits?bits:ilog(b[i]));
+  buffer=oggpackB_get_buffer(&o);
+  bytes=oggpackB_bytes(&o);
+  if(bytes!=compsize)report("wrong number of bytes!\n");
+  for(i=0;i<bytes;i++)if(buffer[i]!=comp[i]){
+    for(i=0;i<bytes;i++)fprintf(stderr,"%x %x\n",(int)buffer[i],(int)comp[i]);
+    report("wrote incorrect value!\n");
+  }
+  oggpackB_readinit(&r,buffer,bytes);
+  for(i=0;i<vals;i++){
+    int tbit=bits?bits:ilog(b[i]);
+    if(oggpackB_look(&r,tbit)==-1)
+      report("out of data!\n");
+    if(oggpackB_look(&r,tbit)!=(b[i]&mask[tbit]))
+      report("looked at incorrect value!\n");
+    if(tbit==1)
+      if(oggpackB_look1(&r)!=(b[i]&mask[tbit]))
+        report("looked at single bit incorrect value!\n");
+    if(tbit==1){
+      if(oggpackB_read1(&r)!=(b[i]&mask[tbit]))
+        report("read incorrect single bit value!\n");
+    }else{
+    if(oggpackB_read(&r,tbit)!=(b[i]&mask[tbit]))
+      report("read incorrect value!\n");
+    }
+  }
+  if(oggpackB_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+}
+
+int main(void){
+  unsigned char *buffer;
+  long bytes,i;
+  static unsigned long testbuffer1[]=
+    {18,12,103948,4325,543,76,432,52,3,65,4,56,32,42,34,21,1,23,32,546,456,7,
+       567,56,8,8,55,3,52,342,341,4,265,7,67,86,2199,21,7,1,5,1,4};
+  int test1size=43;
+
+  static unsigned long testbuffer2[]=
+    {216531625L,1237861823,56732452,131,3212421,12325343,34547562,12313212,
+       1233432,534,5,346435231,14436467,7869299,76326614,167548585,
+       85525151,0,12321,1,349528352};
+  int test2size=21;
+
+  static unsigned long testbuffer3[]=
+    {1,0,14,0,1,0,12,0,1,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,0,0,1,
+       0,1,30,1,1,1,0,0,1,0,0,0,12,0,11,0,1,0,0,1};
+  int test3size=56;
+
+  static unsigned long large[]=
+    {2136531625L,2137861823,56732452,131,3212421,12325343,34547562,12313212,
+       1233432,534,5,2146435231,14436467,7869299,76326614,167548585,
+       85525151,0,12321,1,2146528352};
+
+  int onesize=33;
+  static int one[33]={146,25,44,151,195,15,153,176,233,131,196,65,85,172,47,40,
+                    34,242,223,136,35,222,211,86,171,50,225,135,214,75,172,
+                    223,4};
+  static int oneB[33]={150,101,131,33,203,15,204,216,105,193,156,65,84,85,222,
+                       8,139,145,227,126,34,55,244,171,85,100,39,195,173,18,
+                       245,251,128};
+
+  int twosize=6;
+  static int two[6]={61,255,255,251,231,29};
+  static int twoB[6]={247,63,255,253,249,120};
+
+  int threesize=54;
+  static int three[54]={169,2,232,252,91,132,156,36,89,13,123,176,144,32,254,
+                      142,224,85,59,121,144,79,124,23,67,90,90,216,79,23,83,
+                      58,135,196,61,55,129,183,54,101,100,170,37,127,126,10,
+                      100,52,4,14,18,86,77,1};
+  static int threeB[54]={206,128,42,153,57,8,183,251,13,89,36,30,32,144,183,
+                         130,59,240,121,59,85,223,19,228,180,134,33,107,74,98,
+                         233,253,196,135,63,2,110,114,50,155,90,127,37,170,104,
+                         200,20,254,4,58,106,176,144,0};
+
+  int foursize=38;
+  static int four[38]={18,6,163,252,97,194,104,131,32,1,7,82,137,42,129,11,72,
+                     132,60,220,112,8,196,109,64,179,86,9,137,195,208,122,169,
+                     28,2,133,0,1};
+  static int fourB[38]={36,48,102,83,243,24,52,7,4,35,132,10,145,21,2,93,2,41,
+                        1,219,184,16,33,184,54,149,170,132,18,30,29,98,229,67,
+                        129,10,4,32};
+
+  int fivesize=45;
+  static int five[45]={169,2,126,139,144,172,30,4,80,72,240,59,130,218,73,62,
+                     241,24,210,44,4,20,0,248,116,49,135,100,110,130,181,169,
+                     84,75,159,2,1,0,132,192,8,0,0,18,22};
+  static int fiveB[45]={1,84,145,111,245,100,128,8,56,36,40,71,126,78,213,226,
+                        124,105,12,0,133,128,0,162,233,242,67,152,77,205,77,
+                        172,150,169,129,79,128,0,6,4,32,0,27,9,0};
+
+  int sixsize=7;
+  static int six[7]={17,177,170,242,169,19,148};
+  static int sixB[7]={136,141,85,79,149,200,41};
+
+  /* Test read/write together */
+  /* Later we test against pregenerated bitstreams */
+  oggpack_writeinit(&o);
+
+  fprintf(stderr,"\nSmall preclipped packing (LSb): ");
+  cliptest(testbuffer1,test1size,0,one,onesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nNull bit call (LSb): ");
+  cliptest(testbuffer3,test3size,0,two,twosize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge preclipped packing (LSb): ");
+  cliptest(testbuffer2,test2size,0,three,threesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\n32 bit preclipped packing (LSb): ");
+  oggpack_reset(&o);
+  for(i=0;i<test2size;i++)
+    oggpack_write(&o,large[i],32);
+  buffer=oggpack_get_buffer(&o);
+  bytes=oggpack_bytes(&o);
+  oggpack_readinit(&r,buffer,bytes);
+  for(i=0;i<test2size;i++){
+    if(oggpack_look(&r,32)==-1)report("out of data. failed!");
+    if(oggpack_look(&r,32)!=large[i]){
+      fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpack_look(&r,32),large[i],
+              oggpack_look(&r,32),large[i]);
+      report("read incorrect value!\n");
+    }
+    oggpack_adv(&r,32);
+  }
+  if(oggpack_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSmall unclipped packing (LSb): ");
+  cliptest(testbuffer1,test1size,7,four,foursize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge unclipped packing (LSb): ");
+  cliptest(testbuffer2,test2size,17,five,fivesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSingle bit unclipped packing (LSb): ");
+  cliptest(testbuffer3,test3size,1,six,sixsize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nTesting read past end (LSb): ");
+  oggpack_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  for(i=0;i<64;i++){
+    if(oggpack_read(&r,1)!=0){
+      fprintf(stderr,"failed; got -1 prematurely.\n");
+      exit(1);
+    }
+  }
+  if(oggpack_look(&r,1)!=-1 ||
+     oggpack_read(&r,1)!=-1){
+      fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpack_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  if(oggpack_read(&r,30)!=0 || oggpack_read(&r,16)!=0){
+      fprintf(stderr,"failed 2; got -1 prematurely.\n");
+      exit(1);
+  }
+
+  if(oggpack_look(&r,18)!=0 ||
+     oggpack_look(&r,18)!=0){
+    fprintf(stderr,"failed 3; got -1 prematurely.\n");
+      exit(1);
+  }
+  if(oggpack_look(&r,19)!=-1 ||
+     oggpack_look(&r,19)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  if(oggpack_look(&r,32)!=-1 ||
+     oggpack_look(&r,32)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpack_writeclear(&o);
+  fprintf(stderr,"ok.\n");
+
+  /********** lazy, cut-n-paste retest with MSb packing ***********/
+
+  /* Test read/write together */
+  /* Later we test against pregenerated bitstreams */
+  oggpackB_writeinit(&o);
+
+  fprintf(stderr,"\nSmall preclipped packing (MSb): ");
+  cliptestB(testbuffer1,test1size,0,oneB,onesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nNull bit call (MSb): ");
+  cliptestB(testbuffer3,test3size,0,twoB,twosize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge preclipped packing (MSb): ");
+  cliptestB(testbuffer2,test2size,0,threeB,threesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\n32 bit preclipped packing (MSb): ");
+  oggpackB_reset(&o);
+  for(i=0;i<test2size;i++)
+    oggpackB_write(&o,large[i],32);
+  buffer=oggpackB_get_buffer(&o);
+  bytes=oggpackB_bytes(&o);
+  oggpackB_readinit(&r,buffer,bytes);
+  for(i=0;i<test2size;i++){
+    if(oggpackB_look(&r,32)==-1)report("out of data. failed!");
+    if(oggpackB_look(&r,32)!=large[i]){
+      fprintf(stderr,"%ld != %ld (%lx!=%lx):",oggpackB_look(&r,32),large[i],
+              oggpackB_look(&r,32),large[i]);
+      report("read incorrect value!\n");
+    }
+    oggpackB_adv(&r,32);
+  }
+  if(oggpackB_bytes(&r)!=bytes)report("leftover bytes after read!\n");
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSmall unclipped packing (MSb): ");
+  cliptestB(testbuffer1,test1size,7,fourB,foursize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nLarge unclipped packing (MSb): ");
+  cliptestB(testbuffer2,test2size,17,fiveB,fivesize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nSingle bit unclipped packing (MSb): ");
+  cliptestB(testbuffer3,test3size,1,sixB,sixsize);
+  fprintf(stderr,"ok.");
+
+  fprintf(stderr,"\nTesting read past end (MSb): ");
+  oggpackB_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  for(i=0;i<64;i++){
+    if(oggpackB_read(&r,1)!=0){
+      fprintf(stderr,"failed; got -1 prematurely.\n");
+      exit(1);
+    }
+  }
+  if(oggpackB_look(&r,1)!=-1 ||
+     oggpackB_read(&r,1)!=-1){
+      fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpackB_readinit(&r,(unsigned char *)"\0\0\0\0\0\0\0\0",8);
+  if(oggpackB_read(&r,30)!=0 || oggpackB_read(&r,16)!=0){
+      fprintf(stderr,"failed 2; got -1 prematurely.\n");
+      exit(1);
+  }
+
+  if(oggpackB_look(&r,18)!=0 ||
+     oggpackB_look(&r,18)!=0){
+    fprintf(stderr,"failed 3; got -1 prematurely.\n");
+      exit(1);
+  }
+  if(oggpackB_look(&r,19)!=-1 ||
+     oggpackB_look(&r,19)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  if(oggpackB_look(&r,32)!=-1 ||
+     oggpackB_look(&r,32)!=-1){
+    fprintf(stderr,"failed; read past end without -1.\n");
+      exit(1);
+  }
+  oggpackB_writeclear(&o);
+  fprintf(stderr,"ok.\n\n");
+
+
+  return(0);
+}
+#endif  /* _V_SELFTEST */
+
+#undef BUFFER_INCREMENT
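
To make the LSb-first vs MSb-first distinction described at the top of bitwise.c
concrete, the sketch below (illustrative only, not part of the imported sources) packs
the same two 4-bit values with both writers: the oggpack writer fills a byte from the
least significant bit up, the oggpackB writer from the most significant bit down, so the
resulting bytes are 0x35 and 0x53 respectively.

/* Illustrative only: same writes, different bit order.
   Expected output: LSb: 35  MSb: 53 */
#include <stdio.h>
#include <ogg/ogg.h>

int main(void){
  oggpack_buffer lsb,msb;

  oggpack_writeinit(&lsb);
  oggpack_write(&lsb,5,4);    /* 0101 lands in bits 0..3 */
  oggpack_write(&lsb,3,4);    /* 0011 lands in bits 4..7 */

  oggpackB_writeinit(&msb);
  oggpackB_write(&msb,5,4);   /* 0101 lands in bits 7..4 */
  oggpackB_write(&msb,3,4);   /* 0011 lands in bits 3..0 */

  printf("LSb: %02x  MSb: %02x\n",
         oggpack_get_buffer(&lsb)[0],
         oggpackB_get_buffer(&msb)[0]);

  oggpack_writeclear(&lsb);
  oggpackB_writeclear(&msb);
  return 0;
}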

+ 2111 - 0
modules/theoraplayer/native/ogg/src/framing.c

@@ -0,0 +1,2111 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg CONTAINER SOURCE CODE.              *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2010             *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ function: code raw packets into framed OggSquish stream and
+           decode Ogg streams back into raw packets
+ last mod: $Id: framing.c 18758 2013-01-08 16:29:56Z tterribe $
+
+ note: The CRC code is directly derived from public domain code by
+ Ross Williams ([email protected]).  See docs/framing.html
+ for details.
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <ogg/ogg.h>
+
+/* A complete description of Ogg framing exists in docs/framing.html */
+
+int ogg_page_version(const ogg_page *og){
+  return((int)(og->header[4]));
+}
+
+int ogg_page_continued(const ogg_page *og){
+  return((int)(og->header[5]&0x01));
+}
+
+int ogg_page_bos(const ogg_page *og){
+  return((int)(og->header[5]&0x02));
+}
+
+int ogg_page_eos(const ogg_page *og){
+  return((int)(og->header[5]&0x04));
+}
+
+ogg_int64_t ogg_page_granulepos(const ogg_page *og){
+  unsigned char *page=og->header;
+  ogg_int64_t granulepos=page[13]&(0xff);
+  granulepos= (granulepos<<8)|(page[12]&0xff);
+  granulepos= (granulepos<<8)|(page[11]&0xff);
+  granulepos= (granulepos<<8)|(page[10]&0xff);
+  granulepos= (granulepos<<8)|(page[9]&0xff);
+  granulepos= (granulepos<<8)|(page[8]&0xff);
+  granulepos= (granulepos<<8)|(page[7]&0xff);
+  granulepos= (granulepos<<8)|(page[6]&0xff);
+  return(granulepos);
+}
+
+int ogg_page_serialno(const ogg_page *og){
+  return(og->header[14] |
+         (og->header[15]<<8) |
+         (og->header[16]<<16) |
+         (og->header[17]<<24));
+}
+
+long ogg_page_pageno(const ogg_page *og){
+  return(og->header[18] |
+         (og->header[19]<<8) |
+         (og->header[20]<<16) |
+         (og->header[21]<<24));
+}
+
+
+
+/* returns the number of packets that are completed on this page (if
+   the leading packet is begun on a previous page, but ends on this
+   page, it's counted) */
+
+/* NOTE:
+   If a page consists of a packet begun on a previous page, and a new
+   packet begun (but not completed) on this page, the return will be:
+     ogg_page_packets(page)   ==1,
+     ogg_page_continued(page) !=0
+
+   If a page happens to be a single packet that was begun on a
+   previous page, and spans to the next page (in the case of a three or
+   more page packet), the return will be:
+     ogg_page_packets(page)   ==0,
+     ogg_page_continued(page) !=0
+*/
+
+int ogg_page_packets(const ogg_page *og){
+  int i,n=og->header[26],count=0;
+  for(i=0;i<n;i++)
+    if(og->header[27+i]<255)count++;
+  return(count);
+}
+
+
+#if 0
+/* helper to initialize lookup for direct-table CRC (illustrative; we
+   use the static init below) */
+
+static ogg_uint32_t _ogg_crc_entry(unsigned long index){
+  int           i;
+  unsigned long r;
+
+  r = index << 24;
+  for (i=0; i<8; i++)
+    if (r & 0x80000000UL)
+      r = (r << 1) ^ 0x04c11db7; /* The same as the ethernet generator
+                                    polynomial, although we use an
+                                    unreflected alg and an init/final
+                                    of 0, not 0xffffffff */
+    else
+       r<<=1;
+ return (r & 0xffffffffUL);
+}
+#endif
+
+static const ogg_uint32_t crc_lookup[256]={
+  0x00000000,0x04c11db7,0x09823b6e,0x0d4326d9,
+  0x130476dc,0x17c56b6b,0x1a864db2,0x1e475005,
+  0x2608edb8,0x22c9f00f,0x2f8ad6d6,0x2b4bcb61,
+  0x350c9b64,0x31cd86d3,0x3c8ea00a,0x384fbdbd,
+  0x4c11db70,0x48d0c6c7,0x4593e01e,0x4152fda9,
+  0x5f15adac,0x5bd4b01b,0x569796c2,0x52568b75,
+  0x6a1936c8,0x6ed82b7f,0x639b0da6,0x675a1011,
+  0x791d4014,0x7ddc5da3,0x709f7b7a,0x745e66cd,
+  0x9823b6e0,0x9ce2ab57,0x91a18d8e,0x95609039,
+  0x8b27c03c,0x8fe6dd8b,0x82a5fb52,0x8664e6e5,
+  0xbe2b5b58,0xbaea46ef,0xb7a96036,0xb3687d81,
+  0xad2f2d84,0xa9ee3033,0xa4ad16ea,0xa06c0b5d,
+  0xd4326d90,0xd0f37027,0xddb056fe,0xd9714b49,
+  0xc7361b4c,0xc3f706fb,0xceb42022,0xca753d95,
+  0xf23a8028,0xf6fb9d9f,0xfbb8bb46,0xff79a6f1,
+  0xe13ef6f4,0xe5ffeb43,0xe8bccd9a,0xec7dd02d,
+  0x34867077,0x30476dc0,0x3d044b19,0x39c556ae,
+  0x278206ab,0x23431b1c,0x2e003dc5,0x2ac12072,
+  0x128e9dcf,0x164f8078,0x1b0ca6a1,0x1fcdbb16,
+  0x018aeb13,0x054bf6a4,0x0808d07d,0x0cc9cdca,
+  0x7897ab07,0x7c56b6b0,0x71159069,0x75d48dde,
+  0x6b93dddb,0x6f52c06c,0x6211e6b5,0x66d0fb02,
+  0x5e9f46bf,0x5a5e5b08,0x571d7dd1,0x53dc6066,
+  0x4d9b3063,0x495a2dd4,0x44190b0d,0x40d816ba,
+  0xaca5c697,0xa864db20,0xa527fdf9,0xa1e6e04e,
+  0xbfa1b04b,0xbb60adfc,0xb6238b25,0xb2e29692,
+  0x8aad2b2f,0x8e6c3698,0x832f1041,0x87ee0df6,
+  0x99a95df3,0x9d684044,0x902b669d,0x94ea7b2a,
+  0xe0b41de7,0xe4750050,0xe9362689,0xedf73b3e,
+  0xf3b06b3b,0xf771768c,0xfa325055,0xfef34de2,
+  0xc6bcf05f,0xc27dede8,0xcf3ecb31,0xcbffd686,
+  0xd5b88683,0xd1799b34,0xdc3abded,0xd8fba05a,
+  0x690ce0ee,0x6dcdfd59,0x608edb80,0x644fc637,
+  0x7a089632,0x7ec98b85,0x738aad5c,0x774bb0eb,
+  0x4f040d56,0x4bc510e1,0x46863638,0x42472b8f,
+  0x5c007b8a,0x58c1663d,0x558240e4,0x51435d53,
+  0x251d3b9e,0x21dc2629,0x2c9f00f0,0x285e1d47,
+  0x36194d42,0x32d850f5,0x3f9b762c,0x3b5a6b9b,
+  0x0315d626,0x07d4cb91,0x0a97ed48,0x0e56f0ff,
+  0x1011a0fa,0x14d0bd4d,0x19939b94,0x1d528623,
+  0xf12f560e,0xf5ee4bb9,0xf8ad6d60,0xfc6c70d7,
+  0xe22b20d2,0xe6ea3d65,0xeba91bbc,0xef68060b,
+  0xd727bbb6,0xd3e6a601,0xdea580d8,0xda649d6f,
+  0xc423cd6a,0xc0e2d0dd,0xcda1f604,0xc960ebb3,
+  0xbd3e8d7e,0xb9ff90c9,0xb4bcb610,0xb07daba7,
+  0xae3afba2,0xaafbe615,0xa7b8c0cc,0xa379dd7b,
+  0x9b3660c6,0x9ff77d71,0x92b45ba8,0x9675461f,
+  0x8832161a,0x8cf30bad,0x81b02d74,0x857130c3,
+  0x5d8a9099,0x594b8d2e,0x5408abf7,0x50c9b640,
+  0x4e8ee645,0x4a4ffbf2,0x470cdd2b,0x43cdc09c,
+  0x7b827d21,0x7f436096,0x7200464f,0x76c15bf8,
+  0x68860bfd,0x6c47164a,0x61043093,0x65c52d24,
+  0x119b4be9,0x155a565e,0x18197087,0x1cd86d30,
+  0x029f3d35,0x065e2082,0x0b1d065b,0x0fdc1bec,
+  0x3793a651,0x3352bbe6,0x3e119d3f,0x3ad08088,
+  0x2497d08d,0x2056cd3a,0x2d15ebe3,0x29d4f654,
+  0xc5a92679,0xc1683bce,0xcc2b1d17,0xc8ea00a0,
+  0xd6ad50a5,0xd26c4d12,0xdf2f6bcb,0xdbee767c,
+  0xe3a1cbc1,0xe760d676,0xea23f0af,0xeee2ed18,
+  0xf0a5bd1d,0xf464a0aa,0xf9278673,0xfde69bc4,
+  0x89b8fd09,0x8d79e0be,0x803ac667,0x84fbdbd0,
+  0x9abc8bd5,0x9e7d9662,0x933eb0bb,0x97ffad0c,
+  0xafb010b1,0xab710d06,0xa6322bdf,0xa2f33668,
+  0xbcb4666d,0xb8757bda,0xb5365d03,0xb1f740b4};
+
+/* init the encode/decode logical stream state */
+
+int ogg_stream_init(ogg_stream_state *os,int serialno){
+  if(os){
+    memset(os,0,sizeof(*os));
+    os->body_storage=16*1024;
+    os->lacing_storage=1024;
+
+    os->body_data=_ogg_malloc(os->body_storage*sizeof(*os->body_data));
+    os->lacing_vals=_ogg_malloc(os->lacing_storage*sizeof(*os->lacing_vals));
+    os->granule_vals=_ogg_malloc(os->lacing_storage*sizeof(*os->granule_vals));
+
+    if(!os->body_data || !os->lacing_vals || !os->granule_vals){
+      ogg_stream_clear(os);
+      return -1;
+    }
+
+    os->serialno=serialno;
+
+    return(0);
+  }
+  return(-1);
+}
+
+/* async/delayed error detection for the ogg_stream_state */
+int ogg_stream_check(ogg_stream_state *os){
+  if(!os || !os->body_data) return -1;
+  return 0;
+}
+
+/* _clear does not free os, only the non-flat storage within */
+int ogg_stream_clear(ogg_stream_state *os){
+  if(os){
+    if(os->body_data)_ogg_free(os->body_data);
+    if(os->lacing_vals)_ogg_free(os->lacing_vals);
+    if(os->granule_vals)_ogg_free(os->granule_vals);
+
+    memset(os,0,sizeof(*os));
+  }
+  return(0);
+}
+
+int ogg_stream_destroy(ogg_stream_state *os){
+  if(os){
+    ogg_stream_clear(os);
+    _ogg_free(os);
+  }
+  return(0);
+}
+
+/* Helpers for ogg_stream_encode; this keeps the structure and
+   what's happening fairly clear */
+
+static int _os_body_expand(ogg_stream_state *os,long needed){
+  if(os->body_storage-needed<=os->body_fill){
+    long body_storage;
+    void *ret;
+    if(os->body_storage>LONG_MAX-needed){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    body_storage=os->body_storage+needed;
+    if(body_storage<LONG_MAX-1024)body_storage+=1024;
+    ret=_ogg_realloc(os->body_data,body_storage*sizeof(*os->body_data));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->body_storage=body_storage;
+    os->body_data=ret;
+  }
+  return 0;
+}
+
+static int _os_lacing_expand(ogg_stream_state *os,long needed){
+  if(os->lacing_storage-needed<=os->lacing_fill){
+    long lacing_storage;
+    void *ret;
+    if(os->lacing_storage>LONG_MAX-needed){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    lacing_storage=os->lacing_storage+needed;
+    if(lacing_storage<LONG_MAX-32)lacing_storage+=32;
+    ret=_ogg_realloc(os->lacing_vals,lacing_storage*sizeof(*os->lacing_vals));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->lacing_vals=ret;
+    ret=_ogg_realloc(os->granule_vals,lacing_storage*
+                     sizeof(*os->granule_vals));
+    if(!ret){
+      ogg_stream_clear(os);
+      return -1;
+    }
+    os->granule_vals=ret;
+    os->lacing_storage=lacing_storage;
+  }
+  return 0;
+}
+
+/* checksum the page */
+/* Direct table CRC; note that this will be faster in the future if we
+   perform the checksum simultaneously with other copies */
+
+void ogg_page_checksum_set(ogg_page *og){
+  if(og){
+    ogg_uint32_t crc_reg=0;
+    int i;
+
+    /* safety; needed for API behavior, but not framing code */
+    og->header[22]=0;
+    og->header[23]=0;
+    og->header[24]=0;
+    og->header[25]=0;
+
+    for(i=0;i<og->header_len;i++)
+      crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->header[i]];
+    for(i=0;i<og->body_len;i++)
+      crc_reg=(crc_reg<<8)^crc_lookup[((crc_reg >> 24)&0xff)^og->body[i]];
+
+    og->header[22]=(unsigned char)(crc_reg&0xff);
+    og->header[23]=(unsigned char)((crc_reg>>8)&0xff);
+    og->header[24]=(unsigned char)((crc_reg>>16)&0xff);
+    og->header[25]=(unsigned char)((crc_reg>>24)&0xff);
+  }
+}
+
+/* submit data to the internal buffer of the framing engine */
+int ogg_stream_iovecin(ogg_stream_state *os, ogg_iovec_t *iov, int count,
+                       long e_o_s, ogg_int64_t granulepos){
+
+  long bytes = 0, lacing_vals;
+  int i;
+
+  if(ogg_stream_check(os)) return -1;
+  if(!iov) return 0;
+
+  for (i = 0; i < count; ++i){
+    if(iov[i].iov_len>LONG_MAX) return -1;
+    if(bytes>LONG_MAX-(long)iov[i].iov_len) return -1;
+    bytes += (long)iov[i].iov_len;
+  }
+  lacing_vals=bytes/255+1;
+
+  if(os->body_returned){
+    /* advance packet data according to the body_returned pointer. We
+       had to keep it around to return a pointer into the buffer last
+       call */
+
+    os->body_fill-=os->body_returned;
+    if(os->body_fill)
+      memmove(os->body_data,os->body_data+os->body_returned,
+              os->body_fill);
+    os->body_returned=0;
+  }
+
+  /* make sure we have the buffer storage */
+  if(_os_body_expand(os,bytes) || _os_lacing_expand(os,lacing_vals))
+    return -1;
+
+  /* Copy in the submitted packet.  Yes, the copy is a waste; this is
+     the liability of overly clean abstraction for the time being.  It
+     will actually be fairly easy to eliminate the extra copy in the
+     future */
+
+  for (i = 0; i < count; ++i) {
+    memcpy(os->body_data+os->body_fill, iov[i].iov_base, iov[i].iov_len);
+    os->body_fill += (int)iov[i].iov_len;
+  }
+
+  /* Store lacing vals for this packet */
+  for(i=0;i<lacing_vals-1;i++){
+    os->lacing_vals[os->lacing_fill+i]=255;
+    os->granule_vals[os->lacing_fill+i]=os->granulepos;
+  }
+  os->lacing_vals[os->lacing_fill+i]=bytes%255;
+  os->granulepos=os->granule_vals[os->lacing_fill+i]=granulepos;
+
+  /* flag the first segment as the beginning of the packet */
+  os->lacing_vals[os->lacing_fill]|= 0x100;
+
+  os->lacing_fill+=lacing_vals;
+
+  /* for the sake of completeness */
+  os->packetno++;
+
+  if(e_o_s)os->e_o_s=1;
+
+  return(0);
+}
+
+int ogg_stream_packetin(ogg_stream_state *os,ogg_packet *op){
+  ogg_iovec_t iov;
+  iov.iov_base = op->packet;
+  iov.iov_len = op->bytes;
+  return ogg_stream_iovecin(os, &iov, 1, op->e_o_s, op->granulepos);
+}
+
+/* Conditionally flush a page; force==0 will only flush nominal-size
+   pages, force==1 forces us to flush a page regardless of page size
+   so long as there's any data available at all. */
+static int ogg_stream_flush_i(ogg_stream_state *os,ogg_page *og, int force, int nfill){
+  int i;
+  int vals=0;
+  int maxvals=(os->lacing_fill>255?255:os->lacing_fill);
+  int bytes=0;
+  long acc=0;
+  ogg_int64_t granule_pos=-1;
+
+  if(ogg_stream_check(os)) return(0);
+  if(maxvals==0) return(0);
+
+  /* construct a page */
+  /* decide how many segments to include */
+
+  /* If this is the initial header case, the first page must only include
+     the initial header packet */
+  if(os->b_o_s==0){  /* 'initial header page' case */
+    granule_pos=0;
+    for(vals=0;vals<maxvals;vals++){
+      if((os->lacing_vals[vals]&0x0ff)<255){
+        vals++;
+        break;
+      }
+    }
+  }else{
+
+    /* The extra packets_done, packet_just_done logic here attempts to do two things:
+       1) Don't unnecessarily span pages.
+       2) Unless necessary, don't flush pages if there are fewer than four packets on
+          them; this expands page size to reduce unnecessary overhead if incoming packets
+          are large.
+       These are not necessary behaviors, just 'always better than naive flushing'
+       without requiring an application to explicitly request a specific optimized
+       behavior. We'll want an explicit behavior setup pathway eventually as well. */
+
+    int packets_done=0;
+    int packet_just_done=0;
+    for(vals=0;vals<maxvals;vals++){
+      if(acc>nfill && packet_just_done>=4){
+        force=1;
+        break;
+      }
+      acc+=os->lacing_vals[vals]&0x0ff;
+      if((os->lacing_vals[vals]&0xff)<255){
+        granule_pos=os->granule_vals[vals];
+        packet_just_done=++packets_done;
+      }else
+        packet_just_done=0;
+    }
+    if(vals==255)force=1;
+  }
+
+  if(!force) return(0);
+
+  /* construct the header in temp storage */
+  memcpy(os->header,"OggS",4);
+
+  /* stream structure version */
+  os->header[4]=0x00;
+
+  /* continued packet flag? */
+  os->header[5]=0x00;
+  if((os->lacing_vals[0]&0x100)==0)os->header[5]|=0x01;
+  /* first page flag? */
+  if(os->b_o_s==0)os->header[5]|=0x02;
+  /* last page flag? */
+  if(os->e_o_s && os->lacing_fill==vals)os->header[5]|=0x04;
+  os->b_o_s=1;
+
+  /* 64 bits of PCM position */
+  for(i=6;i<14;i++){
+    os->header[i]=(unsigned char)(granule_pos&0xff);
+    granule_pos>>=8;
+  }
+
+  /* 32 bits of stream serial number */
+  {
+    long serialno=os->serialno;
+    for(i=14;i<18;i++){
+      os->header[i]=(unsigned char)(serialno&0xff);
+      serialno>>=8;
+    }
+  }
+
+  /* 32 bits of page counter (we have both counter and page header
+     because this val can roll over) */
+  if(os->pageno==-1)os->pageno=0; /* because someone called
+                                     stream_reset; this would be a
+                                     strange thing to do in an
+                                     encode stream, but it has
+                                     plausible uses */
+  {
+    long pageno=os->pageno++;
+    for(i=18;i<22;i++){
+      os->header[i]=(unsigned char)(pageno&0xff);
+      pageno>>=8;
+    }
+  }
+
+  /* zero for computation; filled in later */
+  os->header[22]=0;
+  os->header[23]=0;
+  os->header[24]=0;
+  os->header[25]=0;
+
+  /* segment table */
+  os->header[26]=(unsigned char)(vals&0xff);
+  for(i=0;i<vals;i++)
+    bytes+=os->header[i+27]=(unsigned char)(os->lacing_vals[i]&0xff);
+
+  /* set pointers in the ogg_page struct */
+  og->header=os->header;
+  og->header_len=os->header_fill=vals+27;
+  og->body=os->body_data+os->body_returned;
+  og->body_len=bytes;
+
+  /* advance the lacing data and set the body_returned pointer */
+
+  os->lacing_fill-=vals;
+  memmove(os->lacing_vals,os->lacing_vals+vals,os->lacing_fill*sizeof(*os->lacing_vals));
+  memmove(os->granule_vals,os->granule_vals+vals,os->lacing_fill*sizeof(*os->granule_vals));
+  os->body_returned+=bytes;
+
+  /* calculate the checksum */
+
+  ogg_page_checksum_set(og);
+
+  /* done */
+  return(1);
+}
+
+/* This will flush remaining packets into a page (returning nonzero),
+   even if there is not enough data to trigger a flush normally
+   (undersized page). If there are no packets or partial packets to
+   flush, ogg_stream_flush returns 0.  Note that ogg_stream_flush will
+   try to flush a normal sized page like ogg_stream_pageout; a call to
+   ogg_stream_flush does not guarantee that all packets have flushed.
+   Only a return value of 0 from ogg_stream_flush indicates all packet
+   data is flushed into pages.
+
+   since ogg_stream_flush will flush the last page in a stream even if
+   it's undersized, you almost certainly want to use ogg_stream_pageout
+   (and *not* ogg_stream_flush) unless you specifically need to flush
+   a page regardless of size in the middle of a stream. */
+
+int ogg_stream_flush(ogg_stream_state *os,ogg_page *og){
+  return ogg_stream_flush_i(os,og,1,4096);
+}
+
+/* Like the above, but an argument is provided to adjust the nominal
+   page size for applications which are smart enough to provide their
+   own delay based flushing */
+
+int ogg_stream_flush_fill(ogg_stream_state *os,ogg_page *og, int nfill){
+  return ogg_stream_flush_i(os,og,1,nfill);
+}
+
+/* This constructs pages from buffered packet segments.  The pointers
+returned are to static buffers; do not free. The returned buffers are
+good only until the next call (using the same ogg_stream_state) */
+
+int ogg_stream_pageout(ogg_stream_state *os, ogg_page *og){
+  int force=0;
+  if(ogg_stream_check(os)) return 0;
+
+  if((os->e_o_s&&os->lacing_fill) ||         /* 'we're done, now flush' case */
+     (os->lacing_fill&&!os->b_o_s))           /* 'initial header page' case */
+    force=1;
+
+  return(ogg_stream_flush_i(os,og,force,4096));
+}
+
+/* Like the above, but an argument is provided to adjust the nominal
+page size for applications which are smart enough to provide their
+own delay based flushing */
+
+int ogg_stream_pageout_fill(ogg_stream_state *os, ogg_page *og, int nfill){
+  int force=0;
+  if(ogg_stream_check(os)) return 0;
+
+  if((os->e_o_s&&os->lacing_fill) ||         /* 'we're done, now flush' case */
+     (os->lacing_fill&&!os->b_o_s))           /* 'initial header page' case */
+    force=1;
+
+  return(ogg_stream_flush_i(os,og,force,nfill));
+}
+
+int ogg_stream_eos(ogg_stream_state *os){
+  if(ogg_stream_check(os)) return 1;
+  return os->e_o_s;
+}
+
+/* DECODING PRIMITIVES: packet streaming layer **********************/
+
+/* This has two layers to place more of the multi-serialno and paging
+   control in the application's hands.  First, we expose a data buffer
+   using ogg_sync_buffer().  The app either copies into the
+   buffer, or passes it directly to read(), etc.  We then call
+   ogg_sync_wrote() to tell how many bytes we just added.
+
+   Pages are returned (pointers into the buffer in ogg_sync_state)
+   by ogg_sync_pageout().  The page is then submitted to
+   ogg_stream_pagein() along with the appropriate
+   ogg_stream_state* (ie, matching serialno).  We then get raw
+   packets out calling ogg_stream_packetout() with a
+   ogg_stream_state. */
+
+/* initialize the struct to a known state */
+int ogg_sync_init(ogg_sync_state *oy){
+  if(oy){
+    oy->storage = -1; /* used as a readiness flag */
+    memset(oy,0,sizeof(*oy));
+  }
+  return(0);
+}
+
+/* clear non-flat storage within */
+int ogg_sync_clear(ogg_sync_state *oy){
+  if(oy){
+    if(oy->data)_ogg_free(oy->data);
+    memset(oy,0,sizeof(*oy));
+  }
+  return(0);
+}
+
+int ogg_sync_destroy(ogg_sync_state *oy){
+  if(oy){
+    ogg_sync_clear(oy);
+    _ogg_free(oy);
+  }
+  return(0);
+}
+
+int ogg_sync_check(ogg_sync_state *oy){
+  if(oy->storage<0) return -1;
+  return 0;
+}
+
+char *ogg_sync_buffer(ogg_sync_state *oy, long size){
+  if(ogg_sync_check(oy)) return NULL;
+
+  /* first, clear out any space that has been previously returned */
+  if(oy->returned){
+    oy->fill-=oy->returned;
+    if(oy->fill>0)
+      memmove(oy->data,oy->data+oy->returned,oy->fill);
+    oy->returned=0;
+  }
+
+  if(size>oy->storage-oy->fill){
+    /* We need to extend the internal buffer */
+    long newsize=size+oy->fill+4096; /* an extra page to be nice */
+    void *ret;
+
+    if(oy->data)
+      ret=_ogg_realloc(oy->data,newsize);
+    else
+      ret=_ogg_malloc(newsize);
+    if(!ret){
+      ogg_sync_clear(oy);
+      return NULL;
+    }
+    oy->data=ret;
+    oy->storage=newsize;
+  }
+
+  /* expose a segment at least as large as requested at the fill mark */
+  return((char *)oy->data+oy->fill);
+}
+
+int ogg_sync_wrote(ogg_sync_state *oy, long bytes){
+  if(ogg_sync_check(oy))return -1;
+  if(oy->fill+bytes>oy->storage)return -1;
+  oy->fill+=bytes;
+  return(0);
+}
+
+/* sync the stream.  This is meant to be useful for finding page
+   boundaries.
+
+   return values for this:
+  -n) skipped n bytes
+   0) page not ready; more data (no bytes skipped)
+   n) page synced at current location; page length n bytes
+
+*/
+
+long ogg_sync_pageseek(ogg_sync_state *oy,ogg_page *og){
+  unsigned char *page=oy->data+oy->returned;
+  unsigned char *next;
+  long bytes=oy->fill-oy->returned;
+
+  if(ogg_sync_check(oy))return 0;
+
+  if(oy->headerbytes==0){
+    int headerbytes,i;
+    if(bytes<27)return(0); /* not enough for a header */
+
+    /* verify capture pattern */
+    if(memcmp(page,"OggS",4))goto sync_fail;
+
+    headerbytes=page[26]+27;
+    if(bytes<headerbytes)return(0); /* not enough for header + seg table */
+
+    /* count up body length in the segment table */
+
+    for(i=0;i<page[26];i++)
+      oy->bodybytes+=page[27+i];
+    oy->headerbytes=headerbytes;
+  }
+
+  if(oy->bodybytes+oy->headerbytes>bytes)return(0);
+
+  /* The whole test page is buffered.  Verify the checksum */
+  {
+    /* Grab the checksum bytes, set the header field to zero */
+    char chksum[4];
+    ogg_page log;
+
+    memcpy(chksum,page+22,4);
+    memset(page+22,0,4);
+
+    /* set up a temp page struct and recompute the checksum */
+    log.header=page;
+    log.header_len=oy->headerbytes;
+    log.body=page+oy->headerbytes;
+    log.body_len=oy->bodybytes;
+    ogg_page_checksum_set(&log);
+
+    /* Compare */
+    if(memcmp(chksum,page+22,4)){
+      /* D'oh.  Mismatch! Corrupt page (or miscapture and not a page
+         at all) */
+      /* replace the computed checksum with the one actually read in */
+      memcpy(page+22,chksum,4);
+
+      /* Bad checksum. Lose sync */
+      goto sync_fail;
+    }
+  }
+
+  /* yes, have a whole page all ready to go */
+  {
+    unsigned char *page=oy->data+oy->returned;
+    long bytes;
+
+    if(og){
+      og->header=page;
+      og->header_len=oy->headerbytes;
+      og->body=page+oy->headerbytes;
+      og->body_len=oy->bodybytes;
+    }
+
+    oy->unsynced=0;
+    oy->returned+=(bytes=oy->headerbytes+oy->bodybytes);
+    oy->headerbytes=0;
+    oy->bodybytes=0;
+    return(bytes);
+  }
+
+ sync_fail:
+
+  oy->headerbytes=0;
+  oy->bodybytes=0;
+
+  /* search for possible capture */
+  next=memchr(page+1,'O',bytes-1);
+  if(!next)
+    next=oy->data+oy->fill;
+
+  oy->returned=(int)(next-oy->data);
+  return((long)-(next-page));
+}
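+
+/* Editorial usage sketch (not part of libogg): applying the return value
+   convention documented above to locate the byte offset of the next page
+   while seeking.  The stdio FILE* source and the never-defined guard macro
+   are assumptions of this sketch, not part of the API. */
+#ifdef OGG_FRAMING_USAGE_EXAMPLE
+#include <stdio.h>
+static ogg_int64_t example_find_next_page(ogg_sync_state *oy,FILE *in,
+                                          ogg_int64_t offset,ogg_page *og){
+  for(;;){
+    long ret=ogg_sync_pageseek(oy,og);
+    if(ret>0)return offset;            /* page begins at 'offset', ret bytes */
+    if(ret<0){offset-=ret;continue;}   /* skipped -ret bytes before capture  */
+    /* ret==0: need more data */
+    {
+      char *buf=ogg_sync_buffer(oy,4096);
+      long  n=buf?(long)fread(buf,1,4096,in):0;
+      if(n<=0)return -1;               /* EOF or error: no further pages     */
+      ogg_sync_wrote(oy,n);
+    }
+  }
+}
+#endif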
+
+/* sync the stream and get a page.  Keep trying until we find a page.
+   Suppress 'sync errors' after reporting the first.
+
+   return values:
+   -1) recapture (hole in data)
+    0) need more data
+    1) page returned
+
+   Returns pointers into buffered data; invalidated by next call to
+   _stream, _clear, _init, or _buffer */
+
+int ogg_sync_pageout(ogg_sync_state *oy, ogg_page *og){
+
+  if(ogg_sync_check(oy))return 0;
+
+  /* all we need to do is verify a page at the head of the stream
+     buffer.  If it doesn't verify, we look for the next potential
+     frame */
+
+  for(;;){
+    long ret=ogg_sync_pageseek(oy,og);
+    if(ret>0){
+      /* have a page */
+      return(1);
+    }
+    if(ret==0){
+      /* need more data */
+      return(0);
+    }
+
+    /* head did not start a synced page... skipped some bytes */
+    if(!oy->unsynced){
+      oy->unsynced=1;
+      return(-1);
+    }
+
+    /* loop. keep looking */
+
+  }
+}
+
+/* add the incoming page to the stream state; we decompose the page
+   into packet segments here as well. */
+
+int ogg_stream_pagein(ogg_stream_state *os, ogg_page *og){
+  unsigned char *header=og->header;
+  unsigned char *body=og->body;
+  long           bodysize=og->body_len;
+  int            segptr=0;
+
+  int version=ogg_page_version(og);
+  int continued=ogg_page_continued(og);
+  int bos=ogg_page_bos(og);
+  int eos=ogg_page_eos(og);
+  ogg_int64_t granulepos=ogg_page_granulepos(og);
+  int serialno=ogg_page_serialno(og);
+  long pageno=ogg_page_pageno(og);
+  int segments=header[26];
+
+  if(ogg_stream_check(os)) return -1;
+
+  /* clean up 'returned data' */
+  {
+    long lr=os->lacing_returned;
+    long br=os->body_returned;
+
+    /* body data */
+    if(br){
+      os->body_fill-=br;
+      if(os->body_fill)
+        memmove(os->body_data,os->body_data+br,os->body_fill);
+      os->body_returned=0;
+    }
+
+    if(lr){
+      /* segment table */
+      if(os->lacing_fill-lr){
+        memmove(os->lacing_vals,os->lacing_vals+lr,
+                (os->lacing_fill-lr)*sizeof(*os->lacing_vals));
+        memmove(os->granule_vals,os->granule_vals+lr,
+                (os->lacing_fill-lr)*sizeof(*os->granule_vals));
+      }
+      os->lacing_fill-=lr;
+      os->lacing_packet-=lr;
+      os->lacing_returned=0;
+    }
+  }
+
+  /* check the serial number */
+  if(serialno!=os->serialno)return(-1);
+  if(version>0)return(-1);
+
+  if(_os_lacing_expand(os,segments+1)) return -1;
+
+  /* are we in sequence? */
+  if(pageno!=os->pageno){
+    int i;
+
+    /* unroll previous partial packet (if any) */
+    for(i=os->lacing_packet;i<os->lacing_fill;i++)
+      os->body_fill-=os->lacing_vals[i]&0xff;
+    os->lacing_fill=os->lacing_packet;
+
+    /* make a note of dropped data in segment table */
+    if(os->pageno!=-1){
+      os->lacing_vals[os->lacing_fill++]=0x400;
+      os->lacing_packet++;
+    }
+  }
+
+  /* are we a 'continued packet' page?  If so, we may need to skip
+     some segments */
+  if(continued){
+    if(os->lacing_fill<1 ||
+       os->lacing_vals[os->lacing_fill-1]==0x400){
+      bos=0;
+      for(;segptr<segments;segptr++){
+        int val=header[27+segptr];
+        body+=val;
+        bodysize-=val;
+        if(val<255){
+          segptr++;
+          break;
+        }
+      }
+    }
+  }
+
+  if(bodysize){
+    if(_os_body_expand(os,bodysize)) return -1;
+    memcpy(os->body_data+os->body_fill,body,bodysize);
+    os->body_fill+=bodysize;
+  }
+
+  {
+    int saved=-1;
+    while(segptr<segments){
+      int val=header[27+segptr];
+      os->lacing_vals[os->lacing_fill]=val;
+      os->granule_vals[os->lacing_fill]=-1;
+
+      if(bos){
+        os->lacing_vals[os->lacing_fill]|=0x100;
+        bos=0;
+      }
+
+      if(val<255)saved=os->lacing_fill;
+
+      os->lacing_fill++;
+      segptr++;
+
+      if(val<255)os->lacing_packet=os->lacing_fill;
+    }
+
+    /* set the granulepos on the last granuleval of the last full packet */
+    if(saved!=-1){
+      os->granule_vals[saved]=granulepos;
+    }
+
+  }
+
+  if(eos){
+    os->e_o_s=1;
+    if(os->lacing_fill>0)
+      os->lacing_vals[os->lacing_fill-1]|=0x200;
+  }
+
+  os->pageno=pageno+1;
+
+  return(0);
+}
+
+/* clear things to an initial state.  Good to call, eg, before seeking */
+int ogg_sync_reset(ogg_sync_state *oy){
+  if(ogg_sync_check(oy))return -1;
+
+  oy->fill=0;
+  oy->returned=0;
+  oy->unsynced=0;
+  oy->headerbytes=0;
+  oy->bodybytes=0;
+  return(0);
+}
+
+int ogg_stream_reset(ogg_stream_state *os){
+  if(ogg_stream_check(os)) return -1;
+
+  os->body_fill=0;
+  os->body_returned=0;
+
+  os->lacing_fill=0;
+  os->lacing_packet=0;
+  os->lacing_returned=0;
+
+  os->header_fill=0;
+
+  os->e_o_s=0;
+  os->b_o_s=0;
+  os->pageno=-1;
+  os->packetno=0;
+  os->granulepos=0;
+
+  return(0);
+}
+
+int ogg_stream_reset_serialno(ogg_stream_state *os,int serialno){
+  if(ogg_stream_check(os)) return -1;
+  ogg_stream_reset(os);
+  os->serialno=serialno;
+  return(0);
+}
+
+static int _packetout(ogg_stream_state *os,ogg_packet *op,int adv){
+
+  /* The last part of decode. We have the stream broken into packet
+     segments.  Now we need to group them into packets (or return the
+     out of sync markers) */
+
+  int ptr=os->lacing_returned;
+
+  if(os->lacing_packet<=ptr)return(0);
+
+  if(os->lacing_vals[ptr]&0x400){
+    /* we need to tell the codec there's a gap; it might need to
+       handle previous packet dependencies. */
+    os->lacing_returned++;
+    os->packetno++;
+    return(-1);
+  }
+
+  if(!op && !adv)return(1); /* just using peek as an inexpensive way
+                               to ask if there's a whole packet
+                               waiting */
+
+  /* Gather the whole packet. We'll have no holes or a partial packet */
+  {
+    int size=os->lacing_vals[ptr]&0xff;
+    long bytes=size;
+    int eos=os->lacing_vals[ptr]&0x200; /* last packet of the stream? */
+    int bos=os->lacing_vals[ptr]&0x100; /* first packet of the stream? */
+
+    while(size==255){
+      int val=os->lacing_vals[++ptr];
+      size=val&0xff;
+      if(val&0x200)eos=0x200;
+      bytes+=size;
+    }
+
+    if(op){
+      op->e_o_s=eos;
+      op->b_o_s=bos;
+      op->packet=os->body_data+os->body_returned;
+      op->packetno=os->packetno;
+      op->granulepos=os->granule_vals[ptr];
+      op->bytes=bytes;
+    }
+
+    if(adv){
+      os->body_returned+=bytes;
+      os->lacing_returned=ptr+1;
+      os->packetno++;
+    }
+  }
+  return(1);
+}
+
+int ogg_stream_packetout(ogg_stream_state *os,ogg_packet *op){
+  if(ogg_stream_check(os)) return 0;
+  return _packetout(os,op,1);
+}
+
+int ogg_stream_packetpeek(ogg_stream_state *os,ogg_packet *op){
+  if(ogg_stream_check(os)) return 0;
+  return _packetout(os,op,0);
+}
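+
+/* Editorial usage sketch (not part of libogg): draining packets from a
+   stream after ogg_stream_pagein(), distinguishing the three return values
+   of ogg_stream_packetout().  decode_packet() is a hypothetical codec hook
+   and the guard macro is never defined. */
+#ifdef OGG_FRAMING_USAGE_EXAMPLE
+extern void decode_packet(ogg_packet *op); /* hypothetical consumer */
+static void example_drain_packets(ogg_stream_state *os){
+  ogg_packet op;
+  for(;;){
+    int ret=ogg_stream_packetout(os,&op);
+    if(ret==0)break;     /* no complete packet buffered; feed more pages in  */
+    if(ret<0)continue;   /* hole in the data (lost page); packet was skipped */
+    decode_packet(&op);  /* ret==1: 'op' points into 'os' until next call    */
+  }
+}
+#endif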
+
+void ogg_packet_clear(ogg_packet *op) {
+  _ogg_free(op->packet);
+  memset(op, 0, sizeof(*op));
+}
+
+#ifdef _V_SELFTEST
+#include <stdio.h>
+
+ogg_stream_state os_en, os_de;
+ogg_sync_state oy;
+
+void checkpacket(ogg_packet *op,long len, int no, long pos){
+  long j;
+  static int sequence=0;
+  static int lastno=0;
+
+  if(op->bytes!=len){
+    fprintf(stderr,"incorrect packet length (%ld != %ld)!\n",op->bytes,len);
+    exit(1);
+  }
+  if(op->granulepos!=pos){
+    fprintf(stderr,"incorrect packet granpos (%ld != %ld)!\n",(long)op->granulepos,pos);
+    exit(1);
+  }
+
+  /* packet number just follows sequence/gap; adjust the input number
+     for that */
+  if(no==0){
+    sequence=0;
+  }else{
+    sequence++;
+    if(no>lastno+1)
+      sequence++;
+  }
+  lastno=no;
+  if(op->packetno!=sequence){
+    fprintf(stderr,"incorrect packet sequence %ld != %d\n",
+            (long)(op->packetno),sequence);
+    exit(1);
+  }
+
+  /* Test data */
+  for(j=0;j<op->bytes;j++)
+    if(op->packet[j]!=((j+no)&0xff)){
+      fprintf(stderr,"body data mismatch (1) at pos %ld: %x!=%lx!\n\n",
+              j,op->packet[j],(j+no)&0xff);
+      exit(1);
+    }
+}
+
+void check_page(unsigned char *data,const int *header,ogg_page *og){
+  long j;
+  /* Test data */
+  for(j=0;j<og->body_len;j++)
+    if(og->body[j]!=data[j]){
+      fprintf(stderr,"body data mismatch (2) at pos %ld: %x!=%x!\n\n",
+              j,data[j],og->body[j]);
+      exit(1);
+    }
+
+  /* Test header */
+  for(j=0;j<og->header_len;j++){
+    if(og->header[j]!=header[j]){
+      fprintf(stderr,"header content mismatch at pos %ld:\n",j);
+      for(j=0;j<header[26]+27;j++)
+        fprintf(stderr," (%ld)%02x:%02x",j,header[j],og->header[j]);
+      fprintf(stderr,"\n");
+      exit(1);
+    }
+  }
+  if(og->header_len!=header[26]+27){
+    fprintf(stderr,"header length incorrect! (%ld!=%d)\n",
+            og->header_len,header[26]+27);
+    exit(1);
+  }
+}
+
+void print_header(ogg_page *og){
+  int j;
+  fprintf(stderr,"\nHEADER:\n");
+  fprintf(stderr,"  capture: %c %c %c %c  version: %d  flags: %x\n",
+          og->header[0],og->header[1],og->header[2],og->header[3],
+          (int)og->header[4],(int)og->header[5]);
+
+  fprintf(stderr,"  granulepos: %d  serialno: %d  pageno: %ld\n",
+          (og->header[9]<<24)|(og->header[8]<<16)|
+          (og->header[7]<<8)|og->header[6],
+          (og->header[17]<<24)|(og->header[16]<<16)|
+          (og->header[15]<<8)|og->header[14],
+          ((long)(og->header[21])<<24)|(og->header[20]<<16)|
+          (og->header[19]<<8)|og->header[18]);
+
+  fprintf(stderr,"  checksum: %02x:%02x:%02x:%02x\n  segments: %d (",
+          (int)og->header[22],(int)og->header[23],
+          (int)og->header[24],(int)og->header[25],
+          (int)og->header[26]);
+
+  for(j=27;j<og->header_len;j++)
+    fprintf(stderr,"%d ",(int)og->header[j]);
+  fprintf(stderr,")\n\n");
+}
+
+void copy_page(ogg_page *og){
+  unsigned char *temp=_ogg_malloc(og->header_len);
+  memcpy(temp,og->header,og->header_len);
+  og->header=temp;
+
+  temp=_ogg_malloc(og->body_len);
+  memcpy(temp,og->body,og->body_len);
+  og->body=temp;
+}
+
+void free_page(ogg_page *og){
+  _ogg_free (og->header);
+  _ogg_free (og->body);
+}
+
+void error(void){
+  fprintf(stderr,"error!\n");
+  exit(1);
+}
+
+/* 17 only */
+const int head1_0[] = {0x4f,0x67,0x67,0x53,0,0x06,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x15,0xed,0xec,0x91,
+                       1,
+                       17};
+
+/* 17, 254, 255, 256, 500, 510, 600 byte, pad */
+const int head1_1[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x59,0x10,0x6c,0x2c,
+                       1,
+                       17};
+const int head2_1[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x18,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x89,0x33,0x85,0xce,
+                       13,
+                       254,255,0,255,1,255,245,255,255,0,
+                       255,255,90};
+
+/* nil packets; beginning,middle,end */
+const int head1_2[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+const int head2_2[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x28,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x5c,0x3f,0x66,0xcb,
+                       17,
+                       17,254,255,0,0,255,1,0,255,245,255,255,0,
+                       255,255,90,0};
+
+/* large initial packet */
+const int head1_3[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0x01,0x27,0x31,0xaa,
+                       18,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,255,10};
+
+const int head2_3[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x08,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x7f,0x4e,0x8a,0xd2,
+                       4,
+                       255,4,255,0};
+
+
+/* continuing packet test */
+const int head1_4[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_4[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0xf8,0x3c,0x19,0x79,
+                       255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255};
+
+const int head3_4[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0x38,0xe6,0xb6,0x28,
+                       6,
+                       255,220,255,4,255,0};
+
+
+/* spill expansion test */
+const int head1_4b[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,0,0,0,0,
+                        0xff,0x7b,0x23,0x17,
+                        1,
+                        0};
+
+const int head2_4b[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                        0x07,0x10,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,1,0,0,0,
+                        0xce,0x8f,0x17,0x1a,
+                        23,
+                        255,255,255,255,255,255,255,255,
+                        255,255,255,255,255,255,255,255,255,10,255,4,255,0,0};
+
+
+const int head3_4b[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                        0x07,0x14,0x00,0x00,0x00,0x00,0x00,0x00,
+                        0x01,0x02,0x03,0x04,2,0,0,0,
+                        0x9b,0xb2,0x50,0xa1,
+                        1,
+                        0};
+
+/* page with the 255 segment limit */
+const int head1_5[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_5[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0xfc,0x03,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0xed,0x2a,0x2e,0xa7,
+                       255,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10,10,
+                       10,10,10,10,10,10,10};
+
+const int head3_5[] = {0x4f,0x67,0x67,0x53,0,0x04,
+                       0x07,0x00,0x04,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0x6c,0x3b,0x82,0x3d,
+                       1,
+                       50};
+
+
+/* packet that overspans over an entire page */
+const int head1_6[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_6[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x68,0x22,0x7c,0x3d,
+                       255,
+                       100,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255};
+
+const int head3_6[] = {0x4f,0x67,0x67,0x53,0,0x01,
+                       0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0xf4,0x87,0xba,0xf3,
+                       255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255};
+
+const int head4_6[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x10,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,3,0,0,0,
+                       0xf7,0x2f,0x6c,0x60,
+                       5,
+                       254,255,4,255,0};
+
+/* packet that overspans over an entire page */
+const int head1_7[] = {0x4f,0x67,0x67,0x53,0,0x02,
+                       0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,0,0,0,0,
+                       0xff,0x7b,0x23,0x17,
+                       1,
+                       0};
+
+const int head2_7[] = {0x4f,0x67,0x67,0x53,0,0x00,
+                       0x07,0x04,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,1,0,0,0,
+                       0x68,0x22,0x7c,0x3d,
+                       255,
+                       100,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255,255,255,
+                       255,255,255,255,255,255};
+
+const int head3_7[] = {0x4f,0x67,0x67,0x53,0,0x05,
+                       0x07,0x08,0x00,0x00,0x00,0x00,0x00,0x00,
+                       0x01,0x02,0x03,0x04,2,0,0,0,
+                       0xd4,0xe0,0x60,0xe5,
+                       1,
+                       0};
+
+void test_pack(const int *pl, const int **headers, int byteskip,
+               int pageskip, int packetskip){
+  unsigned char *data=_ogg_malloc(1024*1024); /* for scripted test cases only */
+  long inptr=0;
+  long outptr=0;
+  long deptr=0;
+  long depacket=0;
+  long granule_pos=7,pageno=0;
+  int i,j,packets,pageout=pageskip;
+  int eosflag=0;
+  int bosflag=0;
+
+  int byteskipcount=0;
+
+  ogg_stream_reset(&os_en);
+  ogg_stream_reset(&os_de);
+  ogg_sync_reset(&oy);
+
+  for(packets=0;packets<packetskip;packets++)
+    depacket+=pl[packets];
+
+  for(packets=0;;packets++)if(pl[packets]==-1)break;
+
+  for(i=0;i<packets;i++){
+    /* construct a test packet */
+    ogg_packet op;
+    int len=pl[i];
+
+    op.packet=data+inptr;
+    op.bytes=len;
+    op.e_o_s=(pl[i+1]<0?1:0);
+    op.granulepos=granule_pos;
+
+    granule_pos+=1024;
+
+    for(j=0;j<len;j++)data[inptr++]=i+j;
+
+    /* submit the test packet */
+    ogg_stream_packetin(&os_en,&op);
+
+    /* retrieve any finished pages */
+    {
+      ogg_page og;
+
+      while(ogg_stream_pageout(&os_en,&og)){
+        /* We have a page.  Check it carefully */
+
+        fprintf(stderr,"%ld, ",pageno);
+
+        if(headers[pageno]==NULL){
+          fprintf(stderr,"coded too many pages!\n");
+          exit(1);
+        }
+
+        check_page(data+outptr,headers[pageno],&og);
+
+        outptr+=og.body_len;
+        pageno++;
+        if(pageskip){
+          bosflag=1;
+          pageskip--;
+          deptr+=og.body_len;
+        }
+
+        /* have a complete page; submit it to sync/decode */
+
+        {
+          ogg_page og_de;
+          ogg_packet op_de,op_de2;
+          char *buf=ogg_sync_buffer(&oy,og.header_len+og.body_len);
+          char *next=buf;
+          byteskipcount+=og.header_len;
+          if(byteskipcount>byteskip){
+            memcpy(next,og.header,byteskipcount-byteskip);
+            next+=byteskipcount-byteskip;
+            byteskipcount=byteskip;
+          }
+
+          byteskipcount+=og.body_len;
+          if(byteskipcount>byteskip){
+            memcpy(next,og.body,byteskipcount-byteskip);
+            next+=byteskipcount-byteskip;
+            byteskipcount=byteskip;
+          }
+
+          ogg_sync_wrote(&oy,next-buf);
+
+          while(1){
+            int ret=ogg_sync_pageout(&oy,&og_de);
+            if(ret==0)break;
+            if(ret<0)continue;
+            /* got a page.  Happy happy.  Verify that it's good. */
+
+            fprintf(stderr,"(%d), ",pageout);
+
+            check_page(data+deptr,headers[pageout],&og_de);
+            deptr+=og_de.body_len;
+            pageout++;
+
+            /* submit it to deconstitution */
+            ogg_stream_pagein(&os_de,&og_de);
+
+            /* packets out? */
+            while(ogg_stream_packetpeek(&os_de,&op_de2)>0){
+              ogg_stream_packetpeek(&os_de,NULL);
+              ogg_stream_packetout(&os_de,&op_de); /* just catching them all */
+
+              /* verify peek and out match */
+              if(memcmp(&op_de,&op_de2,sizeof(op_de))){
+                fprintf(stderr,"packetout != packetpeek! pos=%ld\n",
+                        depacket);
+                exit(1);
+              }
+
+              /* verify the packet! */
+              /* check data */
+              if(memcmp(data+depacket,op_de.packet,op_de.bytes)){
+                fprintf(stderr,"packet data mismatch in decode! pos=%ld\n",
+                        depacket);
+                exit(1);
+              }
+              /* check bos flag */
+              if(bosflag==0 && op_de.b_o_s==0){
+                fprintf(stderr,"b_o_s flag not set on packet!\n");
+                exit(1);
+              }
+              if(bosflag && op_de.b_o_s){
+                fprintf(stderr,"b_o_s flag incorrectly set on packet!\n");
+                exit(1);
+              }
+              bosflag=1;
+              depacket+=op_de.bytes;
+
+              /* check eos flag */
+              if(eosflag){
+                fprintf(stderr,"Multiple decoded packets with eos flag!\n");
+                exit(1);
+              }
+
+              if(op_de.e_o_s)eosflag=1;
+
+              /* check granulepos flag */
+              if(op_de.granulepos!=-1){
+                fprintf(stderr," granule:%ld ",(long)op_de.granulepos);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  _ogg_free(data);
+  if(headers[pageno]!=NULL){
+    fprintf(stderr,"did not write last page!\n");
+    exit(1);
+  }
+  if(headers[pageout]!=NULL){
+    fprintf(stderr,"did not decode last page!\n");
+    exit(1);
+  }
+  if(inptr!=outptr){
+    fprintf(stderr,"encoded page data incomplete!\n");
+    exit(1);
+  }
+  if(inptr!=deptr){
+    fprintf(stderr,"decoded page data incomplete!\n");
+    exit(1);
+  }
+  if(inptr!=depacket){
+    fprintf(stderr,"decoded packet data incomplete!\n");
+    exit(1);
+  }
+  if(!eosflag){
+    fprintf(stderr,"Never got a packet with EOS set!\n");
+    exit(1);
+  }
+  fprintf(stderr,"ok.\n");
+}
+
+int main(void){
+
+  ogg_stream_init(&os_en,0x04030201);
+  ogg_stream_init(&os_de,0x04030201);
+  ogg_sync_init(&oy);
+
+  /* Exercise each code path in the framing code.  Also verify that
+     the checksums are working.  */
+
+  {
+    /* 17 only */
+    const int packets[]={17, -1};
+    const int *headret[]={head1_0,NULL};
+
+    fprintf(stderr,"testing single page encoding... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* 17, 254, 255, 256, 500, 510, 600 byte, pad */
+    const int packets[]={17, 254, 255, 256, 500, 510, 600, -1};
+    const int *headret[]={head1_1,head2_1,NULL};
+
+    fprintf(stderr,"testing basic page encoding... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* nil packets; beginning,middle,end */
+    const int packets[]={0,17, 254, 255, 0, 256, 0, 500, 510, 600, 0, -1};
+    const int *headret[]={head1_2,head2_2,NULL};
+
+    fprintf(stderr,"testing basic nil packets... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* large initial packet */
+    const int packets[]={4345,259,255,-1};
+    const int *headret[]={head1_3,head2_3,NULL};
+
+    fprintf(stderr,"testing initial-packet lacing > 4k... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* continuing packet test; with page spill expansion, we have to
+       overflow the lacing table. */
+    const int packets[]={0,65500,259,255,-1};
+    const int *headret[]={head1_4,head2_4,head3_4,NULL};
+
+    fprintf(stderr,"testing single packet page span... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* spill expand packet test */
+    const int packets[]={0,4345,259,255,0,0,-1};
+    const int *headret[]={head1_4b,head2_4b,head3_4b,NULL};
+
+    fprintf(stderr,"testing page spill expansion... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  /* page with the 255 segment limit */
+  {
+
+    const int packets[]={0,10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,10,
+                   10,10,10,10,10,10,10,50,-1};
+    const int *headret[]={head1_5,head2_5,head3_5,NULL};
+
+    fprintf(stderr,"testing max packet segments... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* packet that overspans over an entire page */
+    const int packets[]={0,100,130049,259,255,-1};
+    const int *headret[]={head1_6,head2_6,head3_6,head4_6,NULL};
+
+    fprintf(stderr,"testing very large packets... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+  {
+    /* test for the libogg 1.1.1 resync in large continuation bug
+       found by Josh Coalson */
+    const int packets[]={0,100,130049,259,255,-1};
+    const int *headret[]={head1_6,head2_6,head3_6,head4_6,NULL};
+
+    fprintf(stderr,"testing continuation resync in very large packets... ");
+    test_pack(packets,headret,100,2,3);
+  }
+
+  {
+    /* term only page.  why not? */
+    const int packets[]={0,100,64770,-1};
+    const int *headret[]={head1_7,head2_7,head3_7,NULL};
+
+    fprintf(stderr,"testing zero data page (1 nil packet)... ");
+    test_pack(packets,headret,0,0,0);
+  }
+
+
+
+  {
+    /* build a bunch of pages for testing */
+    unsigned char *data=_ogg_malloc(1024*1024);
+    int pl[]={0, 1,1,98,4079, 1,1,2954,2057, 76,34,912,0,234,1000,1000, 1000,300,-1};
+    int inptr=0,i,j;
+    ogg_page og[5];
+
+    ogg_stream_reset(&os_en);
+
+    for(i=0;pl[i]!=-1;i++){
+      ogg_packet op;
+      int len=pl[i];
+
+      op.packet=data+inptr;
+      op.bytes=len;
+      op.e_o_s=(pl[i+1]<0?1:0);
+      op.granulepos=(i+1)*1000;
+
+      for(j=0;j<len;j++)data[inptr++]=i+j;
+      ogg_stream_packetin(&os_en,&op);
+    }
+
+    _ogg_free(data);
+
+    /* retrieve finished pages */
+    for(i=0;i<5;i++){
+      if(ogg_stream_pageout(&os_en,&og[i])==0){
+        fprintf(stderr,"Too few pages output building sync tests!\n");
+        exit(1);
+      }
+      copy_page(&og[i]);
+    }
+
+    /* Test lost pages on pagein/packetout: no rollback */
+    {
+      ogg_page temp;
+      ogg_packet test;
+
+      fprintf(stderr,"Testing loss of pages... ");
+
+      ogg_sync_reset(&oy);
+      ogg_stream_reset(&os_de);
+      for(i=0;i<5;i++){
+        memcpy(ogg_sync_buffer(&oy,og[i].header_len),og[i].header,
+               og[i].header_len);
+        ogg_sync_wrote(&oy,og[i].header_len);
+        memcpy(ogg_sync_buffer(&oy,og[i].body_len),og[i].body,og[i].body_len);
+        ogg_sync_wrote(&oy,og[i].body_len);
+      }
+
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      /* skip */
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+
+      /* do we get the expected results/packets? */
+
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,0,0,0);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,1,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,2,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,98,3,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,4079,4,5000);
+      if(ogg_stream_packetout(&os_de,&test)!=-1){
+        fprintf(stderr,"Error: loss of page did not return error\n");
+        exit(1);
+      }
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,76,9,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,34,10,-1);
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test lost pages on pagein/packetout: rollback with continuation */
+    {
+      ogg_page temp;
+      ogg_packet test;
+
+      fprintf(stderr,"Testing loss of pages (rollback required)... ");
+
+      ogg_sync_reset(&oy);
+      ogg_stream_reset(&os_de);
+      for(i=0;i<5;i++){
+        memcpy(ogg_sync_buffer(&oy,og[i].header_len),og[i].header,
+               og[i].header_len);
+        ogg_sync_wrote(&oy,og[i].header_len);
+        memcpy(ogg_sync_buffer(&oy,og[i].body_len),og[i].body,og[i].body_len);
+        ogg_sync_wrote(&oy,og[i].body_len);
+      }
+
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+      ogg_sync_pageout(&oy,&temp);
+      /* skip */
+      ogg_sync_pageout(&oy,&temp);
+      ogg_stream_pagein(&os_de,&temp);
+
+      /* do we get the expected results/packets? */
+
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,0,0,0);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,1,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,2,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,98,3,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,4079,4,5000);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,5,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,1,6,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,2954,7,-1);
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,2057,8,9000);
+      if(ogg_stream_packetout(&os_de,&test)!=-1){
+        fprintf(stderr,"Error: loss of page did not return error\n");
+        exit(1);
+      }
+      if(ogg_stream_packetout(&os_de,&test)!=1)error();
+      checkpacket(&test,300,17,18000);
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* the rest only test sync */
+    {
+      ogg_page og_de;
+      /* Test fractional page inputs: incomplete capture */
+      fprintf(stderr,"Testing sync on partial inputs... ");
+      ogg_sync_reset(&oy);
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             3);
+      ogg_sync_wrote(&oy,3);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete fixed header */
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+3,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete header */
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+23,
+             5);
+      ogg_sync_wrote(&oy,5);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      /* Test fractional page inputs: incomplete body */
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+28,
+             og[1].header_len-28);
+      ogg_sync_wrote(&oy,og[1].header_len-28);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,1000);
+      ogg_sync_wrote(&oy,1000);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body+1000,
+             og[1].body_len-1000);
+      ogg_sync_wrote(&oy,og[1].body_len-1000);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test fractional page inputs: page + incomplete capture */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing sync on 1+partial inputs... ");
+      ogg_sync_reset(&oy);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header+20,
+             og[1].header_len-20);
+      ogg_sync_wrote(&oy,og[1].header_len-20);
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test recapture: garbage + page */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing search for capture... ");
+      ogg_sync_reset(&oy);
+
+      /* 'garbage' */
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             20);
+      ogg_sync_wrote(&oy,20);
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header+20,
+             og[2].header_len-20);
+      ogg_sync_wrote(&oy,og[2].header_len-20);
+      memcpy(ogg_sync_buffer(&oy,og[2].body_len),og[2].body,
+             og[2].body_len);
+      ogg_sync_wrote(&oy,og[2].body_len);
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Test recapture: page + garbage + page */
+    {
+      ogg_page og_de;
+      fprintf(stderr,"Testing recapture... ");
+      ogg_sync_reset(&oy);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].header_len),og[1].header,
+             og[1].header_len);
+      ogg_sync_wrote(&oy,og[1].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[1].body_len),og[1].body,
+             og[1].body_len);
+      ogg_sync_wrote(&oy,og[1].body_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             og[2].header_len);
+      ogg_sync_wrote(&oy,og[2].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[2].header_len),og[2].header,
+             og[2].header_len);
+      ogg_sync_wrote(&oy,og[2].header_len);
+
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      memcpy(ogg_sync_buffer(&oy,og[2].body_len),og[2].body,
+             og[2].body_len-5);
+      ogg_sync_wrote(&oy,og[2].body_len-5);
+
+      memcpy(ogg_sync_buffer(&oy,og[3].header_len),og[3].header,
+             og[3].header_len);
+      ogg_sync_wrote(&oy,og[3].header_len);
+
+      memcpy(ogg_sync_buffer(&oy,og[3].body_len),og[3].body,
+             og[3].body_len);
+      ogg_sync_wrote(&oy,og[3].body_len);
+
+      if(ogg_sync_pageout(&oy,&og_de)>0)error();
+      if(ogg_sync_pageout(&oy,&og_de)<=0)error();
+
+      fprintf(stderr,"ok.\n");
+    }
+
+    /* Free page data that was previously copied */
+    {
+      for(i=0;i<5;i++){
+        free_page(&og[i]);
+      }
+    }
+  }
+
+  return(0);
+}
+
+#endif

+ 28 - 0
modules/theoraplayer/native/theora/COPYING

@@ -0,0 +1,28 @@
+Copyright (C) 2002-2009 Xiph.org Foundation
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the Xiph.org Foundation nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 606 - 0
modules/theoraplayer/native/theora/include/theora/codec.h

@@ -0,0 +1,606 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\mainpage
+ *
+ * \section intro Introduction
+ *
+ * This is the documentation for the <tt>libtheora</tt> C API.
+ *
+ * The \c libtheora package is the current reference
+ * implementation for <a href="http://www.theora.org/">Theora</a>, a free,
+ * patent-unencumbered video codec.
+ * Theora is derived from On2's VP3 codec with additional features and
+ *  integration with Ogg multimedia formats by
+ *  <a href="http://www.xiph.org/">the Xiph.Org Foundation</a>.
+ * Complete documentation of the format itself is available in
+ * <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>.
+ *
+ * \section Organization
+ *
+ * The functions documented here are divided between two
+ * separate libraries:
+ * - \c libtheoraenc contains the encoder interface,
+ *   described in \ref encfuncs.
+ * - \c libtheoradec contains the decoder interface,
+ *   described in \ref decfuncs, \n
+ *   and additional \ref basefuncs.
+ *
+ * New code should link to \c libtheoradec. If using encoder
+ * features, it must also link to \c libtheoraenc.
+ *
+ * During initial development, prior to the 1.0 release,
+ * \c libtheora exported a different \ref oldfuncs which
+ * combined both encode and decode functions.
+ * In general, legacy API symbols can be identified
+ * by their \c theora_ or \c OC_ namespace prefixes.
+ * The current API uses \c th_ or \c TH_ instead.
+ *
+ * While deprecated, \c libtheoraenc and \c libtheoradec
+ * together export the legacy API as well as the one documented above.
+ * Likewise, the legacy \c libtheora included with this package
+ * exports the new 1.x API. Older code and build scripts can therefore
+ * be updated independently to the current scheme.
+ */
+
+/**\file
+ * The shared <tt>libtheoradec</tt> and <tt>libtheoraenc</tt> C API.
+ * You don't need to include this directly.*/
+
+#if !defined(_O_THEORA_CODEC_H_)
+# define _O_THEORA_CODEC_H_ (1)
+# include <ogg/ogg.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name Return codes*/
+/*@{*/
+/**An invalid pointer was provided.*/
+#define TH_EFAULT     (-1)
+/**An invalid argument was provided.*/
+#define TH_EINVAL     (-10)
+/**The contents of the header were incomplete, invalid, or unexpected.*/
+#define TH_EBADHEADER (-20)
+/**The header does not belong to a Theora stream.*/
+#define TH_ENOTFORMAT (-21)
+/**The bitstream version is too high.*/
+#define TH_EVERSION   (-22)
+/**The specified function is not implemented.*/
+#define TH_EIMPL      (-23)
+/**There were errors in the video data packet.*/
+#define TH_EBADPACKET (-24)
+/**The decoded packet represented a dropped frame.
+   The player can continue to display the current frame, as the contents of the
+    decoded frame buffer have not changed.*/
+#define TH_DUPFRAME   (1)
+/*@}*/
+
+/**The currently defined color space tags.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Chapter 4, for exact details on the meaning
+ *  of each of these color spaces.*/
+typedef enum{
+  /**The color space was not specified at the encoder.
+      It may be conveyed by an external means.*/
+  TH_CS_UNSPECIFIED,
+  /**A color space designed for NTSC content.*/
+  TH_CS_ITU_REC_470M,
+  /**A color space designed for PAL/SECAM content.*/
+  TH_CS_ITU_REC_470BG,
+  /**The total number of currently defined color spaces.*/
+  TH_CS_NSPACES
+}th_colorspace;
+
+/**The currently defined pixel format tags.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 4.4, for details on the precise sample
+ *  locations.*/
+typedef enum{
+  /**Chroma decimation by 2 in both the X and Y directions (4:2:0).
+     The Cb and Cr chroma planes are half the width and half the
+      height of the luma plane.*/
+  TH_PF_420,
+  /**Currently reserved.*/
+  TH_PF_RSVD,
+  /**Chroma decimation by 2 in the X direction (4:2:2).
+     The Cb and Cr chroma planes are half the width of the luma plane, but full
+      height.*/
+  TH_PF_422,
+  /**No chroma decimation (4:4:4).
+     The Cb and Cr chroma planes are full width and full height.*/
+  TH_PF_444,
+  /**The total number of currently defined pixel formats.*/
+  TH_PF_NFORMATS
+}th_pixel_fmt;
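+
+/* Editorial sketch (not part of libtheora): how chroma plane dimensions
+   follow from the pixel format tags above for a frame whose dimensions are
+   already multiples of 16.  TH_PF_RSVD is ignored, and the guard macro is
+   a never-defined assumption of this sketch. */
+#ifdef TH_CODEC_USAGE_EXAMPLE
+static void th_example_chroma_dims(ogg_uint32_t frame_width,
+                                   ogg_uint32_t frame_height,
+                                   th_pixel_fmt fmt,
+                                   ogg_uint32_t *cw,ogg_uint32_t *ch){
+  *cw=(fmt==TH_PF_444)?frame_width  :frame_width/2;  /* 4:2:0, 4:2:2 halve X */
+  *ch=(fmt==TH_PF_420)?frame_height/2:frame_height;  /* only 4:2:0 halves Y  */
+}
+#endif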
+
+
+
+/**A buffer for a single color plane in an uncompressed image.
+ * This contains the image data in a left-to-right, top-down format.
+ * Each row of pixels is stored contiguously in memory, but successive
+ *  rows need not be.
+ * Use \a stride to compute the offset of the next row.
+ * The encoder accepts both positive \a stride values (top-down in memory)
+ *  and negative (bottom-up in memory).
+ * The decoder currently always generates images with positive strides.*/
+typedef struct{
+  /**The width of this plane.*/
+  int            width;
+  /**The height of this plane.*/
+  int            height;
+  /**The offset in bytes between successive rows.*/
+  int            stride;
+  /**A pointer to the beginning of the first row.*/
+  unsigned char *data;
+}th_img_plane;
+
+/**A complete image buffer for an uncompressed frame.
+ * The chroma planes may be decimated by a factor of two in either
+ *  direction, as indicated by th_info#pixel_fmt.
+ * The width and height of the Y' plane must be multiples of 16.
+ * They may need to be cropped for display, using the rectangle
+ *  specified by th_info#pic_x, th_info#pic_y, th_info#pic_width,
+ *  and th_info#pic_height.
+ * All samples are 8 bits.
+ * \note The term YUV often used to describe a colorspace is ambiguous.
+ * The exact parameters of the RGB to YUV conversion process aside, in
+ *  many contexts the U and V channels actually have opposite meanings.
+ * To avoid this confusion, we are explicit: the names of the color
+ *  channels are Y'CbCr, and they appear in that order, always.
+ * The prime symbol denotes that the Y channel is non-linear.
+ * Cb and Cr stand for "Chroma blue" and "Chroma red", respectively.*/
+typedef th_img_plane th_ycbcr_buffer[3];
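+
+/* Editorial sketch (not part of libtheora): walking a decoded
+   th_ycbcr_buffer row by row.  Each plane carries its own width, height
+   and stride, so rows must be stepped by 'stride' rather than 'width'.
+   process_row() is a hypothetical consumer and the guard macro is never
+   defined. */
+#ifdef TH_CODEC_USAGE_EXAMPLE
+extern void process_row(const unsigned char *row,int width); /* hypothetical */
+static void th_example_walk_buffer(th_ycbcr_buffer buf){
+  int pli;
+  for(pli=0;pli<3;pli++){                     /* plane 0=Y', 1=Cb, 2=Cr      */
+    int row;
+    for(row=0;row<buf[pli].height;row++){
+      unsigned char *px=buf[pli].data+row*buf[pli].stride;
+      process_row(px,buf[pli].width);         /* 'width' valid samples/row   */
+    }
+  }
+}
+#endif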
+
+/**Theora bitstream information.
+ * This contains the basic playback parameters for a stream, and corresponds to
+ *  the initial 'info' header packet.
+ * To initialize an encoder, the application fills in this structure and
+ *  passes it to th_encode_alloc().
+ * A default encoding mode is chosen based on the values of the #quality and
+ *  #target_bitrate fields.
+ * On decode, it is filled in by th_decode_headerin(), and then passed to
+ *  th_decode_alloc().
+ *
+ * Encoded Theora frames must be a multiple of 16 in size;
+ *  this is what the #frame_width and #frame_height members represent.
+ * To handle arbitrary picture sizes, a crop rectangle is specified in the
+ *  #pic_x, #pic_y, #pic_width and #pic_height members.
+ *
+ * All frame buffers contain pointers to the full, padded frame.
+ * However, the current encoder <em>will not</em> reference pixels outside of
+ *  the cropped picture region, and the application does not need to fill them
+ *  in.
+ * The decoder <em>will</em> allocate storage for a full frame, but the
+ *  application <em>should not</em> rely on the padding containing sensible
+ *  data.
+ *
+ * It is also generally recommended that the offsets and sizes should still be
+ *  multiples of 2 to avoid chroma sampling shifts when chroma is sub-sampled.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 4.4, for more details.
+ *
+ * Frame rate, in frames per second, is stored as a rational fraction, as is
+ *  the pixel aspect ratio.
+ * Note that this refers to the aspect ratio of the individual pixels, not of
+ *  the overall frame itself.
+ * The frame aspect ratio can be computed from pixel aspect ratio using the
+ *  image dimensions.*/
+typedef struct{
+  /**\name Theora version
+   * Bitstream version information.*/
+  /*@{*/
+  unsigned char version_major;
+  unsigned char version_minor;
+  unsigned char version_subminor;
+  /*@}*/
+  /**The encoded frame width.
+   * This must be a multiple of 16, and less than 1048576.*/
+  ogg_uint32_t  frame_width;
+  /**The encoded frame height.
+   * This must be a multiple of 16, and less than 1048576.*/
+  ogg_uint32_t  frame_height;
+  /**The displayed picture width.
+   * This must be no larger than #frame_width.*/
+  ogg_uint32_t  pic_width;
+  /**The displayed picture height.
+   * This must be no larger than #frame_height.*/
+  ogg_uint32_t  pic_height;
+  /**The X offset of the displayed picture.
+   * This must be no larger than #frame_width-#pic_width or 255, whichever is
+   *  smaller.*/
+  ogg_uint32_t  pic_x;
+  /**The Y offset of the displayed picture.
+   * This must be no larger than #frame_height-#pic_height, and
+   *  #frame_height-#pic_height-#pic_y must be no larger than 255.
+   * This slightly funny restriction is due to the fact that the offset is
+   *  specified from the top of the image for consistency with the standard
+   *  graphics left-handed coordinate system used throughout this API, while
+   *  it is stored in the encoded stream as an offset from the bottom.*/
+  ogg_uint32_t  pic_y;
+  /**\name Frame rate
+   * The frame rate, as a fraction.
+   * If either is 0, the frame rate is undefined.*/
+  /*@{*/
+  ogg_uint32_t  fps_numerator;
+  ogg_uint32_t  fps_denominator;
+  /*@}*/
+  /**\name Aspect ratio
+   * The aspect ratio of the pixels.
+   * If either value is zero, the aspect ratio is undefined.
+   * If not specified by any external means, 1:1 should be assumed.
+   * The aspect ratio of the full picture can be computed as
+   * \code
+   *  aspect_numerator*pic_width/(aspect_denominator*pic_height).
+   * \endcode */
+  /*@{*/
+  ogg_uint32_t  aspect_numerator;
+  ogg_uint32_t  aspect_denominator;
+  /*@}*/
+  /**The color space.*/
+  th_colorspace colorspace;
+  /**The pixel format.*/
+  th_pixel_fmt  pixel_fmt;
+  /**The target bit-rate in bits per second.
+     If initializing an encoder with this struct, set this field to a non-zero
+      value to activate CBR encoding by default.*/
+  int           target_bitrate;
+  /**The target quality level.
+     Valid values range from 0 to 63, inclusive, with higher values giving
+      higher quality.
+     If initializing an encoder with this struct, and #target_bitrate is set
+      to zero, VBR encoding at this quality will be activated by default.*/
+  /*Currently this is set so that a qi of 0 corresponds to distortions of 24
+     times the JND, and each increase by 16 halves that value.
+    This gives us fine discrimination at low qualities, yet effective rate
+     control at high qualities.
+    The qi value 63 is special, however.
+    For this, the highest quality, we use one half of a JND for our threshold.
+    Due to the lower bounds placed on allowable quantizers in Theora, we will
+     not actually be able to achieve quality this good, but this should
+     provide as close to visually lossless quality as Theora is capable of.
+    We could lift the quantizer restrictions without breaking VP3.1
+     compatibility, but this would result in quantized coefficients that are
+     too large for the current bitstream to be able to store.
+    We'd have to redesign the token syntax to store these large coefficients,
+     which would make transcoding complex.*/
+  int           quality;
+  /**The amount to shift to extract the last keyframe number from the granule
+   *  position.
+   * This can be at most 31.
+   * th_info_init() will set this to a default value (currently <tt>6</tt>,
+   *  which is good for streaming applications), but you can set it to 0 to
+   *  make every frame a keyframe.
+   * The maximum distance between key frames is
+   *  <tt>1<<#keyframe_granule_shift</tt>.
+   * The keyframe frequency can be more finely controlled with
+   *  #TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE, which can also be adjusted
+   *  during encoding (for example, to force the next frame to be a keyframe),
+   *  but it cannot be set larger than the amount permitted by this field after
+   *  the headers have been output.*/
+  int           keyframe_granule_shift;
+}th_info;
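
As a hedged sketch of how an application might fill this structure before creating an encoder with th_encode_alloc() (declared in theoraenc.h); the 640x480 geometry, frame rate and quality below are example values only.

#include <theora/codec.h>
#include <theora/theoraenc.h>

/* Pad the picture size up to multiples of 16 for the encoded frame and use
   the crop rectangle to describe the real picture. */
static void example_info(th_info *info){
  th_info_init(info);
  info->pic_width =640;
  info->pic_height=480;
  info->frame_width =(info->pic_width +15)&~0xF;
  info->frame_height=(info->pic_height+15)&~0xF;
  info->pic_x=0;
  info->pic_y=0;
  info->fps_numerator=30;
  info->fps_denominator=1;
  info->colorspace=TH_CS_UNSPECIFIED;
  info->pixel_fmt=TH_PF_420;
  info->target_bitrate=0; /* 0 selects VBR encoding...            */
  info->quality=48;       /* ...at this quality level (0 to 63).  */
}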
+
+/**The comment information.
+ *
+ * This structure holds the in-stream metadata corresponding to
+ *  the 'comment' header packet.
+ * The comment header is meant to be used much like someone jotting a quick
+ *  note on the label of a video.
+ * It should be a short, to the point text note that can be more than a couple
+ *  words, but not more than a short paragraph.
+ *
+ * The metadata is stored as a series of (tag, value) pairs, in
+ *  length-encoded string vectors.
+ * The first occurrence of the '=' character delimits the tag and value.
+ * A particular tag may occur more than once, and order is significant.
+ * The character set encoding for the strings is always UTF-8, but the tag
+ *  names are limited to ASCII, and treated as case-insensitive.
+ * See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *  specification</a>, Section 6.3.3 for details.
+ *
+ * In filling in this structure, th_decode_headerin() will null-terminate
+ *  the user_comment strings for safety.
+ * However, the bitstream format itself treats them as 8-bit clean vectors,
+ *  possibly containing null characters, and so the length array should be
+ *  treated as their authoritative length.
+ */
+typedef struct th_comment{
+  /**The array of comment string vectors.*/
+  char **user_comments;
+  /**An array of the corresponding length of each vector, in bytes.*/
+  int   *comment_lengths;
+  /**The total number of comment strings.*/
+  int    comments;
+  /**The null-terminated vendor string.
+     This identifies the software used to encode the stream.*/
+  char  *vendor;
+}th_comment;
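
For illustration, a small sketch of walking the comment arrays; it uses comment_lengths rather than strlen(), since the length array is authoritative as noted above. The function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* Print the vendor string and every comment vector, byte-exact. */
static void dump_comments(const th_comment *tc){
  int ci;
  printf("vendor: %s\n",tc->vendor);
  for(ci=0;ci<tc->comments;ci++){
    fwrite(tc->user_comments[ci],1,(size_t)tc->comment_lengths[ci],stdout);
    fputc('\n',stdout);
  }
}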
+
+
+
+/**A single base matrix.*/
+typedef unsigned char th_quant_base[64];
+
+/**A set of \a qi ranges.*/
+typedef struct{
+  /**The number of ranges in the set.*/
+  int                  nranges;
+  /**The size of each of the #nranges ranges.
+     These must sum to 63.*/
+  const int           *sizes;
+  /**#nranges <tt>+1</tt> base matrices.
+     Matrices \a i and <tt>i+1</tt> form the endpoints of range \a i.*/
+  const th_quant_base *base_matrices;
+}th_quant_ranges;
+
+/**A complete set of quantization parameters.
+   The quantizer for each coefficient is calculated as:
+   \code
+    Q=MAX(MIN(qmin[qti][ci!=0],scale[ci!=0][qi]*base[qti][pli][qi][ci]/100),
+     1024).
+   \endcode
+
+   \a qti is the quantization type index: 0 for intra, 1 for inter.
+   <tt>ci!=0</tt> is 0 for the DC coefficient and 1 for AC coefficients.
+   \a qi is the quality index, ranging between 0 (low quality) and 63 (high
+    quality).
+   \a pli is the color plane index: 0 for Y', 1 for Cb, 2 for Cr.
+   \a ci is the DCT coefficient index.
+   Coefficient indices correspond to the normal 2D DCT block
+    ordering--row-major with low frequencies first--\em not zig-zag order.
+
+   Minimum quantizers are constant, and are given by:
+   \code
+   qmin[2][2]={{4,2},{8,4}}.
+   \endcode
+
+   Parameters that can be stored in the bitstream are as follows:
+    - The two scale matrices ac_scale and dc_scale.
+      \code
+      scale[2][64]={dc_scale,ac_scale}.
+      \endcode
+    - The base matrices for each \a qi, \a qti and \a pli (up to 384 in all).
+      In order to avoid storing a full 384 base matrices, only a sparse set of
+       matrices are stored, and the rest are linearly interpolated.
+      This is done as follows.
+      For each \a qti and \a pli, a series of \a n \a qi ranges is defined.
+      The size of each \a qi range can vary arbitrarily, but they must sum to
+       63.
+      Then, <tt>n+1</tt> matrices are specified, one for each endpoint of the
+       ranges.
+      For interpolation purposes, each range's endpoints are the first \a qi
+       value it contains and one past the last \a qi value it contains.
+      Fractional values are rounded to the nearest integer, with ties rounded
+       away from zero.
+
+      Base matrices are stored by reference, so if the same matrices are used
+       multiple times, they will only appear once in the bitstream.
+      The bitstream is also capable of omitting an entire set of ranges and
+       its associated matrices if they are the same as either the previous
+       set (indexed in row-major order) or if the inter set is the same as the
+       intra set.
+
+    - Loop filter limit values.
+      The same limits are used for the loop filter in all color planes, despite
+       potentially differing levels of quantization in each.
+
+   For the current encoder, <tt>scale[ci!=0][qi]</tt> must be no greater
+    than <tt>scale[ci!=0][qi-1]</tt> and <tt>base[qti][pli][qi][ci]</tt> must
+    be no greater than <tt>base[qti][pli][qi-1][ci]</tt>.
+   These two conditions ensure that the actual quantizer for a given \a qti,
+    \a pli, and \a ci does not increase as \a qi increases.
+   This is not required by the decoder.*/
+typedef struct{
+  /**The DC scaling factors.*/
+  ogg_uint16_t    dc_scale[64];
+  /**The AC scaling factors.*/
+  ogg_uint16_t    ac_scale[64];
+  /**The loop filter limit values.*/
+  unsigned char   loop_filter_limits[64];
+  /**The \a qi ranges for each \a ci and \a pli.*/
+  th_quant_ranges qi_ranges[2][3];
+}th_quant_info;
+
+
+
+/**The number of Huffman tables used by Theora.*/
+#define TH_NHUFFMAN_TABLES (80)
+/**The number of DCT token values in each table.*/
+#define TH_NDCT_TOKENS     (32)
+
+/**A Huffman code for a Theora DCT token.
+ * Each set of Huffman codes in a given table must form a complete, prefix-free
+ *  code.
+ * There is no requirement that all the tokens in a table have a valid code,
+ *  but the current encoder is not optimized to take advantage of this.
+ * If each of the five groups of 16 tables does not contain at least one table
+ *  with a code for every token, then the encoder may fail to encode certain
+ *  frames.
+ * The complete table in the first group of 16 does not have to be in the same
+ *  place as the complete table in the other groups, but the complete tables in
+ *  the remaining four groups must all be in the same place.*/
+typedef struct{
+  /**The bit pattern for the code, with the LSbit of the pattern aligned in
+   *   the LSbit of the word.*/
+  ogg_uint32_t pattern;
+  /**The number of bits in the code.
+   * This must be between 0 and 32, inclusive.*/
+  int          nbits;
+}th_huff_code;
+
+
+
+/**\defgroup basefuncs Functions Shared by Encode and Decode*/
+/*@{*/
+/**\name Basic shared functions
+ * These functions return information about the library itself,
+ * or provide high-level information about codec state
+ * and packet type.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
+/*@{*/
+/**Retrieves a human-readable string to identify the library vendor and
+ *  version.
+ * \return the version string.*/
+extern const char *th_version_string(void);
+/**Retrieves the library version number.
+ * This is the highest bitstream version that the encoder library will produce,
+ *  or that the decoder library can decode.
+ * This number is composed of a 16-bit major version, an 8-bit minor version
+ * and an 8-bit sub-version, packed as follows:
+ * \code
+ * (VERSION_MAJOR<<16)+(VERSION_MINOR<<8)+(VERSION_SUBMINOR)
+ * \endcode
+ * \return the version number.*/
+extern ogg_uint32_t th_version_number(void);
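
A brief usage sketch: unpacking the value returned by th_version_number() into its three components, following the layout above.

#include <stdio.h>
#include <theora/codec.h>

static void print_version(void){
  ogg_uint32_t v=th_version_number();
  printf("%s (bitstream %u.%u.%u)\n",th_version_string(),
   (unsigned)(v>>16&0xFFFF),(unsigned)(v>>8&0xFF),(unsigned)(v&0xFF));
}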
+/**Converts a granule position to an absolute frame index, starting at
+ *  <tt>0</tt>.
+ * The granule position is interpreted in the context of a given
+ *  #th_enc_ctx or #th_dec_ctx handle (either will suffice).
+ * \param _encdec  A previously allocated #th_enc_ctx or #th_dec_ctx
+ *                  handle.
+ * \param _granpos The granule position to convert.
+ * \returns The absolute frame index corresponding to \a _granpos.
+ * \retval -1 The given granule position was invalid (i.e. negative).*/
+extern ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos);
+/**Converts a granule position to an absolute time in seconds.
+ * The granule position is interpreted in the context of a given
+ *  #th_enc_ctx or #th_dec_ctx handle (either will suffice).
+ * \param _encdec  A previously allocated #th_enc_ctx or #th_dec_ctx
+ *                  handle.
+ * \param _granpos The granule position to convert.
+ * \return The absolute time in seconds corresponding to \a _granpos.
+ *         This is the "end time" for the frame, or the latest time it should
+ *          be displayed.
+ *         It is not the presentation time.
+ * \retval -1 The given granule position was invalid (i.e. negative).*/
+extern double th_granule_time(void *_encdec,ogg_int64_t _granpos);
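
A small usage sketch of the two granule conversion functions, assuming the caller already holds an encoder or decoder handle and a granule position returned by it; the function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* encdec may be either a th_enc_ctx or a th_dec_ctx. */
static void report_position(void *encdec,ogg_int64_t granpos){
  ogg_int64_t frame=th_granule_frame(encdec,granpos);
  if(frame<0)printf("invalid granule position\n");
  else printf("frame %ld ends at %.3f s\n",(long)frame,
   th_granule_time(encdec,granpos));
}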
+/**Determines whether a Theora packet is a header or not.
+ * This function does no verification beyond checking the packet type bit, so
+ *  it should not be used for bitstream identification; use
+ *  th_decode_headerin() for that.
+ * As per the Theora specification, an empty (0-byte) packet is treated as a
+ *  data packet (a delta frame with no coded blocks).
+ * \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
+ * \retval 1 The packet is a header packet
+ * \retval 0 The packet is a video data packet.*/
+extern int th_packet_isheader(ogg_packet *_op);
+/**Determines whether a theora packet is a key frame or not.
+ * This function does no verification beyond checking the packet type and
+ *  key frame bits, so it should not be used for bitstream identification; use
+ *  th_decode_headerin() for that.
+ * As per the Theora specification, an empty (0-byte) packet is treated as a
+ *  delta frame (with no coded blocks).
+ * \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
+ * \retval 1  The packet contains a key frame.
+ * \retval 0  The packet contains a delta frame.
+ * \retval -1 The packet is not a video data packet.*/
+extern int th_packet_iskeyframe(ogg_packet *_op);
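
For illustration, a minimal classifier built on the two packet predicates above; remember that neither performs real bitstream validation.

#include <theora/codec.h>

/* Returns 2 for a header packet, 1 for a key frame, 0 for a delta frame. */
static int classify_packet(ogg_packet *op){
  if(th_packet_isheader(op))return 2;
  return th_packet_iskeyframe(op)==1?1:0;
}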
+/*@}*/
+
+
+/**\name Functions for manipulating header data
+ * These functions manipulate the #th_info and #th_comment structures
+ * which describe video parameters and key-value metadata, respectively.
+ *
+ * You must link to \c libtheoradec if you use any of the
+ * functions in this section.*/
+/*@{*/
+/**Initializes a th_info structure.
+ * This should be called on a freshly allocated #th_info structure before
+ *  attempting to use it.
+ * \param _info The #th_info struct to initialize.*/
+extern void th_info_init(th_info *_info);
+/**Clears a #th_info structure.
+ * This should be called on a #th_info structure after it is no longer
+ *  needed.
+ * \param _info The #th_info struct to clear.*/
+extern void th_info_clear(th_info *_info);
+
+/**Initialize a #th_comment structure.
+ * This should be called on a freshly allocated #th_comment structure
+ *  before attempting to use it.
+ * \param _tc The #th_comment struct to initialize.*/
+extern void th_comment_init(th_comment *_tc);
+/**Add a comment to an initialized #th_comment structure.
+ * \note Neither th_comment_add() nor th_comment_add_tag() support
+ *  comments containing null values, although the bitstream format does
+ *  support them.
+ * To add such comments you will need to manipulate the #th_comment
+ *  structure directly.
+ * \param _tc      The #th_comment struct to add the comment to.
+ * \param _comment Must be a null-terminated UTF-8 string containing the
+ *                  comment in "TAG=the value" form.*/
+extern void th_comment_add(th_comment *_tc,const char *_comment);
+/**Add a comment to an initialized #th_comment structure.
+ * \note Neither th_comment_add() nor th_comment_add_tag() support
+ *  comments containing null values, although the bitstream format does
+ *  support them.
+ * To add such comments you will need to manipulate the #th_comment
+ *  structure directly.
+ * \param _tc  The #th_comment struct to add the comment to.
+ * \param _tag A null-terminated string containing the tag  associated with
+ *              the comment.
+ * \param _val The corresponding value as a null-terminated string.*/
+extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
+ const char *_val);
+/**Look up a comment value by its tag.
+ * \param _tc    An initialized #th_comment structure.
+ * \param _tag   The tag to look up.
+ * \param _count The instance of the tag.
+ *               The same tag can appear multiple times, each with a distinct
+ *                value, so an index is required to retrieve them all.
+ *               The order in which these values appear is significant and
+ *                should be preserved.
+ *               Use th_comment_query_count() to get the legal range for
+ *                the \a _count parameter.
+ * \return A pointer to the queried tag's value.
+ *         This points directly to data in the #th_comment structure.
+ *         It should not be modified or freed by the application, and
+ *          modifications to the structure may invalidate the pointer.
+ * \retval NULL If no matching tag is found.*/
+extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
+/**Look up the number of instances of a tag.
+ * Call this first when querying for a specific tag and then iterate over the
+ *  number of instances with separate calls to th_comment_query() to
+ *  retrieve all the values for that tag in order.
+ * \param _tc    An initialized #th_comment structure.
+ * \param _tag   The tag to look up.
+ * \return The number of instances of this particular tag.*/
+extern int th_comment_query_count(th_comment *_tc,const char *_tag);
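
A short sketch of the intended query pattern, counting first and then iterating; the decoder null-terminates the values, so printing them with %s is safe in that case. The function name is hypothetical.

#include <stdio.h>
#include <theora/codec.h>

/* Print every value stored under a tag, in stream order. */
static void print_tag(th_comment *tc,const char *tag){
  int n=th_comment_query_count(tc,tag);
  int i;
  for(i=0;i<n;i++)printf("%s=%s\n",tag,th_comment_query(tc,tag,i));
}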
+/**Clears a #th_comment structure.
+ * This should be called on a #th_comment structure after it is no longer
+ *  needed.
+ * It will free all memory used by the structure members.
+ * \param _tc The #th_comment struct to clear.*/
+extern void th_comment_clear(th_comment *_tc);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 786 - 0
modules/theoraplayer/native/theora/include/theora/theora.h

@@ -0,0 +1,786 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.17 2003/12/06 18:06:19 arc Exp $
+
+ ********************************************************************/
+
+#ifndef _O_THEORA_H_
+#define _O_THEORA_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+#include <stddef.h>	/* for size_t */
+
+#include <ogg/ogg.h>
+
+/** \file
+ * The libtheora pre-1.0 legacy C API.
+ *
+ * \ingroup oldfuncs
+ *
+ * \section intro Introduction
+ *
+ * This is the documentation for the libtheora legacy C API, declared in
+ * the theora.h header, which describes the old interface used before
+ * the 1.0 release. This API was widely deployed for several years and
+ * remains supported, but for new code we recommend the cleaner API
+ * declared in theoradec.h and theoraenc.h.
+ *
+ * libtheora is the reference implementation for
+ * <a href="http://www.theora.org/">Theora</a>, a free video codec.
+ * Theora is derived from On2's VP3 codec with improved integration with
+ * Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
+ *
+ * \section overview Overview
+ *
+ * This library will both decode and encode theora packets to/from raw YUV
+ * frames.  In either case, the packets will most likely either come from or
+ * need to be embedded in an Ogg stream.  Use
+ * <a href="http://xiph.org/ogg/">libogg</a> or
+ * <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
+ * to extract/package these packets.
+ *
+ * \section decoding Decoding Process
+ *
+ * Decoding can be separated into the following steps:
+ * -# initialise theora_info and theora_comment structures using
+ *    theora_info_init() and theora_comment_init():
+ \verbatim
+ theora_info     info;
+ theora_comment  comment;
+
+ theora_info_init(&info);
+ theora_comment_init(&comment);
+ \endverbatim
+ * -# retrieve header packets from Ogg stream (there should be 3) and decode
+ *    into theora_info and theora_comment structures using
+ *    theora_decode_header().  See \ref identification for more information on
+ *    identifying which packets are theora packets.
+ \verbatim
+ int i;
+ for (i = 0; i < 3; i++)
+ {
+   (get a theora packet "op" from the Ogg stream)
+   theora_decode_header(&info, &comment, op);
+ }
+ \endverbatim
+ * -# initialise the decoder based on the information retrieved into the
+ *    theora_info struct by theora_decode_header().  You will need a
+ *    theora_state struct.
+ \verbatim
+ theora_state state;
+
+ theora_decode_init(&state, &info);
+ \endverbatim
+ * -# pass in packets and retrieve decoded frames!  See the yuv_buffer
+ *    documentation for information on how to retrieve raw YUV data.
+ \verbatim
+ yuv_buffer buffer;
+ while (last packet was not e_o_s) {
+   (get a theora packet "op" from the Ogg stream)
+   theora_decode_packetin(&state, op);
+   theora_decode_YUVout(&state, &buffer);
+ }
+ \endverbatim
+ *
+ *
+ * \subsection identification Identifying Theora Packets
+ *
+ * All streams inside an Ogg file have a unique serial_no attached to the
+ * stream.  Typically, you will want to
+ *  - retrieve the serial_no for each b_o_s (beginning of stream) page
+ *    encountered within the Ogg file;
+ *  - test the first (only) packet on that page to determine if it is a theora
+ *    packet;
+ *  - once you have found a theora b_o_s page then use the retrieved serial_no
+ *    to identify future packets belonging to the same theora stream.
+ *
+ * Note that you \e cannot use theora_packet_isheader() to determine if a
+ * packet is a theora packet or not, as this function does not perform any
+ * checking beyond whether a header bit is present.  Instead, use the
+ * theora_decode_header() function and check the return value; or examine the
+ * header bytes at the beginning of the Ogg page.
+ */
+
+
+/** \defgroup oldfuncs Legacy pre-1.0 C API */
+/*  @{ */
+
+/**
+ * A YUV buffer for passing uncompressed frames to and from the codec.
+ * This holds a Y'CbCr frame in planar format. The CbCr planes can be
+ * subsampled and have their own separate dimensions and row stride
+ * offsets. Note that the strides may be negative in some
+ * configurations. For theora the width and height of the largest plane
+ * must be a multiple of 16. The actual meaningful picture size and
+ * offset are stored in the theora_info structure; frames returned by
+ * the decoder may need to be cropped for display.
+ *
+ * All samples are 8 bits. Within each plane samples are ordered by
+ * row from the top of the frame to the bottom. Within each row samples
+ * are ordered from left to right.
+ *
+ * During decode, the yuv_buffer struct is allocated by the user, but all
+ * fields (including luma and chroma pointers) are filled by the library.
+ * These pointers address library-internal memory and their contents should
+ * not be modified.
+ *
+ * Conversely, during encode the user allocates the struct and fills out all
+ * fields.  The user also manages the data addressed by the luma and chroma
+ * pointers.  See the encoder_example.c and dump_video.c example files in
+ * theora/examples/ for more information.
+ */
+typedef struct {
+    int   y_width;      /**< Width of the Y' luminance plane */
+    int   y_height;     /**< Height of the luminance plane */
+    int   y_stride;     /**< Offset in bytes between successive rows */
+
+    int   uv_width;     /**< Width of the Cb and Cr chroma planes */
+    int   uv_height;    /**< Height of the chroma planes */
+    int   uv_stride;    /**< Offset between successive chroma rows */
+    unsigned char *y;   /**< Pointer to start of luminance data */
+    unsigned char *u;   /**< Pointer to start of Cb data */
+    unsigned char *v;   /**< Pointer to start of Cr data */
+
+} yuv_buffer;
+
+/**
+ * A Colorspace.
+ */
+typedef enum {
+  OC_CS_UNSPECIFIED,    /**< The colorspace is unknown or unspecified */
+  OC_CS_ITU_REC_470M,   /**< This is the best option for 'NTSC' content */
+  OC_CS_ITU_REC_470BG,  /**< This is the best option for 'PAL' content */
+  OC_CS_NSPACES         /**< This marks the end of the defined colorspaces */
+} theora_colorspace;
+
+/**
+ * A Chroma subsampling
+ *
+ * These enumerate the available chroma subsampling options supported
+ * by the theora format. See Section 4.4 of the specification for
+ * exact definitions.
+ */
+typedef enum {
+  OC_PF_420,    /**< Chroma subsampling by 2 in each direction (4:2:0) */
+  OC_PF_RSVD,   /**< Reserved value */
+  OC_PF_422,    /**< Horizontal chroma subsampling by 2 (4:2:2) */
+  OC_PF_444     /**< No chroma subsampling at all (4:4:4) */
+} theora_pixelformat;
+
+/**
+ * Theora bitstream info.
+ * Contains the basic playback parameters for a stream,
+ * corresponding to the initial 'info' header packet.
+ *
+ * Encoded theora frames must be a multiple of 16 in width and height.
+ * To handle other frame sizes, a crop rectangle is specified in
+ * frame_height and frame_width, offset_x and offset_y. The offset
+ * and size should still be a multiple of 2 to avoid chroma sampling
+ * shifts. Offset values in this structure are measured from the
+ * upper left of the image.
+ *
+ * Frame rate, in frames per second, is stored as a rational
+ * fraction. Aspect ratio is also stored as a rational fraction, and
+ * refers to the aspect ratio of the frame pixels, not of the
+ * overall frame itself.
+ *
+ * See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
+ * examples/encoder_example.c</a> for usage examples of the
+ * other parameters and good default settings for the encoder parameters.
+ */
+typedef struct {
+  ogg_uint32_t  width;		/**< encoded frame width  */
+  ogg_uint32_t  height;		/**< encoded frame height */
+  ogg_uint32_t  frame_width;	/**< display frame width  */
+  ogg_uint32_t  frame_height;	/**< display frame height */
+  ogg_uint32_t  offset_x;	/**< horizontal offset of the displayed frame */
+  ogg_uint32_t  offset_y;	/**< vertical offset of the displayed frame */
+  ogg_uint32_t  fps_numerator;	    /**< frame rate numerator **/
+  ogg_uint32_t  fps_denominator;    /**< frame rate denominator **/
+  ogg_uint32_t  aspect_numerator;   /**< pixel aspect ratio numerator */
+  ogg_uint32_t  aspect_denominator; /**< pixel aspect ratio denominator */
+  theora_colorspace colorspace;	    /**< colorspace */
+  int           target_bitrate;	    /**< nominal bitrate in bits per second */
+  int           quality;  /**< Nominal quality setting, 0-63 */
+  int           quick_p;  /**< Quick encode/decode */
+
+  /* decode only */
+  unsigned char version_major;
+  unsigned char version_minor;
+  unsigned char version_subminor;
+
+  void *codec_setup;
+
+  /* encode only */
+  int           dropframes_p;
+  int           keyframe_auto_p;
+  ogg_uint32_t  keyframe_frequency;
+  ogg_uint32_t  keyframe_frequency_force;  /* also used for decode init to
+                                              get granpos shift correct */
+  ogg_uint32_t  keyframe_data_target_bitrate;
+  ogg_int32_t   keyframe_auto_threshold;
+  ogg_uint32_t  keyframe_mindistance;
+  ogg_int32_t   noise_sensitivity;
+  ogg_int32_t   sharpness;
+
+  theora_pixelformat pixelformat;	/**< chroma subsampling mode to expect */
+
+} theora_info;
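
To make the naming above concrete, a hedged sketch of filling the legacy struct for encoding; note that frame_width/frame_height here are the displayed size, the opposite of the meaning those names carry in the 1.0 th_info struct. The 480x272 geometry and other settings are example values only.

#include <theora/theora.h>

static void example_legacy_info(theora_info *ti){
  theora_info_init(ti);
  ti->frame_width =480;                  /* displayed picture size      */
  ti->frame_height=272;
  ti->width =(ti->frame_width +15)&~0xF; /* encoded size, padded to 16s */
  ti->height=(ti->frame_height+15)&~0xF;
  ti->offset_x=0;
  ti->offset_y=0;
  ti->fps_numerator=25;
  ti->fps_denominator=1;
  ti->pixelformat=OC_PF_420;
  ti->colorspace=OC_CS_UNSPECIFIED;
  ti->target_bitrate=0;
  ti->quality=40;                        /* 0-63 */
}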
+
+/** Codec internal state and context.
+ */
+typedef struct{
+  theora_info *i;
+  ogg_int64_t granulepos;
+
+  void *internal_encode;
+  void *internal_decode;
+
+} theora_state;
+
+/**
+ * Comment header metadata.
+ *
+ * This structure holds the in-stream metadata corresponding to
+ * the 'comment' header packet.
+ *
+ * Meta data is stored as a series of (tag, value) pairs, in
+ * length-encoded string vectors. The first occurrence of the
+ * '=' character delimits the tag and value. A particular tag
+ * may occur more than once. The character set encoding for
+ * the strings is always UTF-8, but the tag names are limited
+ * to case-insensitive ASCII. See the spec for details.
+ *
+ * In filling in this structure, theora_decode_header() will
+ * null-terminate the user_comment strings for safety. However,
+ * the bitstream format itself treats them as 8-bit clean,
+ * and so the length array should be treated as authoritative
+ * for their length.
+ */
+typedef struct theora_comment{
+  char **user_comments;         /**< An array of comment string vectors */
+  int   *comment_lengths;       /**< An array of corresponding string vector lengths in bytes */
+  int    comments;              /**< The total number of comment string vectors */
+  char  *vendor;                /**< The vendor string identifying the encoder, null terminated */
+
+} theora_comment;
+
+
+/**\name theora_control() codes */
+/* \anchor decctlcodes_old
+ * These are the available request codes for theora_control()
+ * when called with a decoder instance.
+ * By convention decoder control codes are odd, to distinguish
+ * them from \ref encctlcodes_old "encoder control codes" which
+ * are even.
+ *
+ * Note that since the 1.0 release, both the legacy and the final
+ * implementation accept all the same control codes, but only the
+ * final API declares the newer codes.
+ *
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+
+/*@{*/
+
+/**Get the maximum post-processing level.
+ * The decoder supports a post-processing filter that can improve
+ * the appearance of the decoded images. This returns the highest
+ * level setting for this post-processor, corresponding to maximum
+ * improvement and computational expense.
+ */
+#define TH_DECCTL_GET_PPLEVEL_MAX (1)
+
+/**Set the post-processing level.
+ * Sets the level of post-processing to use when decoding the
+ * compressed stream. This must be a value between zero (off)
+ * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
+ */
+#define TH_DECCTL_SET_PPLEVEL (3)
+
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
+
+/**Set the granule position.
+ * Call this after a seek, to update the internal granulepos
+ * in the decoder, to ensure that subsequent frames are marked
+ * properly. If you track timestamps yourself and do not use
+ * the granule position returned by the decoder, then you do
+ * not need to use this control.
+ */
+#define TH_DECCTL_SET_GRANPOS (5)
+
+/**\anchor encctlcodes_old */
+
+/**Sets the quantization parameters to use.
+ * The parameters are copied, not stored by reference, so they can be freed
+ *  after this call.
+ * <tt>NULL</tt> may be specified to revert to the default parameters.
+ *
+ * \param[in] buf #th_quant_info
+ * \retval OC_FAULT  \a theora_state is <tt>NULL</tt>.
+ * \retval OC_EINVAL Encoding has already begun, the quantization parameters
+ *                    are not acceptable to this version of the encoder,
+ *                    \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
+ *                    or \a buf is non-<tt>NULL</tt> and \a buf_sz is
+ *                    not <tt>sizeof(#th_quant_info)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUANT_PARAMS (2)
+
+/**Disables any encoder features that would prevent lossless transcoding back
+ *  to VP3.
+ * This primarily means disabling block-level QI values and not using 4MV mode
+ *  when any of the luma blocks in a macro block are not coded.
+ * It also includes using the VP3 quantization tables and Huffman codes; if you
+ *  set them explicitly after calling this function, the resulting stream will
+ *  not be VP3-compatible.
+ * If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
+ *  material, or when using a picture region smaller than the full frame (e.g.
+ *  a non-multiple-of-16 width or height), then non-VP3 bitstream features will
+ *  still be disabled, but the stream will still not be VP3-compatible, as VP3
+ *  was not capable of encoding such formats.
+ * If you call this after encoding has already begun, then the quantization
+ *  tables and codebooks cannot be changed, but the frame-level features will
+ *  be enabled or disabled as requested.
+ *
+ * \param[in]  buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
+ *                   or 0 to disable it (the default).
+ * \param[out] buf <tt>int</tt>: 1 if all bitstream features required for
+ *                   VP3-compatibility could be set, and 0 otherwise.
+ *                  The latter will be returned if the pixel format is not
+ *                   4:2:0, the picture region is smaller than the full frame,
+ *                   or if encoding has begun, preventing the quantization
+ *                   tables and codebooks from being set.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
+
+/**Gets the maximum speed level.
+ * Higher speed levels favor quicker encoding over better quality per bit.
+ * Depending on the encoding mode, and the internal algorithms used, quality
+ *  may actually improve, but in this case bitrate will also likely increase.
+ * In any case, overall rate/distortion performance will probably decrease.
+ * The maximum value, and the meaning of each value, may change depending on
+ *  the current encoding mode (VBR vs. CQI, etc.).
+ *
+ * \param[out] buf int: The maximum encoding speed level.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval OC_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
+
+/**Sets the speed level.
+ * By default a speed value of 1 is used.
+ *
+ * \param[in] buf int: The new encoding speed level.
+ *                      0 is slowest, larger values use less CPU.
+ * \retval OC_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    encoding speed level is out of bounds.
+ *                   The maximum encoding speed level may be
+ *                    implementation- and encoding mode-specific, and can be
+ *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ * \retval OC_IMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_SPLEVEL (14)
+
+/*@}*/
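
As a usage sketch of the two speed-level codes above via theora_control() (declared near the end of this header); error handling beyond propagating the return value is omitted, and the function name is hypothetical.

#include <theora/theora.h>

/* Query the maximum speed level and select it, trading quality for CPU. */
static int use_fastest_speed(theora_state *ts){
  int max_speed=0;
  int ret=theora_control(ts,TH_ENCCTL_GET_SPLEVEL_MAX,
   &max_speed,sizeof(max_speed));
  if(ret<0)return ret;
  return theora_control(ts,TH_ENCCTL_SET_SPLEVEL,
   &max_speed,sizeof(max_speed));
}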
+
+#define OC_FAULT       -1       /**< General failure */
+#define OC_EINVAL      -10      /**< Library encountered invalid internal data */
+#define OC_DISABLED    -11      /**< Requested action is disabled */
+#define OC_BADHEADER   -20      /**< Header packet was corrupt/invalid */
+#define OC_NOTFORMAT   -21      /**< Packet is not a theora packet */
+#define OC_VERSION     -22      /**< Bitstream version is not handled */
+#define OC_IMPL        -23      /**< Feature or action not implemented */
+#define OC_BADPACKET   -24      /**< Packet is corrupt */
+#define OC_NEWPACKET   -25      /**< Packet is an (ignorable) unhandled extension */
+#define OC_DUPFRAME    1        /**< Packet is a dropped frame */
+
+/**
+ * Retrieve a human-readable string to identify the encoder vendor and version.
+ * \returns A version string.
+ */
+extern const char *theora_version_string(void);
+
+/**
+ * Retrieve a 32-bit version number.
+ * This number is composed of a 16-bit major version, an 8-bit minor version
+ * and an 8-bit sub-version, packed as follows:
+<pre>
+   (VERSION_MAJOR<<16) + (VERSION_MINOR<<8) + (VERSION_SUB)
+</pre>
+ * \returns The version number.
+ */
+extern ogg_uint32_t theora_version_number(void);
+
+/**
+ * Initialize the theora encoder.
+ * \param th The theora_state handle to initialize for encoding.
+ * \param ti A theora_info struct filled with the desired encoding parameters.
+ * \retval 0 Success
+ */
+extern int theora_encode_init(theora_state *th, theora_info *ti);
+
+/**
+ * Submit a YUV buffer to the theora encoder.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param yuv A buffer of YUV data to encode.  Note that both the yuv_buffer
+ *            struct and the luma/chroma buffers within should be allocated by
+ *            the user.
+ * \retval OC_EINVAL Encoder is not ready, or is finished.
+ * \retval -1 The size of the given frame differs from those previously input
+ * \retval 0 Success
+ */
+extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
+
+/**
+ * Request the next packet of encoded video.
+ * The encoded data is placed in a user-provided ogg_packet structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param last_p whether this is the last packet the encoder should produce.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to encoded
+ *           data. The memory for the encoded data is owned by libtheora.
+ * \retval 0 No internal storage exists OR no packet is ready
+ * \retval -1 The encoding process has completed
+ * \retval 1 Success
+ */
+extern int theora_encode_packetout( theora_state *t, int last_p,
+                                    ogg_packet *op);
+
+/**
+ * Request a packet containing the initial header.
+ * A pointer to the header data is placed in a user-provided ogg_packet
+ * structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the header
+ *           data. The memory for the header data is owned by libtheora.
+ * \retval 0 Success
+ */
+extern int theora_encode_header(theora_state *t, ogg_packet *op);
+
+/**
+ * Request a comment header packet from provided metadata.
+ * A pointer to the comment data is placed in a user-provided ogg_packet
+ * structure.
+ * \param tc A theora_comment structure filled with the desired metadata
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the encoded
+ *           comment data. The memory for the comment data is owned by
+ *           the application, and must be freed by it using _ogg_free().
+ *           On some systems (such as Windows when using dynamic linking), this
+ *           may mean the free is executed in a different module from the
+ *           malloc, which will crash; there is no way to free this memory on
+ *           such systems.
+ * \retval 0 Success
+ */
+extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
+
+/**
+ * Request a packet containing the codebook tables for the stream.
+ * A pointer to the codebook data is placed in a user-provided ogg_packet
+ * structure.
+ * \param t A theora_state handle previously initialized for encoding.
+ * \param op An ogg_packet structure to fill. libtheora will set all
+ *           elements of this structure, including a pointer to the codebook
+ *           data. The memory for the header data is owned by libtheora.
+ * \retval 0 Success
+ */
+extern int theora_encode_tables(theora_state *t, ogg_packet *op);
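
The three header packets above must precede any video data. A hedged sketch of handing them to libogg, assuming an already-initialised ogg_stream_state; error handling is omitted and the function name is hypothetical.

#include <ogg/ogg.h>
#include <theora/theora.h>

/* Emit the info, comment and table headers, in that order. */
static void write_headers(theora_state *ts,theora_comment *tc,
 ogg_stream_state *os){
  ogg_packet op;
  theora_encode_header(ts,&op);
  ogg_stream_packetin(os,&op);
  theora_encode_comment(tc,&op);
  ogg_stream_packetin(os,&op);
  /* The comment packet's storage is owned by the application; see the
     caveat in theora_encode_comment() above. */
  theora_encode_tables(ts,&op);
  ogg_stream_packetin(os,&op);
}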
+
+/**
+ * Decode an Ogg packet, with the expectation that the packet contains
+ * an initial header, comment data or codebook tables.
+ *
+ * \param ci A theora_info structure to fill. This must have been previously
+ *           initialized with theora_info_init(). If \a op contains an initial
+ *           header, theora_decode_header() will fill \a ci with the
+ *           parsed header values. If \a op contains codebook tables,
+ *           theora_decode_header() will parse these and attach an internal
+ *           representation to \a ci->codec_setup.
+ * \param cc A theora_comment structure to fill. If \a op contains comment
+ *           data, theora_decode_header() will fill \a cc with the parsed
+ *           comments.
+ * \param op An ogg_packet structure which you expect contains an initial
+ *           header, comment data or codebook tables.
+ *
+ * \retval OC_BADHEADER \a op is NULL; OR the first byte of \a op->packet
+ *                      has the signature of an initial packet, but op is
+ *                      not a b_o_s packet; OR this packet has the signature
+ *                      of an initial header packet, but an initial header
+ *                      packet has already been seen; OR this packet has the
+ *                      signature of a comment packet, but the initial header
+ *                      has not yet been seen; OR this packet has the signature
+ *                      of a comment packet, but contains invalid data; OR
+ *                      this packet has the signature of codebook tables,
+ *                      but the initial header or comments have not yet
+ *                      been seen; OR this packet has the signature of codebook
+ *                      tables, but contains invalid data;
+ *                      OR the stream being decoded has a compatible version
+ *                      but this packet does not have the signature of a
+ *                      theora initial header, comments, or codebook packet
+ * \retval OC_VERSION   The packet data of \a op is an initial header with
+ *                      a version which is incompatible with this version of
+ *                      libtheora.
+ * \retval OC_NEWPACKET the stream being decoded has an incompatible (future)
+ *                      version and contains an unknown signature.
+ * \retval 0            Success
+ *
+ * \note The normal usage is that theora_decode_header() be called on the
+ *       first three packets of a theora logical bitstream in succession.
+ */
+extern int theora_decode_header(theora_info *ci, theora_comment *cc,
+                                ogg_packet *op);
+
+/**
+ * Initialize a theora_state handle for decoding.
+ * \param th The theora_state handle to initialize.
+ * \param c  A theora_info struct filled with the desired decoding parameters.
+ *           This is of course usually obtained from a previous call to
+ *           theora_decode_header().
+ * \retval 0 Success
+ */
+extern int theora_decode_init(theora_state *th, theora_info *c);
+
+/**
+ * Input a packet containing encoded data into the theora decoder.
+ * \param th A theora_state handle previously initialized for decoding.
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 0 Success
+ * \retval OC_BADPACKET \a op does not contain encoded video data
+ */
+extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
+
+/**
+ * Output the next available frame of decoded YUV data.
+ * \param th A theora_state handle previously initialized for decoding.
+ * \param yuv A yuv_buffer in which libtheora should place the decoded data.
+ *            Note that the buffer struct itself is allocated by the user, but
+ *            that the luma and chroma pointers will be filled in by the
+ *            library.  Also note that these luma and chroma regions should be
+ *            considered read-only by the user.
+ * \retval 0 Success
+ */
+extern int theora_decode_YUVout(theora_state *th,yuv_buffer *yuv);
+
+/**
+ * Report whether a theora packet is a header or not
+ * This function does no verification beyond checking the header
+ * flag bit so it should not be used for bitstream identification;
+ * use theora_decode_header() for that.
+ *
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 1 The packet is a header packet
+ * \retval 0 The packet is not a header packet (and so contains frame data)
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern int theora_packet_isheader(ogg_packet *op);
+
+/**
+ * Report whether a theora packet is a keyframe or not
+ *
+ * \param op An ogg_packet containing encoded theora data.
+ * \retval 1 The packet contains a keyframe image
+ * \retval 0 The packet contains an interframe delta
+ * \retval -1 The packet is not an image data packet at all
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern int theora_packet_iskeyframe(ogg_packet *op);
+
+/**
+ * Report the granulepos shift radix
+ *
+ * When embedded in Ogg, Theora uses a two-part granulepos,
+ * splitting the 64-bit field into two pieces. The more-significant
+ * section represents the frame count at the last keyframe,
+ * and the less-significant section represents the count of
+ * frames since the last keyframe. In this way the overall
+ * field is still non-decreasing with time, but usefully encodes
+ * a pointer to the last keyframe, which is necessary for
+ * correctly restarting decode after a seek.
+ *
+ * This function reports the number of bits used to represent
+ * the distance to the last keyframe, and thus how the granulepos
+ * field must be shifted or masked to obtain the two parts.
+ *
+ * Since libtheora returns compressed data in an ogg_packet
+ * structure, this may be generally useful even if the Theora
+ * packets are not being used in an Ogg container.
+ *
+ * \param ti A previously initialized theora_info struct
+ * \returns The bit shift dividing the two granulepos fields
+ *
+ * This function was added in the 1.0alpha5 release.
+ */
+int theora_granule_shift(theora_info *ti);
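
A brief sketch of splitting a granulepos into the two fields described above, using the shift reported by theora_granule_shift(); the function name is hypothetical.

#include <theora/theora.h>

static void split_granulepos(theora_info *ti,ogg_int64_t granulepos,
 ogg_int64_t *keyframe_count,ogg_int64_t *frames_since){
  int shift=theora_granule_shift(ti);
  *keyframe_count=granulepos>>shift;
  *frames_since=granulepos-(*keyframe_count<<shift);
}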
+
+/**
+ * Convert a granulepos to an absolute frame index, starting at 0.
+ * The granulepos is interpreted in the context of a given theora_state handle.
+ *
+ * Note that while the granulepos encodes the frame count (i.e. starting
+ * from 1) this call returns the frame index, starting from zero. Thus
+ * one can calculate the presentation time by dividing the index by
+ * the frame rate.
+ *
+ * \param th A previously initialized theora_state handle (encode or decode)
+ * \param granulepos The granulepos to convert.
+ * \returns The frame index corresponding to \a granulepos.
+ * \retval -1 The given granulepos is undefined (i.e. negative)
+ *
+ * This function was added in the 1.0alpha4 release.
+ */
+extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos);
+
+/**
+ * Convert a granulepos to absolute time in seconds. The granulepos is
+ * interpreted in the context of a given theora_state handle, and gives
+ * the end time of a frame's presentation as used in Ogg mux ordering.
+ *
+ * \param th A previously initialized theora_state handle (encode or decode)
+ * \param granulepos The granulepos to convert.
+ * \returns The absolute time in seconds corresponding to \a granulepos.
+ *          This is the "end time" for the frame, or the latest time it should
+ *           be displayed.
+ *          It is not the presentation time.
+ * \retval -1 The given granulepos is undefined (i.e. negative).
+ */
+extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
+
+/**
+ * Initialize a theora_info structure. All values within the given theora_info
+ * structure are initialized, and space is allocated within libtheora for
+ * internal codec setup data.
+ * \param c A theora_info struct to initialize.
+ */
+extern void theora_info_init(theora_info *c);
+
+/**
+ * Clear a theora_info structure. All values within the given theora_info
+ * structure are cleared, and associated internal codec setup data is freed.
+ * \param c A theora_info struct to clear.
+ */
+extern void theora_info_clear(theora_info *c);
+
+/**
+ * Free all internal data associated with a theora_state handle.
+ * \param t A theora_state handle.
+ */
+extern void theora_clear(theora_state *t);
+
+/**
+ * Initialize an allocated theora_comment structure
+ * \param tc An allocated theora_comment structure
+ **/
+extern void theora_comment_init(theora_comment *tc);
+
+/**
+ * Add a comment to an initialized theora_comment structure
+ * \param tc A previously initialized theora comment structure
+ * \param comment A null-terminated string encoding the comment in the form
+ *                "TAG=the value"
+ *
+ * Neither theora_comment_add() nor theora_comment_add_tag() support
+ * comments containing null values, although the bitstream format
+ * supports this. To add such comments you will need to manipulate
+ * the theora_comment structure directly.
+ **/
+
+extern void theora_comment_add(theora_comment *tc, char *comment);
+
+/**
+ * Add a comment to an initialized theora_comment structure.
+ * \param tc A previously initialized theora comment structure
+ * \param tag A null-terminated string containing the tag
+ *            associated with the comment.
+ * \param value The corresponding value as a null-terminated string
+ *
+ * Neither theora_comment_add() nor theora_comment_add_tag() support
+ * comments containing null values, although the bitstream format
+ * supports this. To add such comments you will need to manipulate
+ * the theora_comment structure directly.
+ **/
+extern void theora_comment_add_tag(theora_comment *tc,
+                                       char *tag, char *value);
+
+/**
+ * Look up a comment value by tag.
+ * \param tc An initialized theora_comment structure
+ * \param tag The tag to look up
+ * \param count The instance of the tag. The same tag can appear multiple
+ *              times, each with a distinct and ordered value, so an index
+ *              is required to retrieve them all.
+ * \returns A pointer to the queried tag's value
+ * \retval NULL No matching tag is found
+ *
+ * \note Use theora_comment_query_count() to get the legal range for the
+ * count parameter.
+ **/
+
+extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
+
+/** Look up the number of instances of a tag.
+ *  \param tc An initialized theora_comment structure
+ *  \param tag The tag to look up
+ *  \returns The number of instances of a particular tag.
+ *
+ *  Call this first when querying for a specific tag and then iterate
+ *  over the number of instances with separate calls to
+ *  theora_comment_query() to retrieve all instances in order.
+ **/
+extern int   theora_comment_query_count(theora_comment *tc, char *tag);
+
+/**
+ * Clear an allocated theora_comment struct so that it can be freed.
+ * \param tc An allocated theora_comment structure.
+ **/
+extern void  theora_comment_clear(theora_comment *tc);
+
+/**Encoder control function.
+ * This is used to provide advanced control of the encoding process.
+ * \param th     A #theora_state handle.
+ * \param req    The control code to process.
+ *                See \ref encctlcodes_old "the list of available
+ *                 control codes" for details.
+ * \param buf    The parameters for this control code.
+ * \param buf_sz The size of the parameter buffer.*/
+extern int theora_control(theora_state *th,int req,void *buf,size_t buf_sz);
+
+/* @} */ /* end oldfuncs doxygen group */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _O_THEORA_H_ */

+ 329 - 0
modules/theoraplayer/native/theora/include/theora/theoradec.h

@@ -0,0 +1,329 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\file
+ * The <tt>libtheoradec</tt> C decoding API.*/
+
+#if !defined(_O_THEORA_THEORADEC_H_)
+# define _O_THEORA_THEORADEC_H_ (1)
+# include <stddef.h>
+# include <ogg/ogg.h>
+# include "codec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name th_decode_ctl() codes
+ * \anchor decctlcodes
+ * These are the available request codes for th_decode_ctl().
+ * By convention, these are odd, to distinguish them from the
+ *  \ref encctlcodes "encoder control codes".
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+/*@{*/
+/**Gets the maximum post-processing level.
+ * The decoder supports a post-processing filter that can improve
+ * the appearance of the decoded images. This returns the highest
+ * level setting for this post-processor, corresponding to maximum
+ * improvement and computational expense.
+ *
+ * \param[out] _buf int: The maximum post-processing level.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_DECCTL_GET_PPLEVEL_MAX (1)
+/**Sets the post-processing level.
+ * By default, post-processing is disabled.
+ *
+ * Sets the level of post-processing to use when decoding the
+ * compressed stream. This must be a value between zero (off)
+ * and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
+ *
+ * \param[in] _buf int: The new post-processing level.
+ *                      0 to disable; larger values use more CPU.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                     post-processing level is out of bounds.
+ *                    The maximum post-processing level may be
+ *                     implementation-specific, and can be obtained via
+ *                     #TH_DECCTL_GET_PPLEVEL_MAX.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_DECCTL_SET_PPLEVEL (3)
+/**Sets the granule position.
+ * Call this after a seek, before decoding the first frame, to ensure that the
+ *  proper granule position is returned for all subsequent frames.
+ * If you track timestamps yourself and do not use the granule position
+ *  returned by the decoder, then you need not call this function.
+ *
+ * \param[in] _buf <tt>ogg_int64_t</tt>: The granule position of the next
+ *                  frame.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not <tt>sizeof(ogg_int64_t)</tt>, or the
+ *                     granule position is negative.*/
+#define TH_DECCTL_SET_GRANPOS (5)
+/**Sets the striped decode callback function.
+ * If set, this function will be called as each piece of a frame is fully
+ *  decoded in th_decode_packetin().
+ * You can pass in a #th_stripe_callback with
+ *  th_stripe_callback#stripe_decoded set to <tt>NULL</tt> to disable the
+ *  callbacks at any point.
+ * Enabling striped decode does not prevent you from calling
+ *  th_decode_ycbcr_out() after the frame is fully decoded.
+ *
+ * \param[in]  _buf #th_stripe_callback: The callback parameters.
+ * \retval TH_EFAULT  \a _dec_ctx or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL  \a _buf_sz is not
+ *                     <tt>sizeof(th_stripe_callback)</tt>.*/
+#define TH_DECCTL_SET_STRIPE_CB (7)
+
+/**Enables telemetry and sets the macroblock display mode */
+#define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
+/**Enables telemetry and sets the motion vector display mode */
+#define TH_DECCTL_SET_TELEMETRY_MV (11)
+/**Enables telemetry and sets the adaptive quantization display mode */
+#define TH_DECCTL_SET_TELEMETRY_QI (13)
+/**Enables telemetry and sets the bitstream breakdown visualization mode */
+#define TH_DECCTL_SET_TELEMETRY_BITS (15)
+/*@}*/
+
+
+
+/**A callback function for striped decode.
+ * This is a function pointer to an application-provided function that will be
+ *  called each time a section of the image is fully decoded in
+ *  th_decode_packetin().
+ * This allows the application to process the section immediately, while it is
+ *  still in cache.
+ * Note that the frame is decoded bottom to top, so \a _yfrag0 will steadily
+ *  decrease with each call until it reaches 0, at which point the full frame
+ *  is decoded.
+ * The number of fragment rows made available in each call depends on the pixel
+ *  format and the number of post-processing filters enabled, and may not even
+ *  be constant for the entire frame.
+ * If a non-<tt>NULL</tt> \a _granpos pointer is passed to
+ *  th_decode_packetin(), the granule position for the frame will be stored
+ *  in it before the first callback is made.
+ * If an entire frame is dropped (a 0-byte packet), then no callbacks will be
+ *  made at all for that frame.
+ * \param _ctx       An application-provided context pointer.
+ * \param _buf       The image buffer for the decoded frame.
+ * \param _yfrag0    The Y coordinate of the first row of 8x8 fragments
+ *                    decoded.
+ *                   Multiply this by 8 to obtain the pixel row number in the
+ *                    luma plane.
+ *                   If the chroma planes are subsampled in the Y direction,
+ *                    this will always be divisible by two.
+ * \param _yfrag_end The Y coordinate of the first row of 8x8 fragments past
+ *                    the newly decoded section.
+ *                   If the chroma planes are subsampled in the Y direction,
+ *                    this will always be divisible by two.
+ *                   I.e., this section contains fragment rows
+ *                    <tt>\a _yfrag0 ...\a _yfrag_end -1</tt>.*/
+typedef void (*th_stripe_decoded_func)(void *_ctx,th_ycbcr_buffer _buf,
+ int _yfrag0,int _yfrag_end);
+
+/**The striped decode callback data to pass to #TH_DECCTL_SET_STRIPE_CB.*/
+typedef struct{
+  /**An application-provided context pointer.
+   * This will be passed back verbatim to the application.*/
+  void                   *ctx;
+  /**The callback function pointer.*/
+  th_stripe_decoded_func  stripe_decoded;
+}th_stripe_callback;
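
A minimal sketch (not part of this header) of registering a striped decode callback; process_stripe is a hypothetical application function.

#include <theora/theoradec.h>

/* Hypothetical application callback: consume fragment rows
   yfrag0..yfrag_end-1 of the decoded frame while they are still in cache. */
static void process_stripe(void *ctx, th_ycbcr_buffer buf,
 int yfrag0, int yfrag_end){
  /* Luma rows covered by this stripe: yfrag0*8 .. yfrag_end*8-1. */
  (void)ctx; (void)buf; (void)yfrag0; (void)yfrag_end;
}

static int install_stripe_cb(th_dec_ctx *dec, void *app_ctx){
  th_stripe_callback cb;
  cb.ctx = app_ctx;
  cb.stripe_decoded = process_stripe;
  return th_decode_ctl(dec, TH_DECCTL_SET_STRIPE_CB, &cb, sizeof(cb));
}
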
+
+
+
+/**\name Decoder state
+   The following data structures are opaque, and their contents are not
+    publicly defined by this API.
+   Referring to their internals directly is unsupported, and may break without
+    warning.*/
+/*@{*/
+/**The decoder context.*/
+typedef struct th_dec_ctx    th_dec_ctx;
+/**Setup information.
+   This contains auxiliary information (Huffman tables and quantization
+    parameters) decoded from the setup header by th_decode_headerin() to be
+    passed to th_decode_alloc().
+   It can be re-used to initialize any number of decoders, and can be freed
+    via th_setup_free() at any time.*/
+typedef struct th_setup_info th_setup_info;
+/*@}*/
+
+
+
+/**\defgroup decfuncs Functions for Decoding*/
+/*@{*/
+/**\name Functions for decoding
+ * You must link to <tt>libtheoradec</tt> if you use any of the
+ * functions in this section.
+ *
+ * The functions are listed in the order they are used in a typical decode.
+ * The basic steps are:
+ * - Parse the header packets by repeatedly calling th_decode_headerin().
+ * - Allocate a #th_dec_ctx handle with th_decode_alloc().
+ * - Call th_setup_free() to free any memory used for codec setup
+ *    information.
+ * - Perform any additional decoder configuration with th_decode_ctl().
+ * - For each video data packet:
+ *   - Submit the packet to the decoder via th_decode_packetin().
+ *   - Retrieve the uncompressed video data via th_decode_ycbcr_out().
+ * - Call th_decode_free() to release all decoder memory.*/
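
A sketch of the steps above (not part of this header): Ogg demuxing is omitted, next_packet() is a hypothetical helper that fills an ogg_packet from the Theora logical stream, and error handling is kept minimal.

#include <theora/theoradec.h>

extern int next_packet(ogg_packet *op);  /* hypothetical: fills op from the
                                             Theora logical stream, 0 at EOS */

static void decode_stream(void){
  th_info ti;
  th_comment tc;
  th_setup_info *ts = NULL;
  th_dec_ctx *dec = NULL;
  ogg_packet op;
  int ret;
  th_info_init(&ti);
  th_comment_init(&tc);
  /* Parse the header packets. */
  for(;;){
    if(!next_packet(&op)) goto cleanup;   /* stream truncated */
    ret = th_decode_headerin(&ti, &tc, &ts, &op);
    if(ret == 0) break;                   /* op is the first video data packet */
    if(ret < 0) goto cleanup;             /* not Theora, or a corrupt header */
  }
  /* Allocate the decoder, then release the setup information. */
  dec = th_decode_alloc(&ti, ts);
  th_setup_free(ts);
  ts = NULL;
  if(dec == NULL) goto cleanup;
  /* Feed video packets and pull out decoded frames. */
  do{
    ogg_int64_t granpos;
    if(th_decode_packetin(dec, &op, &granpos) == 0){
      th_ycbcr_buffer ycbcr;
      th_decode_ycbcr_out(dec, ycbcr);
      /* Display or store ycbcr here. A TH_DUPFRAME return above means the
         previous frame repeats and there is nothing new to fetch. */
    }
  }while(next_packet(&op));
cleanup:
  if(dec != NULL) th_decode_free(dec);
  if(ts != NULL) th_setup_free(ts);
  th_info_clear(&ti);
  th_comment_clear(&tc);
}
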
+/*@{*/
+/**Decodes the header packets of a Theora stream.
+ * This should be called on the initial packets of the stream, in succession,
+ *  until it returns <tt>0</tt>, indicating that all headers have been
+ *  processed, or an error is encountered.
+ * At least three header packets are required, and additional optional header
+ *  packets may follow.
+ * This can be used on the first packet of any logical stream to determine if
+ *  that stream is a Theora stream.
+ * \param _info  A #th_info structure to fill in.
+ *               This must have been previously initialized with
+ *                th_info_init().
+ *               The application may immediately begin using the contents of
+ *                this structure after the first header is decoded, though it
+ *                must continue to be passed in on all subsequent calls.
+ * \param _tc    A #th_comment structure to fill in.
+ *               The application may immediately begin using the contents of
+ *                this structure after the second header is decoded, though it
+ *                must continue to be passed in on all subsequent calls.
+ * \param _setup Returns a pointer to additional, private setup information
+ *                needed by the decoder.
+ *               The contents of this pointer must be initialized to
+ *                <tt>NULL</tt> on the first call, and the returned value must
+ *                continue to be passed in on all subsequent calls.
+ * \param _op    An <tt>ogg_packet</tt> structure which contains one of the
+ *                initial packets of an Ogg logical stream.
+ * \return A positive value indicates that a Theora header was successfully
+ *          processed.
+ * \retval 0             The first video data packet was encountered after all
+ *                        required header packets were parsed.
+ *                       The packet just passed in on this call should be saved
+ *                        and fed to th_decode_packetin() to begin decoding
+ *                        video data.
+ * \retval TH_EFAULT     One of \a _info, \a _tc, or \a _setup was
+ *                        <tt>NULL</tt>.
+ * \retval TH_EBADHEADER \a _op was <tt>NULL</tt>, the packet was not the next
+ *                        header packet in the expected sequence, or the format
+ *                        of the header data was invalid.
+ * \retval TH_EVERSION   The packet data was a Theora info header, but for a
+ *                        bitstream version not decodable with this version of
+ *                        <tt>libtheoradec</tt>.
+ * \retval TH_ENOTFORMAT The packet was not a Theora header.
+ */
+extern int th_decode_headerin(th_info *_info,th_comment *_tc,
+ th_setup_info **_setup,ogg_packet *_op);
+/**Allocates a decoder instance.
+ *
+ * <b>Security Warning:</b> The Theora format supports very large frame sizes,
+ *  potentially even larger than the address space of a 32-bit machine, and
+ *  creating a decoder context allocates the space for several frames of data.
+ * If the allocation fails here, your program will crash, possibly at some
+ *  future point because the OS kernel returned a valid memory range and will
+ *  only fail when it tries to map the pages in it the first time they are
+ *  used.
+ * Even if it succeeds, you may experience a denial of service if the frame
+ *  size is large enough to cause excessive paging.
+ * If you are integrating libtheora in a larger application where such things
+ *  are undesirable, it is highly recommended that you check the frame size in
+ *  \a _info before calling this function and refuse to decode streams where it
+ *  is larger than some reasonable maximum.
+ * libtheora will not check this for you, because there may be machines that
+ *  can handle such streams and applications that wish to.
+ * \param _info  A #th_info struct filled via th_decode_headerin().
+ * \param _setup A #th_setup_info handle returned via
+ *                th_decode_headerin().
+ * \return The initialized #th_dec_ctx handle.
+ * \retval NULL If the decoding parameters were invalid.*/
+extern th_dec_ctx *th_decode_alloc(const th_info *_info,
+ const th_setup_info *_setup);
+/**Releases all storage used for the decoder setup information.
+ * This should be called after you no longer want to create any decoders for
+ *  a stream whose headers you have parsed with th_decode_headerin().
+ * \param _setup The setup information to free.
+ *               This can safely be <tt>NULL</tt>.*/
+extern void th_setup_free(th_setup_info *_setup);
+/**Decoder control function.
+ * This is used to provide advanced control of the decoding process.
+ * \param _dec    A #th_dec_ctx handle.
+ * \param _req    The control code to process.
+ *                See \ref decctlcodes "the list of available control codes"
+ *                 for details.
+ * \param _buf    The parameters for this control code.
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref decctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
+extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
+ size_t _buf_sz);
+/**Submits a packet containing encoded video data to the decoder.
+ * \param _dec     A #th_dec_ctx handle.
+ * \param _op      An <tt>ogg_packet</tt> containing encoded video data.
+ * \param _granpos Returns the granule position of the decoded packet.
+ *                 If non-<tt>NULL</tt>, the granule position for this specific
+ *                  packet is stored in this location.
+ *                 This is computed incrementally from previously decoded
+ *                  packets.
+ *                 After a seek, the correct granule position must be set via
+ *                  #TH_DECCTL_SET_GRANPOS for this to work properly.
+ * \retval 0             Success.
+ *                       A new decoded frame can be retrieved by calling
+ *                        th_decode_ycbcr_out().
+ * \retval TH_DUPFRAME   The packet represented a dropped frame (either a
+ *                        0-byte frame or an INTER frame with no coded blocks).
+ *                       The player can skip the call to th_decode_ycbcr_out(),
+ *                        as the contents of the decoded frame buffer have not
+ *                        changed.
+ * \retval TH_EFAULT     \a _dec or \a _op was <tt>NULL</tt>.
+ * \retval TH_EBADPACKET \a _op does not contain encoded video data.
+ * \retval TH_EIMPL      The video data uses bitstream features which this
+ *                        library does not support.*/
+extern int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
+ ogg_int64_t *_granpos);
+/**Outputs the next available frame of decoded Y'CbCr data.
+ * If a striped decode callback has been set with #TH_DECCTL_SET_STRIPE_CB,
+ *  then the application does not need to call this function.
+ * \param _dec   A #th_dec_ctx handle.
+ * \param _ycbcr A video buffer structure to fill in.
+ *               <tt>libtheoradec</tt> will fill in all the members of this
+ *                structure, including the pointers to the uncompressed video
+ *                data.
+ *               The memory for this video data is owned by
+ *                <tt>libtheoradec</tt>.
+ *               It may be freed or overwritten without notification when
+ *                subsequent frames are decoded.
+ * \retval 0 Success
+ * \retval TH_EFAULT     \a _dec or \a _ycbcr was <tt>NULL</tt>.
+ */
+extern int th_decode_ycbcr_out(th_dec_ctx *_dec,
+ th_ycbcr_buffer _ycbcr);
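
As an illustration (a sketch, not part of this header), the luma plane of the buffer filled in above can be copied row by row into application-owned memory, assuming the th_img_plane fields (width, height, stride, data) declared in codec.h.

#include <string.h>
#include <theora/theoradec.h>

/* Copy the decoded luma plane into a tightly packed buffer owned by the
   caller; dst must hold ycbcr[0].width*ycbcr[0].height bytes. */
static void copy_luma(unsigned char *dst, const th_ycbcr_buffer ycbcr){
  int row;
  for(row = 0; row < ycbcr[0].height; row++){
    memcpy(dst + row*ycbcr[0].width,
     ycbcr[0].data + row*ycbcr[0].stride, ycbcr[0].width);
  }
}
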
+/**Frees an allocated decoder instance.
+ * \param _dec A #th_dec_ctx handle.*/
+extern void th_decode_free(th_dec_ctx *_dec);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 548 - 0
modules/theoraplayer/native/theora/include/theora/theoraenc.h

@@ -0,0 +1,548 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\file
+ * The <tt>libtheoraenc</tt> C encoding API.*/
+
+#if !defined(_O_THEORA_THEORAENC_H_)
+# define _O_THEORA_THEORAENC_H_ (1)
+# include <stddef.h>
+# include <ogg/ogg.h>
+# include "codec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name th_encode_ctl() codes
+ * \anchor encctlcodes
+ * These are the available request codes for th_encode_ctl().
+ * By convention, these are even, to distinguish them from the
+ *  \ref decctlcodes "decoder control codes".
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+/*@{*/
+/**Sets the Huffman tables to use.
+ * The tables are copied, not stored by reference, so they can be freed after
+ *  this call.
+ * <tt>NULL</tt> may be specified to revert to the default tables.
+ *
+ * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun or one or more of the given
+ *                     tables is not full or prefix-free, \a _buf is
+ *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
+ *                     non-<tt>NULL</tt> and \a _buf_sz is not
+ *                     <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_HUFFMAN_CODES (0)
+/**Sets the quantization parameters to use.
+ * The parameters are copied, not stored by reference, so they can be freed
+ *  after this call.
+ * <tt>NULL</tt> may be specified to revert to the default parameters.
+ *
+ * \param[in] _buf #th_quant_info
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun, \a _buf is
+ *                    <tt>NULL</tt> and \a _buf_sz is not zero,
+ *                    or \a _buf is non-<tt>NULL</tt> and
+ *                    \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUANT_PARAMS (2)
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
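
For example (a sketch, not part of this header), forcing a keyframe at least every 64 frames and reading back the value the encoder actually accepted.

#include <theora/theoraenc.h>

static ogg_uint32_t force_keyframe_freq(th_enc_ctx *enc){
  ogg_uint32_t freq = 64;
  /* On success the encoder writes the value it actually set back into freq. */
  th_encode_ctl(enc, TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
   &freq, sizeof(freq));
  return freq;
}
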
+/**Disables any encoder features that would prevent lossless transcoding back
+ *  to VP3.
+ * This primarily means disabling block-adaptive quantization and always coding
+ *  all four luma blocks in a macro block when 4MV is used.
+ * It also includes using the VP3 quantization tables and Huffman codes; if you
+ *  set them explicitly after calling this function, the resulting stream will
+ *  not be VP3-compatible.
+ * If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
+ *  material, or when using a picture region smaller than the full frame (e.g.
+ *  a non-multiple-of-16 width or height), then non-VP3 bitstream features will
+ *  still be disabled, but the stream will still not be VP3-compatible, as VP3
+ *  was not capable of encoding such formats.
+ * If you call this after encoding has already begun, then the quantization
+ *  tables and codebooks cannot be changed, but the frame-level features will
+ *  be enabled or disabled as requested.
+ *
+ * \param[in]  _buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
+ *                   or 0 to disable it (the default).
+ * \param[out] _buf <tt>int</tt>: 1 if all bitstream features required for
+ *                   VP3-compatibility could be set, and 0 otherwise.
+ *                  The latter will be returned if the pixel format is not
+ *                   4:2:0, the picture region is smaller than the full frame,
+ *                   or if encoding has begun, preventing the quantization
+ *                   tables and codebooks from being set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
+/**Gets the maximum speed level.
+ * Higher speed levels favor quicker encoding over better quality per bit.
+ * Depending on the encoding mode, and the internal algorithms used, quality
+ *  may actually improve, but in this case bitrate will also likely increase.
+ * In any case, overall rate/distortion performance will probably decrease.
+ * The maximum value, and the meaning of each value, may change depending on
+ *  the current encoding mode (VBR vs. constant quality, etc.).
+ *
+ * \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
+/**Sets the speed level.
+ * The current speed level may be retrieved using #TH_ENCCTL_GET_SPLEVEL.
+ *
+ * \param[in] _buf <tt>int</tt>: The new encoding speed level.
+ *                 0 is slowest, larger values use less CPU.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    encoding speed level is out of bounds.
+ *                   The maximum encoding speed level may be
+ *                    implementation- and encoding mode-specific, and can be
+ *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_SPLEVEL (14)
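
A small sketch (not part of this header) that trades quality for encoding speed by selecting the fastest level the implementation reports.

#include <theora/theoraenc.h>

static int use_fastest_speed_level(th_enc_ctx *enc){
  int splevel_max;
  int ret;
  ret = th_encode_ctl(enc, TH_ENCCTL_GET_SPLEVEL_MAX,
   &splevel_max, sizeof(splevel_max));
  if(ret < 0) return ret;  /* e.g. TH_EIMPL in the current encoding mode */
  return th_encode_ctl(enc, TH_ENCCTL_SET_SPLEVEL,
   &splevel_max, sizeof(splevel_max));
}
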
+/**Gets the current speed level.
+ * The default speed level may vary according to encoder implementation, but if
+ *  this control code is not supported (it returns #TH_EIMPL), the default may
+ *  be assumed to be the slowest available speed (0).
+ * The maximum encoding speed level may be implementation- and encoding
+ *  mode-specific, and can be obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ *
+ * \param[out] _buf <tt>int</tt>: The current encoding speed level.
+ *                  0 is slowest, larger values use less CPU.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL (16)
+/**Sets the number of duplicates of the next frame to produce.
+ * Although libtheora can encode duplicate frames very cheaply, it costs some
+ *  amount of CPU to detect them, and a run of duplicates cannot span a
+ *  keyframe boundary.
+ * This control code tells the encoder to produce the specified number of extra
+ *  duplicates of the next frame.
+ * This allows the encoder to make smarter keyframe placement decisions and
+ *  rate control decisions, and reduces CPU usage as well, when compared to
+ *  just submitting the same frame for encoding multiple times.
+ * This setting only applies to the next frame submitted for encoding.
+ * You MUST call th_encode_packetout() repeatedly until it returns 0, or the
+ *  extra duplicate frames will be lost.
+ *
+ * \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
+ *                 If this is negative or zero, no duplicates will be produced.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    number of duplicates is greater than or equal to the
+ *                    maximum keyframe interval.
+ *                   In the latter case, NO duplicate frames will be produced.
+ *                   You must ensure that the maximum keyframe interval is set
+ *                    larger than the maximum number of duplicates you will
+ *                    ever wish to insert prior to encoding.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_DUP_COUNT (18)
+/**Modifies the default bitrate management behavior.
+ * Use to allow or disallow frame dropping, and to enable or disable capping
+ *  bit reservoir overflows and underflows.
+ * See \ref ratectlflags "the list of available flags".
+ * The flags are set by default to
+ *  <tt>#TH_RATECTL_DROP_FRAMES|#TH_RATECTL_CAP_OVERFLOW</tt>.
+ *
+ * \param[in] _buf <tt>int</tt>: Any combination of
+ *                  \ref ratectlflags "the available flags":
+ *                 - #TH_RATECTL_DROP_FRAMES: Enable frame dropping.
+ *                 - #TH_RATECTL_CAP_OVERFLOW: Don't bank excess bits for later
+ *                    use.
+ *                 - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
+ *                    later.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
+ *                    is not enabled.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_FLAGS (20)
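
For example (a sketch, not part of this header, and only meaningful once a target bitrate is in effect), an application that never wants frames dropped but keeps the default overflow cap could set the flags like this.

#include <theora/theoraenc.h>

static int disable_frame_dropping(th_enc_ctx *enc){
  /* Keep the default overflow cap but turn off frame dropping. */
  int flags = TH_RATECTL_CAP_OVERFLOW;
  return th_encode_ctl(enc, TH_ENCCTL_SET_RATE_FLAGS, &flags, sizeof(flags));
}
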
+/**Sets the size of the bitrate management bit reservoir as a function
+ *  of number of frames.
+ * The reservoir size affects how quickly bitrate management reacts to
+ *  instantaneous changes in the video complexity.
+ * Larger reservoirs react more slowly, and provide better overall quality, but
+ *  require more buffering by a client, adding more latency to live streams.
+ * By default, libtheora sets the reservoir to the maximum distance between
+ *  keyframes, subject to a minimum and maximum limit.
+ * This call may be used to increase or decrease the reservoir, increasing or
+ *  decreasing the allowed temporary variance in bitrate.
+ * An implementation may impose some limits on the size of a reservoir it can
+ *  handle, in which case the actual reservoir size may not be exactly what was
+ *  requested.
+ * The actual value set will be returned.
+ *
+ * \param[in]  _buf <tt>int</tt>: Requested size of the reservoir measured in
+ *                   frames.
+ * \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
+ *                    is not enabled.  The buffer has an implementation-defined
+ *                    minimum and maximum size, and the value in _buf will be
+ *                    adjusted to match the actual value set.
+ * \retval TH_EIMPL   Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_RATE_BUFFER (22)
+/**Enable pass 1 of two-pass encoding mode and retrieve the first pass metrics.
+ * Pass 1 mode must be enabled before the first frame is encoded, and a target
+ *  bitrate must have already been specified to the encoder.
+ * Although this does not have to be the exact rate that will be used in the
+ *  second pass, closer values may produce better results.
+ * The first call returns the size of the two-pass header data, along with some
+ *  placeholder content, and sets the encoder into pass 1 mode implicitly.
+ * Then, a subsequent call must be made after each call to
+ *  th_encode_ycbcr_in() to retrieve the metrics for that frame.
+ * An additional, final call must be made to retrieve the summary data,
+ *  containing such information as the total number of frames, etc.
+ * This must be stored in place of the placeholder data that was returned
+ *  in the first call, before the frame metrics data.
+ * All of this data must be presented back to the encoder during pass 2 using
+ *  #TH_ENCCTL_2PASS_IN.
+ *
+ * \param[out] <tt>char *</tt>_buf: Returns a pointer to internal storage
+ *              containing the two pass metrics data.
+ *             This storage is only valid until the next call, or until the
+ *              encoder context is freed, and must be copied by the
+ *              application.
+ * \retval >=0       The number of bytes of metric data available in the
+ *                    returned buffer.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
+ *                    bitrate has been set, or the first call was made after
+ *                    the first frame was submitted for encoding.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_OUT (24)
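
A pass-1 sketch (not part of this header): a target bitrate is assumed to be configured already, append_metrics() is a hypothetical helper that stores the data, and the placeholder returned by the very first call is later overwritten with the summary data exactly as described above.

#include <theora/theoraenc.h>

extern void append_metrics(const char *data, int len);  /* hypothetical */

/* Fetch the pass-1 metrics produced by the most recent frame (or, on the very
   first call, the placeholder header) and hand them to the application.
   Returns the number of bytes retrieved, or a negative error code. */
static int collect_pass1_metrics(th_enc_ctx *enc){
  char *buf;
  int bytes;
  bytes = th_encode_ctl(enc, TH_ENCCTL_2PASS_OUT, &buf, sizeof(buf));
  if(bytes > 0) append_metrics(buf, bytes);  /* copy now: buf is only valid
                                                until the next library call */
  return bytes;
}
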
+/**Submits two-pass encoding metric data collected during the first encoding
+ *  pass to the second pass.
+ * The first call must be made before the first frame is encoded, and a target
+ *  bitrate must have already been specified to the encoder.
+ * It sets the encoder to pass 2 mode implicitly; this cannot be disabled.
+ * The encoder may require reading data from some or all of the frames in
+ *  advance, depending on, e.g., the reservoir size used in the second pass.
+ * You must call this function repeatedly before each frame to provide data
+ *  until either a) it fails to consume all of the data presented or b) all of
+ *  the pass 1 data has been consumed.
+ * In the first case, you must save the remaining data to be presented after
+ *  the next frame.
+ * You can call this function with a NULL argument to get an upper bound on
+ *  the number of bytes that will be required before the next frame.
+ *
+ * When pass 2 is first enabled, the default bit reservoir is set to the entire
+ *  file; this gives maximum flexibility but can lead to very high peak rates.
+ * You can subsequently set it to another value with #TH_ENCCTL_SET_RATE_BUFFER
+ *  (e.g., to set it to the keyframe interval for non-live streaming), however,
+ *  you may then need to provide more data before the next frame.
+ *
+ * \param[in] _buf <tt>char[]</tt>: A buffer containing the data returned by
+ *                  #TH_ENCCTL_2PASS_OUT in pass 1.
+ *                 You may pass <tt>NULL</tt> for \a _buf to return an upper
+ *                  bound on the number of additional bytes needed before the
+ *                  next frame.
+ *                 The summary data returned at the end of pass 1 must be at
+ *                  the head of the buffer on the first call with a
+ *                  non-<tt>NULL</tt> \a _buf, and the placeholder data
+ *                  returned at the start of pass 1 should be omitted.
+ *                 After each call you should advance this buffer by the number
+ *                  of bytes consumed.
+ * \retval >0            The number of bytes of metric data required/consumed.
+ * \retval 0             No more data is required before the next frame.
+ * \retval TH_EFAULT     \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL     No target bitrate has been set, or the first call was
+ *                        made after the first frame was submitted for
+ *                        encoding.
+ * \retval TH_ENOTFORMAT The data did not appear to be pass 1 from a compatible
+ *                        implementation of this library.
+ * \retval TH_EBADHEADER The data was invalid; this may be returned when
+ *                        attempting to read an aborted pass 1 file that still
+ *                        has the placeholder data in place of the summary
+ *                        data.
+ * \retval TH_EIMPL       Not supported by this implementation.*/
+#define TH_ENCCTL_2PASS_IN (26)
+/**Sets the current encoding quality.
+ * This is only valid so long as no bitrate has been specified, either through
+ *  the #th_info struct used to initialize the encoder or through
+ *  #TH_ENCCTL_SET_BITRATE (this restriction may be relaxed in a future
+ *  version).
+ * If it is set before the headers are emitted, the target quality encoded in
+ *  them will be updated.
+ *
+ * \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
+ *                  inclusive.
+ * \retval 0             Success.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     A target bitrate has already been specified, or the
+ *                        quality index was not in the range 0...63.
+ * \retval TH_EIMPL       Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUALITY (28)
+/**Sets the current encoding bitrate.
+ * Once a bitrate is set, the encoder must use a rate-controlled mode for all
+ *  future frames (this restriction may be relaxed in a future version).
+ * If it is set before the headers are emitted, the target bitrate encoded in
+ *  them will be updated.
+ * Due to the buffer delay, the exact bitrate of each section of the encode is
+ *  not guaranteed.
+ * The encoder may have already used more bits than allowed for the frames it
+ *  has encoded, expecting to make them up in future frames, or it may have
+ *  used fewer, holding the excess in reserve.
+ * The exact transition between the two bitrates is not well-defined by this
+ *  API, but may be affected by flags set with #TH_ENCCTL_SET_RATE_FLAGS.
+ * After a number of frames equal to the buffer delay, one may expect further
+ *  output to average at the target bitrate.
+ *
+ * \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
+ * \retval 0             Success.
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     The target bitrate was not positive.
+ *                       A future version of this library may allow passing 0
+ *                        to disable rate-controlled mode and return to a
+ *                        quality-based mode, in which case this function will
+ *                        not return an error for that value.
+ * \retval TH_EIMPL      Not supported by this implementation.*/
+#define TH_ENCCTL_SET_BITRATE (30)
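
As a sketch (not part of this header), retargeting the bitrate mid-stream, for example in response to changing network conditions.

#include <theora/theoraenc.h>

static int retarget_bitrate(th_enc_ctx *enc, long bits_per_second){
  /* Only meaningful once the encoder is already in a rate-controlled mode. */
  return th_encode_ctl(enc, TH_ENCCTL_SET_BITRATE,
   &bits_per_second, sizeof(bits_per_second));
}
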
+/**Sets the configuration to be compatible with that from the given setup
+ *  header.
+ * This sets the Huffman codebooks and quantization parameters to match those
+ *  found in the given setup header.
+ * This guarantees that packets encoded by this encoder will be decodable using
+ *  a decoder configured with the passed-in setup header.
+ * It does <em>not</em> guarantee that th_encode_flushheader() will produce a
+ *  bit-identical setup header, only that they will be compatible.
+ * If you need a bit-identical setup header, then use the one you passed into
+ *  this command, and not the one returned by th_encode_flushheader().
+ *
+ * This also does <em>not</em> enable or disable VP3 compatibility; that is not
+ *  signaled in the setup header (or anywhere else in the encoded stream), and
+ *  is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
+ * If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
+ *  and quantization parameters to match the given setup header, you should
+ *  enable VP3 compatibility before invoking this command, otherwise the
+ *  codebooks and quantization parameters will be reset to the VP3 defaults.
+ *
+ * The current encoder does not support Huffman codebooks which do not contain
+ *  codewords for all 32 tokens.
+ * Such codebooks are legal, according to the specification, but cannot be
+ *  configured with this function.
+ *
+ * \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
+ *                                            the configuration from.
+ *                                           This should be the original,
+ *                                            undecoded setup header packet,
+ *                                            and <em>not</em> a #th_setup_info
+ *                                            structure filled in by
+ *                                            th_decode_headerin().
+ * \retval TH_EFAULT     \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL     Encoding has already begun, so the codebooks and
+ *                        quantization parameters cannot be changed, or the
+ *                        data in the setup header was not supported by this
+ *                        encoder.
+ * \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
+ * \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
+
+/*@}*/
+
+
+/**\name TH_ENCCTL_SET_RATE_FLAGS flags
+ * \anchor ratectlflags
+ * These are the flags available for use with #TH_ENCCTL_SET_RATE_FLAGS.*/
+/*@{*/
+/**Drop frames to keep within bitrate buffer constraints.
+ * This can have a severe impact on quality, but is the only way to ensure that
+ *  bitrate targets are met at low rates during sudden bursts of activity.
+ * It is enabled by default.*/
+#define TH_RATECTL_DROP_FRAMES   (0x1)
+/**Ignore bitrate buffer overflows.
+ * If the encoder uses so few bits that the reservoir of available bits
+ *  overflows, ignore the excess.
+ * The encoder will not try to use these extra bits in future frames.
+ * At high rates this may cause the result to be undersized, but allows a
+ *  client to play the stream using a finite buffer; it should normally be
+ *  enabled, which is the default.*/
+#define TH_RATECTL_CAP_OVERFLOW  (0x2)
+/**Ignore bitrate buffer underflows.
+ * If the encoder uses so many bits that the reservoir of available bits
+ *  underflows, ignore the deficit.
+ * The encoder will not try to make up these extra bits in future frames.
+ * At low rates this may cause the result to be oversized; it should normally
+ *  be disabled, which is the default.*/
+#define TH_RATECTL_CAP_UNDERFLOW (0x4)
+/*@}*/
+
+
+
+/**The quantization parameters used by VP3.*/
+extern const th_quant_info TH_VP31_QUANT_INFO;
+
+/**The Huffman tables used by VP3.*/
+extern const th_huff_code
+ TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+
+
+
+/**\name Encoder state
+   The following data structure is opaque, and its contents are not publicly
+    defined by this API.
+   Referring to its internals directly is unsupported, and may break without
+    warning.*/
+/*@{*/
+/**The encoder context.*/
+typedef struct th_enc_ctx    th_enc_ctx;
+/*@}*/
+
+
+
+/**\defgroup encfuncs Functions for Encoding*/
+/*@{*/
+/**\name Functions for encoding
+ * You must link to <tt>libtheoraenc</tt> and <tt>libtheoradec</tt>
+ *  if you use any of the functions in this section.
+ *
+ * The functions are listed in the order they are used in a typical encode.
+ * The basic steps are:
+ * - Fill in a #th_info structure with details on the format of the video you
+ *    wish to encode.
+ * - Allocate a #th_enc_ctx handle with th_encode_alloc().
+ * - Perform any additional encoder configuration required with
+ *    th_encode_ctl().
+ * - Repeatedly call th_encode_flushheader() to retrieve all the header
+ *    packets.
+ * - For each uncompressed frame:
+ *   - Submit the uncompressed frame via th_encode_ycbcr_in()
+ *   - Repeatedly call th_encode_packetout() to retrieve any video
+ *      data packets that are ready.
+ * - Call th_encode_free() to release all encoder memory.*/
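
A sketch of the steps above (not part of this header): Ogg muxing and I/O are omitted, fetch_frame() and write_packet() are hypothetical application helpers, and the th_info fields (declared in codec.h) are filled with illustrative values only.

#include <theora/theoraenc.h>

extern int fetch_frame(th_ycbcr_buffer ycbcr);   /* hypothetical: fills the
                                                     plane descriptors, 0 at end */
extern void write_packet(const ogg_packet *op);  /* hypothetical muxer hook */

static void encode_stream(int pic_w, int pic_h){
  th_info ti;
  th_comment tc;
  th_enc_ctx *enc;
  ogg_packet op;
  th_ycbcr_buffer ycbcr;
  int last;
  /* Describe the video to be encoded; frame sizes are multiples of 16. */
  th_info_init(&ti);
  ti.frame_width = (pic_w + 15) & ~15;
  ti.frame_height = (pic_h + 15) & ~15;
  ti.pic_width = pic_w;
  ti.pic_height = pic_h;
  ti.pic_x = 0;
  ti.pic_y = 0;
  ti.fps_numerator = 30;
  ti.fps_denominator = 1;
  ti.pixel_fmt = TH_PF_420;
  ti.target_bitrate = 0;  /* 0: use quality-based rate control instead */
  ti.quality = 48;
  /* Allocate the encoder. */
  enc = th_encode_alloc(&ti);
  th_info_clear(&ti);
  if(enc == NULL) return;
  /* Flush all of the header packets before any video data. */
  th_comment_init(&tc);
  while(th_encode_flushheader(enc, &tc, &op) > 0) write_packet(&op);
  th_comment_clear(&tc);
  /* Submit frames and flush the packets they produce. */
  last = !fetch_frame(ycbcr);
  while(!last){
    th_encode_ycbcr_in(enc, ycbcr);
    last = !fetch_frame(ycbcr);  /* was the submitted frame the final one? */
    while(th_encode_packetout(enc, last, &op) > 0) write_packet(&op);
  }
  th_encode_free(enc);
}
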
+/*@{*/
+/**Allocates an encoder instance.
+ * \param _info A #th_info struct filled with the desired encoding parameters.
+ * \return The initialized #th_enc_ctx handle.
+ * \retval NULL If the encoding parameters were invalid.*/
+extern th_enc_ctx *th_encode_alloc(const th_info *_info);
+/**Encoder control function.
+ * This is used to provide advanced control of the encoding process.
+ * \param _enc    A #th_enc_ctx handle.
+ * \param _req    The control code to process.
+ *                See \ref encctlcodes "the list of available control codes"
+ *                 for details.
+ * \param _buf    The parameters for this control code.
+ * \param _buf_sz The size of the parameter buffer.
+ * \return Possible return values depend on the control code used.
+ *          See \ref encctlcodes "the list of control codes" for
+ *          specific values. Generally 0 indicates success.*/
+extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
+/**Outputs the next header packet.
+ * This should be called repeatedly after encoder initialization until it
+ *  returns 0 in order to get all of the header packets, in order, before
+ *  encoding actual video data.
+ * \param _enc      A #th_enc_ctx handle.
+ * \param _comments The metadata to place in the comment header, when it is
+ *                   encoded.
+ * \param _op       An <tt>ogg_packet</tt> structure to fill.
+ *                  All of the elements of this structure will be set,
+ *                   including a pointer to the header data.
+ *                  The memory for the header data is owned by
+ *                   <tt>libtheoraenc</tt>, and may be invalidated when the
+ *                   next encoder function is called.
+ * \return A positive value indicates that a header packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more header packets remain.
+ * \retval TH_EFAULT \a _enc, \a _comments, or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_flushheader(th_enc_ctx *_enc,
+ th_comment *_comments,ogg_packet *_op);
+/**Submits an uncompressed frame to the encoder.
+ * \param _enc   A #th_enc_ctx handle.
+ * \param _ycbcr A buffer of Y'CbCr data to encode.
+ *               If the width and height of the buffer matches the frame size
+ *                the encoder was initialized with, the encoder will only
+ *                reference the portion inside the picture region.
+ *               Any data outside this region will be ignored, and need not map
+ *                to a valid address.
+ *               Alternatively, you can pass a buffer equal to the size of the
+ *                picture region, if this is less than the full frame size.
+ *               When using subsampled chroma planes, odd picture sizes or odd
+ *                picture offsets may require an unexpected chroma plane size,
+ *                and their use is generally discouraged, as they will not be
+ *                well-supported by players and other media frameworks.
+ *               See Section 4.4 of
+ *                <a href="http://www.theora.org/doc/Theora.pdf">the Theora
+ *                specification</a> for details if you wish to use them anyway.
+ * \retval 0         Success.
+ * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
+ * \retval TH_EINVAL The buffer size matches neither the frame size nor the
+ *                    picture size the encoder was initialized with, or
+ *                    encoding has already completed.*/
+extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
+/**Retrieves encoded video data packets.
+ * This should be called repeatedly after each frame is submitted to flush any
+ *  encoded packets, until it returns 0.
+ * The encoder will not buffer these packets as subsequent frames are
+ *  compressed, so a failure to do so will result in lost video data.
+ * \note Currently the encoder operates in a one-frame-in, one-packet-out
+ *        manner.
+ *       However, this may be changed in the future.
+ * \param _enc  A #th_enc_ctx handle.
+ * \param _last Set this flag to a non-zero value if no more uncompressed
+ *               frames will be submitted.
+ *              This ensures that a proper EOS flag is set on the last packet.
+ * \param _op   An <tt>ogg_packet</tt> structure to fill.
+ *              All of the elements of this structure will be set, including a
+ *               pointer to the video data.
+ *              The memory for the video data is owned by
+ *               <tt>libtheoraenc</tt>, and may be invalidated when the next
+ *               encoder function is called.
+ * \return A positive value indicates that a video data packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more encoded video data
+ *                    remains.
+ * \retval TH_EFAULT \a _enc or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op);
+/**Frees an allocated encoder instance.
+ * \param _enc A #th_enc_ctx handle.*/
+extern void th_encode_free(th_enc_ctx *_enc);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

+ 53 - 0
modules/theoraplayer/native/theora/lib/Version_script

@@ -0,0 +1,53 @@
+#
+# Export file for libtheora
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};

+ 82 - 0
modules/theoraplayer/native/theora/lib/Version_script-dec

@@ -0,0 +1,82 @@
+#
+# Export file for libtheoradec
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x API
+libtheoradec_1.0
+{
+	global:
+		th_version_string;
+		th_version_number;
+
+		th_decode_headerin;
+		th_decode_alloc;
+		th_setup_free;
+		th_decode_ctl;
+		th_decode_packetin;
+		th_decode_ycbcr_out;
+		th_decode_free;
+
+		th_packet_isheader;
+		th_packet_iskeyframe;
+
+		th_granule_frame;
+		th_granule_time;
+
+		th_info_init;
+		th_info_clear;
+
+		th_comment_init;
+		th_comment_add;
+		th_comment_add_tag;
+		th_comment_query;
+		th_comment_query_count;
+		th_comment_clear;
+
+	local:
+		*;
+};
+
+# The deprecated legacy api from the libtheora alpha releases.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};

+ 43 - 0
modules/theoraplayer/native/theora/lib/Version_script-enc

@@ -0,0 +1,43 @@
+#
+# Export file for libtheora
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x encoder API
+libtheoraenc_1.0
+{
+	global:
+		th_encode_alloc;
+		th_encode_ctl;
+		th_encode_flushheader;
+		th_encode_ycbcr_in;
+		th_encode_packetout;
+		th_encode_free;
+
+		TH_VP31_QUANT_INFO;
+		TH_VP31_HUFF_CODES;
+
+	local:
+		*;
+};
+
+# The encoder portion of the deprecated alpha release api.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+	local:
+		*;
+};

+ 2712 - 0
modules/theoraplayer/native/theora/lib/analyze.c

@@ -0,0 +1,2712 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <limits.h>
+#include <string.h>
+#include "encint.h"
+#include "modedec.h"
+#if defined(OC_COLLECT_METRICS)
+# include "collect.c"
+#endif
+
+
+
+typedef struct oc_rd_metric          oc_rd_metric;
+typedef struct oc_mode_choice        oc_mode_choice;
+
+
+
+/*There are 8 possible schemes used to encode macro block modes.
+  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
+  The same set of Huffman codes is used for each of these 7 schemes, but the
+   mode assigned to each codeword varies.
+  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
+   while schemes 1-6 have a fixed mapping.
+  Scheme 7 just encodes each mode directly in 3 bits.*/
+
+/*The mode orderings for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.
+  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
+   decoder.*/
+static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
+  /*Last MV dominates.*/
+  /*L P M N I G GM 4*/
+  {3,4,2,0,1,5,6,7},
+  /*L P N M I G GM 4*/
+  {2,4,3,0,1,5,6,7},
+  /*L M P N I G GM 4*/
+  {3,4,1,0,2,5,6,7},
+  /*L M N P I G GM 4*/
+  {2,4,1,0,3,5,6,7},
+  /*No MV dominates.*/
+  /*N L P M I G GM 4*/
+  {0,4,3,1,2,5,6,7},
+  /*N G L P M I GM 4*/
+  {0,5,4,2,3,1,6,7},
+  /*Default ordering.*/
+  /*N I M L P G GM 4*/
+  {0,1,2,3,4,5,6,7}
+};
+
+
+
+/*Initialize the mode scheme chooser.
+  This need only be called once per encoder.*/
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
+  int si;
+  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
+  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
+}
+
+/*Reset the mode scheme chooser.
+  This needs to be called once for each frame, including the first.*/
+static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
+  int si;
+  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
+  /*Scheme 0 starts with 24 bits to store the mode list in.*/
+  _chooser->scheme_bits[0]=24;
+  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
+  for(si=0;si<8;si++){
+    /*Scheme 7 should always start first, and scheme 0 should always start
+       last.*/
+    _chooser->scheme_list[si]=7-si;
+    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
+  }
+}
+
+/*Return the cost of coding _mb_mode in the specified scheme.*/
+static int oc_mode_scheme_chooser_scheme_mb_cost(
+ const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
+  int codebook;
+  int ri;
+  codebook=_scheme+1>>3;
+  /*For any scheme except 0, we can just use the bit cost of the mode's rank
+     in that scheme.*/
+  ri=_chooser->mode_ranks[_scheme][_mb_mode];
+  if(_scheme==0){
+    int mc;
+    /*For scheme 0, incrementing the mode count could potentially change the
+       mode's rank.
+      Find the index where the mode would be moved to in the optimal list,
+       and use its bit cost instead of the one for the mode's current
+       position in the list.*/
+    /*We don't actually reorder the list; this is for computing opportunity
+       cost, not an update.*/
+    mc=_chooser->mode_counts[_mb_mode];
+    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
+  }
+  return OC_MODE_BITS[codebook][ri];
+}
+
+/*This is the real purpose of this data structure: not actually selecting a
+   mode scheme, but estimating the cost of coding a given mode given all the
+   modes selected so far.
+  This is done via opportunity cost: the cost is defined as the number of bits
+   required to encode all the modes selected so far including the current one
+   using the best possible scheme, minus the number of bits required to encode
+   all the modes selected so far not including the current one using the best
+   possible scheme.
+  The computational expense of doing this probably makes it overkill.
+  Just be happy we take a greedy approach instead of trying to solve the
+   global mode-selection problem (which is NP-hard).
+  _mb_mode: The mode to determine the cost of.
+  Return: The number of bits required to code this mode.*/
+static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int scheme0;
+  int scheme1;
+  int best_bits;
+  int mode_bits;
+  int si;
+  int scheme0_bits;
+  int scheme1_bits;
+  scheme0=_chooser->scheme_list[0];
+  scheme1=_chooser->scheme_list[1];
+  scheme0_bits=_chooser->scheme_bits[scheme0];
+  scheme1_bits=_chooser->scheme_bits[scheme1];
+  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
+  /*Typical case: If the difference between the best scheme and the next best
+     is greater than 6 bits, then adding just one mode cannot change which
+     scheme we use.*/
+  if(scheme1_bits-scheme0_bits>6)return mode_bits;
+  /*Otherwise, check to see if adding this mode selects a different scheme as
+     the best.*/
+  si=1;
+  best_bits=scheme0_bits+mode_bits;
+  do{
+    int cur_bits;
+    cur_bits=scheme1_bits+
+     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
+    if(cur_bits<best_bits)best_bits=cur_bits;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
+    scheme1_bits=_chooser->scheme_bits[scheme1];
+  }
+  while(scheme1_bits-scheme0_bits<=6);
+  return best_bits-scheme0_bits;
+}
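
A toy illustration of the opportunity-cost idea described in the comment above (not part of libtheora; it uses flat hypothetical arrays instead of the chooser's incremental state and ignores scheme 0's adaptive ranking): the cost of a mode is the best achievable total with it minus the best achievable total without it.

#include <limits.h>

/* bits_so_far[s]: bits scheme s would have used for the modes chosen so far.
   mode_bits[s][m]: bits scheme s needs to code mode m.
   Returns the opportunity cost, in bits, of choosing mode m next. */
static int toy_mode_cost(const int *bits_so_far, const int (*mode_bits)[8],
 int nschemes, int m){
  int best_before;
  int best_after;
  int s;
  best_before = best_after = INT_MAX;
  for(s = 0; s < nschemes; s++){
    if(bits_so_far[s] < best_before) best_before = bits_so_far[s];
    if(bits_so_far[s] + mode_bits[s][m] < best_after){
      best_after = bits_so_far[s] + mode_bits[s][m];
    }
  }
  return best_after - best_before;
}
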
+
+/*Incrementally update the mode counts and per-scheme bit counts and re-order
+   the scheme lists once a mode has been selected.
+  _mb_mode: The mode that was chosen.*/
+static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int ri;
+  int si;
+  _chooser->mode_counts[_mb_mode]++;
+  /*Re-order the scheme0 mode list if necessary.*/
+  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
+    int pmode;
+    pmode=_chooser->scheme0_list[ri-1];
+    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
+    /*Reorder the mode ranking.*/
+    _chooser->scheme0_ranks[pmode]++;
+    _chooser->scheme0_list[ri]=pmode;
+  }
+  _chooser->scheme0_ranks[_mb_mode]=ri;
+  _chooser->scheme0_list[ri]=_mb_mode;
+  /*Now add the bit cost for the mode to each scheme.*/
+  for(si=0;si<8;si++){
+    _chooser->scheme_bits[si]+=
+     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
+  }
+  /*Finally, re-order the list of schemes.*/
+  for(si=1;si<8;si++){
+    int sj;
+    int scheme0;
+    int bits0;
+    sj=si;
+    scheme0=_chooser->scheme_list[si];
+    bits0=_chooser->scheme_bits[scheme0];
+    do{
+      int scheme1;
+      scheme1=_chooser->scheme_list[sj-1];
+      if(bits0>=_chooser->scheme_bits[scheme1])break;
+      _chooser->scheme_list[sj]=scheme1;
+    }
+    while(--sj>0);
+    _chooser->scheme_list[sj]=scheme0;
+  }
+}
+
+
+
+/*The number of bits required to encode a super block run.
+  _run_count: The desired run count; must be positive and less than 4130.*/
+static int oc_sb_run_bits(int _run_count){
+  int i;
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  return OC_SB_RUN_CODE_NBITS[i];
+}
+
+/*The number of bits required to encode a block run.
+  _run_count: The desired run count; must be positive and less than 30.*/
+static int oc_block_run_bits(int _run_count){
+  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
+}
+
+
+
+static void oc_fr_state_init(oc_fr_state *_fr){
+  _fr->bits=0;
+  _fr->sb_partial_count=0;
+  _fr->sb_full_count=0;
+  _fr->b_coded_count_prev=0;
+  _fr->b_coded_count=0;
+  _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
+  _fr->sb_partial=-1;
+  _fr->sb_full=-1;
+  _fr->b_coded_prev=-1;
+  _fr->b_coded=-1;
+}
+
+
+static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  int bits;
+  int sb_partial_count;
+  int sb_full_count;
+  bits=0;
+  sb_partial_count=_fr->sb_partial_count;
+  /*Extend the sb_partial run, or start a new one.*/
+  if(_fr->sb_partial==_sb_partial){
+    if(sb_partial_count>=4129){
+      bits++;
+      sb_partial_count=0;
+    }
+    else bits-=oc_sb_run_bits(sb_partial_count);
+  }
+  else sb_partial_count=0;
+  bits+=oc_sb_run_bits(++sb_partial_count);
+  if(!_sb_partial){
+    /*Extend the sb_full run, or start a new one.*/
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full==_sb_full){
+      if(sb_full_count>=4129){
+        bits++;
+        sb_full_count=0;
+      }
+      else bits-=oc_sb_run_bits(sb_full_count);
+    }
+    else sb_full_count=0;
+    bits+=oc_sb_run_bits(++sb_full_count);
+  }
+  return bits;
+}
+
+static void oc_fr_state_advance_sb(oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  int sb_partial_count;
+  int sb_full_count;
+  sb_partial_count=_fr->sb_partial_count;
+  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
+  sb_partial_count++;
+  if(!_sb_partial){
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
+    sb_full_count++;
+    _fr->sb_full_count=sb_full_count;
+    _fr->sb_full=_sb_full;
+    /*Roll back the partial block state.*/
+    _fr->b_coded=_fr->b_coded_prev;
+    _fr->b_coded_count=_fr->b_coded_count_prev;
+  }
+  else{
+    /*Commit back the partial block state.*/
+    _fr->b_coded_prev=_fr->b_coded;
+    _fr->b_coded_count_prev=_fr->b_coded_count;
+  }
+  _fr->sb_partial_count=sb_partial_count;
+  _fr->sb_partial=_sb_partial;
+  _fr->b_count=0;
+  _fr->sb_prefer_partial=0;
+  _fr->sb_bits=0;
+}
+
+/*Commit the state of the current super block and advance to the next.*/
+static void oc_fr_state_flush_sb(oc_fr_state *_fr){
+  int sb_partial;
+  int sb_full;
+  int b_coded_count;
+  int b_count;
+  b_count=_fr->b_count;
+  b_coded_count=_fr->b_coded_count;
+  sb_full=_fr->b_coded;
+  sb_partial=b_coded_count<b_count;
+  if(!sb_partial){
+    /*If the super block is fully coded/uncoded...*/
+    if(_fr->sb_prefer_partial){
+      /*So far coding this super block as partial was cheaper anyway.*/
+      if(b_coded_count>15||_fr->b_coded_prev<0){
+        int sb_bits;
+        /*If the block run is too long, this will limit how far it can be
+           extended into the next partial super block.
+          If we need to extend it farther, we don't want to have to roll all
+           the way back here (since there could be many full SBs between now
+           and then), so we disallow this.
+          Similarly, if this is the start of a stripe, we don't know the
+           length of the outstanding block run from the previous stripe.*/
+        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
+        _fr->bits+=sb_bits-_fr->sb_bits;
+        _fr->sb_bits=sb_bits;
+      }
+      else sb_partial=1;
+    }
+  }
+  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
+}
+
+static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
+  ptrdiff_t bits;
+  int       sb_bits;
+  int       b_coded_count;
+  int       b_count;
+  int       sb_prefer_partial;
+  sb_bits=_fr->sb_bits;
+  bits=_fr->bits-sb_bits;
+  b_count=_fr->b_count;
+  b_coded_count=_fr->b_coded_count;
+  sb_prefer_partial=_fr->sb_prefer_partial;
+  if(b_coded_count>=b_count){
+    int sb_partial_bits;
+    /*This super block is currently fully coded/uncoded.*/
+    if(b_count<=0){
+      /*This is the first block in this SB.*/
+      b_count=1;
+      /*Check to see whether it's cheaper to code it partially or fully.*/
+      if(_fr->b_coded==_b_coded){
+        sb_partial_bits=-oc_block_run_bits(b_coded_count);
+        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
+      }
+      else{
+        b_coded_count=1;
+        sb_partial_bits=2;
+      }
+      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+      sb_prefer_partial=sb_partial_bits<sb_bits;
+      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+    }
+    else if(_fr->b_coded==_b_coded){
+      b_coded_count++;
+      if(++b_count<16){
+        if(sb_prefer_partial){
+          /*Check to see if it's cheaper to code it fully.*/
+          sb_partial_bits=sb_bits;
+          sb_partial_bits+=oc_block_run_bits(b_coded_count);
+          if(b_coded_count>0){
+            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
+          }
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+          sb_prefer_partial=sb_partial_bits<sb_bits;
+          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
+        }
+        /*There's no need to check the converse (whether it's cheaper to code
+           this SB partially if we were coding it fully), since the cost to
+           code a SB partially can only increase as we add more blocks, whereas
+           the cost to code it fully stays constant.*/
+      }
+      else{
+        /*If we get to the end and this SB is still full, then force it to be
+           coded full.
+          Otherwise we might not be able to extend the block run far enough
+           into the next partial SB.*/
+        if(sb_prefer_partial){
+          sb_prefer_partial=0;
+          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
+        }
+      }
+    }
+    else{
+      /*This SB was full, but now must be made partial.*/
+      if(!sb_prefer_partial){
+        sb_bits=oc_block_run_bits(b_coded_count);
+        if(b_coded_count>b_count){
+          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
+        }
+        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
+      }
+      b_count++;
+      b_coded_count=1;
+      sb_prefer_partial=1;
+      sb_bits+=2;
+    }
+  }
+  else{
+    b_count++;
+    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
+    else b_coded_count=0;
+    sb_bits+=oc_block_run_bits(++b_coded_count);
+  }
+  _fr->bits=bits+sb_bits;
+  _fr->b_coded_count=b_coded_count;
+  _fr->b_coded=_b_coded;
+  _fr->b_count=b_count;
+  _fr->sb_prefer_partial=sb_prefer_partial;
+  _fr->sb_bits=sb_bits;
+}
+
+static void oc_fr_skip_block(oc_fr_state *_fr){
+  oc_fr_state_advance_block(_fr,0);
+}
+
+static void oc_fr_code_block(oc_fr_state *_fr){
+  oc_fr_state_advance_block(_fr,1);
+}
+
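+/*oc_fr_cost1() returns the number of extra flag bits needed to code the next
+   block rather than skip it; oc_fr_cost4() returns the extra flag bits the
+   actual _post state spent compared to skipping all four luma blocks of a
+   macro block starting from the _pre state.
+  The *&tmp=*_fr assignments below are plain structure copies used to take a
+   scratch snapshot of the flag-coder state.*/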
+static int oc_fr_cost1(const oc_fr_state *_fr){
+  oc_fr_state tmp;
+  ptrdiff_t   bits;
+  *&tmp=*_fr;
+  oc_fr_skip_block(&tmp);
+  bits=tmp.bits;
+  *&tmp=*_fr;
+  oc_fr_code_block(&tmp);
+  return (int)(tmp.bits-bits);
+}
+
+static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
+  oc_fr_state tmp;
+  *&tmp=*_pre;
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  return (int)(_post->bits-tmp.bits);
+}
+
+
+
+static void oc_qii_state_init(oc_qii_state *_qs){
+  _qs->bits=0;
+  _qs->qi01_count=0;
+  _qs->qi01=-1;
+  _qs->qi12_count=0;
+  _qs->qi12=-1;
+}
+
+
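+/*Advances the quantizer-index flag coder from the state in _qs, writing the
+   result to _qd, for a block coded with quantizer list index _qii.
+  qii is signalled with up to two binary flags: qi01 distinguishes qii==0 from
+   qii>0 and, when qii>0, qi12 distinguishes qii==1 from qii==2; each flag
+   sequence is run-length coded with the same 4129-limited run codes used for
+   the super block flags.*/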
+static void oc_qii_state_advance(oc_qii_state *_qd,
+ const oc_qii_state *_qs,int _qii){
+  ptrdiff_t bits;
+  int       qi01;
+  int       qi01_count;
+  int       qi12;
+  int       qi12_count;
+  bits=_qs->bits;
+  qi01=_qii+1>>1;
+  qi01_count=_qs->qi01_count;
+  if(qi01==_qs->qi01){
+    if(qi01_count>=4129){
+      bits++;
+      qi01_count=0;
+    }
+    else bits-=oc_sb_run_bits(qi01_count);
+  }
+  else qi01_count=0;
+  qi01_count++;
+  bits+=oc_sb_run_bits(qi01_count);
+  qi12_count=_qs->qi12_count;
+  if(_qii){
+    qi12=_qii>>1;
+    if(qi12==_qs->qi12){
+      if(qi12_count>=4129){
+        bits++;
+        qi12_count=0;
+      }
+      else bits-=oc_sb_run_bits(qi12_count);
+    }
+    else qi12_count=0;
+    qi12_count++;
+    bits+=oc_sb_run_bits(qi12_count);
+  }
+  else qi12=_qs->qi12;
+  _qd->bits=bits;
+  _qd->qi01=qi01;
+  _qd->qi01_count=qi01_count;
+  _qd->qi12=qi12;
+  _qd->qi12_count=qi12_count;
+}
+
+
+
+static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
+  ptrdiff_t *coded_fragis;
+  unsigned   mcu_nvsbs;
+  ptrdiff_t  mcu_nfrags;
+  int        flimit;
+  int        hdec;
+  int        vdec;
+  int        pli;
+  int        nqis;
+  int        qii;
+  int        qi0;
+  int        qti;
+  /*Initialize the per-plane coded block flag trackers.
+    These are used for bit-estimation purposes only; the real flag bits span
+     all three planes, so we can't compute them in parallel.*/
+  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
+  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
+  /*Set up the per-plane skip SSD storage pointers.*/
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
+  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
+  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.
+    Unlike the decoder, each plane's coded and uncoded fragment list is kept
+     separate during the analysis stage; we only make the coded list for all
+     three planes contiguous right before the final packet is output
+     (destroying the uncoded lists, which are no longer needed).*/
+  coded_fragis=_enc->state.coded_fragis;
+  for(pli=0;pli<3;pli++){
+    _pipe->coded_fragis[pli]=coded_fragis;
+    coded_fragis+=_enc->state.fplanes[pli].nfrags;
+    _pipe->uncoded_fragis[pli]=coded_fragis;
+  }
+  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
+  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
+  /*Set up condensed quantizer tables.*/
+  qi0=_enc->state.qis[0];
+  nqis=_enc->state.nqis;
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<nqis;qii++){
+      int qi;
+      qi=_enc->state.qis[qii];
+      for(qti=0;qti<2;qti++){
+        /*Set the DC coefficient in the dequantization table.*/
+        _enc->state.dequant_tables[qi][pli][qti][0]=
+         _enc->dequant_dc[qi0][pli][qti];
+        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        /*Copy over the quantization table.*/
+        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
+         _enc->opt_data.enquant_table_size);
+      }
+    }
+  }
+  /*Fix up the DC coefficients in the quantization tables.*/
+  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
+  /*Initialize the tokenization state.*/
+  for(pli=0;pli<3;pli++){
+    _pipe->ndct_tokens1[pli]=0;
+    _pipe->eob_run1[pli]=0;
+  }
+  /*Initialize the bounding value array for the loop filter.*/
+  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
+  /*Clear the temporary DCT scratch space.*/
+  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
+}
+
+/*Sets the current MCU stripe to super block row _sby.
+  Return: A non-zero value if this was not the last MCU.*/
+static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _sby){
+  const oc_fragment_plane *fplane;
+  unsigned                 mcu_nvsbs;
+  int                      sby_end;
+  int                      notdone;
+  int                      vdec;
+  int                      pli;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  sby_end=_enc->state.fplanes[0].nvsbs;
+  notdone=_sby+mcu_nvsbs<sby_end;
+  if(notdone)sby_end=_sby+mcu_nvsbs;
+  vdec=0;
+  for(pli=0;pli<3;pli++){
+    fplane=_enc->state.fplanes+pli;
+    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
+    _pipe->fragy0[pli]=_sby<<2-vdec;
+    _pipe->froffset[pli]=fplane->froffset
+     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
+    if(notdone){
+      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
+      _pipe->fragy_end[pli]=sby_end<<2-vdec;
+    }
+    else{
+      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
+      _pipe->fragy_end[pli]=fplane->nvfrags;
+    }
+    vdec=!(_enc->state.info.pixel_fmt&2);
+  }
+  return notdone;
+}
+
+static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+  /*Copy over all the uncoded fragments from this plane and advance the uncoded
+     fragment list.*/
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_enc->state,
+     _enc->state.ref_frame_data[OC_FRAME_SELF],
+     _enc->state.ref_frame_data[OC_FRAME_PREV],
+     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
+    _pipe->nuncoded_fragis[_pli]=0;
+  }
+  /*Perform DC prediction.*/
+  oc_enc_pred_dc_frag_rows(_enc,_pli,
+   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
+  /*Finish DC tokenization.*/
+  oc_enc_tokenize_dc_frag_list(_enc,_pli,
+   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
+   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
+  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
+  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
+  /*And advance the coded fragment list.*/
+  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->ncoded_fragis[_pli]=0;
+  /*Apply the loop filter if necessary.*/
+  if(_pipe->loop_filter){
+    oc_state_loop_filter_frag_rows(&_enc->state,
+     _pipe->bounding_values,OC_FRAME_SELF,_pli,
+     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+  }
+  else _sdelay=_edelay=0;
+  /*To fill borders, we have an additional two pixel delay, since a fragment
+     in the next row could filter its top edge, using two pixels from a
+     fragment in this row.
+    But there's no reason to delay a full fragment between the two.*/
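+  /*(The <<3 terms below convert fragment rows to pixel rows; the additional
+     _sdelay<<1 and _edelay<<1 terms account for that extra two-pixel delay.)*/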
+  oc_state_borders_fill_rows(&_enc->state,
+   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
+   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
+   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
+}
+
+
+
+/*Cost information about the coded blocks in a MB.*/
+struct oc_rd_metric{
+  int uncoded_ac_ssd;
+  int coded_ac_ssd;
+  int ac_bits;
+  int dc_flag;
+};
+
+
+
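+/*Transforms, quantizes, and tokenizes a single fragment.
+  For delta frames (_fr non-NULL) it also decides whether the result is worth
+   keeping: if skipping the fragment is cheaper in R-D terms, the token log is
+   rolled back and the fragment is marked uncoded.
+  Return: 1 if the fragment was coded, or 0 if it was skipped.*/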
+static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
+ unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
+ oc_fr_state *_fr,oc_token_checkpoint **_stack){
+  ogg_int16_t            *data;
+  ogg_int16_t            *dct;
+  ogg_int16_t            *idct;
+  oc_qii_state            qs;
+  const ogg_uint16_t     *dequant;
+  ogg_uint16_t            dequant_dc;
+  ptrdiff_t               frag_offs;
+  int                     ystride;
+  const unsigned char    *src;
+  const unsigned char    *ref;
+  unsigned char          *dst;
+  int                     nonzero;
+  unsigned                uncoded_ssd;
+  unsigned                coded_ssd;
+  oc_token_checkpoint    *checkpoint;
+  oc_fragment            *frags;
+  int                     mb_mode;
+  int                     refi;
+  int                     mv_offs[2];
+  int                     nmv_offs;
+  int                     ac_bits;
+  int                     borderi;
+  int                     nqis;
+  int                     qti;
+  int                     qii;
+  int                     dc;
+  nqis=_enc->state.nqis;
+  frags=_enc->state.frags;
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_pli];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
+  borderi=frags[_fragi].borderi;
+  qii=frags[_fragi].qii;
+  data=_enc->pipe.dct_data;
+  dct=data+64;
+  idct=data+128;
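+  /*A qii value of 4 or more marks a block that the mode analysis preferred
+     to skip (oc_analyze_mb_mode_luma() and oc_analyze_mb_mode_chroma() add 4
+     to the block's qii when skipping looked cheaper); honor that hint when
+     early skip detection is enabled, and otherwise mask it off and try coding
+     the block anyway.*/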
+  if(qii&~3){
+#if !defined(OC_COLLECT_METRICS)
+    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
+      /*Enable early skip detection.*/
+      frags[_fragi].coded=0;
+      frags[_fragi].refi=OC_FRAME_NONE;
+      oc_fr_skip_block(_fr);
+      return 0;
+    }
+#endif
+    /*Try to code this block anyway.*/
+    qii&=3;
+  }
+  refi=frags[_fragi].refi;
+  mb_mode=frags[_fragi].mb_mode;
+  ref=_enc->state.ref_frame_data[refi]+frag_offs;
+  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
+  /*Motion compensation:*/
+  switch(mb_mode){
+    case OC_MODE_INTRA:{
+      nmv_offs=0;
+      oc_enc_frag_sub_128(_enc,data,src,ystride);
+    }break;
+    case OC_MODE_GOLDEN_NOMV:
+    case OC_MODE_INTER_NOMV:{
+      nmv_offs=1;
+      mv_offs[0]=0;
+      oc_enc_frag_sub(_enc,data,src,ref,ystride);
+    }break;
+    default:{
+      const oc_mv *frag_mvs;
+      frag_mvs=_enc->state.frag_mvs;
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
+       _pli,frag_mvs[_fragi]);
+      if(nmv_offs>1){
+        oc_enc_frag_copy2(_enc,dst,
+         ref+mv_offs[0],ref+mv_offs[1],ystride);
+        oc_enc_frag_sub(_enc,data,src,dst,ystride);
+      }
+      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
+    }break;
+  }
+#if defined(OC_COLLECT_METRICS)
+  {
+    unsigned sad;
+    unsigned satd;
+    switch(nmv_offs){
+      case 0:{
+        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
+        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
+      }break;
+      case 1:{
+        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
+        satd+=abs(dc);
+      }break;
+      default:{
+        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
+        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
+        satd+=abs(dc);
+      }break;
+    }
+    _enc->frag_sad[_fragi]=sad;
+    _enc->frag_satd[_fragi]=satd;
+  }
+#endif
+  /*Transform:*/
+  oc_enc_fdct8x8(_enc,dct,data);
+  /*Quantize:*/
+  qti=mb_mode!=OC_MODE_INTRA;
+  dequant=_enc->dequant[_pli][qii][qti];
+  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
+  dc=data[0];
+  /*Tokenize.*/
+  checkpoint=*_stack;
+  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  else{
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  /*Reconstruct.
+    TODO: nonzero may need to be adjusted after tokenization.*/
+  dequant_dc=dequant[0];
+  if(nonzero==0){
+    ogg_int16_t p;
+    int         ci;
+    int         qi01;
+    int         qi12;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)data[ci]=p;
+    /*We didn't code any AC coefficients, so don't change the quantizer.*/
+    qi01=_pipe->qs[_pli].qi01;
+    qi12=_pipe->qs[_pli].qi12;
+    if(qi01>0)qii=1+qi12;
+    else if(qi01>=0)qii=0;
+  }
+  else{
+    idct[0]=dc*dequant_dc;
+    /*Note: This clears idct[] back to zero for the next block.*/
+    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
+  }
+  frags[_fragi].qii=qii;
+  if(nqis>1){
+    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
+    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
+  }
+  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
+  else{
+    oc_enc_frag_recon_inter(_enc,dst,
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
+  }
+  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
+#if !defined(OC_COLLECT_METRICS)
+  if(_fr!=NULL)
+#endif
+  {
+    /*In retrospect, should we have skipped this block?*/
+    if(borderi<0){
+      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
+    }
+    else{
+      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
+       _enc->state.borders[borderi].mask);
+    }
+    /*Scale to match DCT domain.*/
+    coded_ssd<<=4;
+#if defined(OC_COLLECT_METRICS)
+    _enc->frag_ssd[_fragi]=coded_ssd;
+  }
+  if(_fr!=NULL){
+#endif
+    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
+    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
+    if(uncoded_ssd<UINT_MAX&&
+     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
+        is enabled.*/
+     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
+      int overhead_bits;
+      overhead_bits=oc_fr_cost1(_fr);
+      /*Although the fragment coding overhead determination is accurate, it is
+         greedy, using very coarse-grained local information.
+        Allowing it to mildly discourage coding turns out to be beneficial, but
+         it's not clear that allowing it to encourage coding through negative
+         coding overhead deltas is useful.
+        For that reason, we disallow negative coding overheads.*/
+      if(overhead_bits<0)overhead_bits=0;
+      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
+        /*Hm, not worth it; roll back.*/
+        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
+        *_stack=checkpoint;
+        frags[_fragi].coded=0;
+        frags[_fragi].refi=OC_FRAME_NONE;
+        oc_fr_skip_block(_fr);
+        return 0;
+      }
+    }
+    else _mo->dc_flag=1;
+    _mo->uncoded_ac_ssd+=uncoded_ssd;
+    _mo->coded_ac_ssd+=coded_ssd;
+    _mo->ac_bits+=ac_bits;
+    oc_fr_code_block(_fr);
+  }
+  /*GCC 4.4.4 generates a warning here because it can't tell that
+     the init code in the nqis check above will run anytime this
+     line runs.*/
+  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
+  frags[_fragi].dc=dc;
+  frags[_fragi].coded=1;
+  return 1;
+}
+
+static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  signed char         *mb_modes;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t           *uncoded_fragis;
+  ptrdiff_t            nuncoded_fragis;
+  oc_rd_metric         mo;
+  oc_fr_state          fr_checkpoint;
+  oc_qii_state         qs_checkpoint;
+  int                  mb_mode;
+  int                  refi;
+  int                  ncoded;
+  ptrdiff_t            fragi;
+  int                  bi;
+  *&fr_checkpoint=*(_pipe->fr+0);
+  *&qs_checkpoint=*(_pipe->qs+0);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_modes=_enc->state.mb_modes;
+  frags=_enc->state.frags;
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  uncoded_fragis=_pipe->uncoded_fragis[0];
+  nuncoded_fragis=_pipe->nuncoded_fragis[0];
+  mb_mode=mb_modes[_mbi];
+  refi=OC_FRAME_FOR_MODE(mb_mode);
+  ncoded=0;
+  stackptr=stack;
+  memset(&mo,0,sizeof(mo));
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=refi;
+    frags[fragi].mb_mode=mb_mode;
+    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
+      coded_fragis[ncoded_fragis++]=fragi;
+      ncoded++;
+    }
+    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+  }
+  if(ncoded>0&&!mo.dc_flag){
+    int cost;
+    /*Some individual blocks were worth coding.
+      See if that's still true when accounting for mode and MV overhead.*/
+    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
+     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
+    if(mo.uncoded_ac_ssd<=cost){
+      /*Taking macroblock overhead into account, it is not worth coding this
+         MB.*/
+      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
+      *(_pipe->fr+0)=*&fr_checkpoint;
+      *(_pipe->qs+0)=*&qs_checkpoint;
+      for(bi=0;bi<4;bi++){
+        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+        if(frags[fragi].coded){
+          *(uncoded_fragis-++nuncoded_fragis)=fragi;
+          frags[fragi].coded=0;
+          frags[fragi].refi=OC_FRAME_NONE;
+        }
+        oc_fr_skip_block(_pipe->fr+0);
+      }
+      ncoded_fragis-=ncoded;
+      ncoded=0;
+    }
+  }
+  /*If no luma blocks coded, the mode is forced.*/
+  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
+     with a single coded block.
+    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
+     skipped blocks, while a 1MV does not.*/
+  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
+    mb_modes[_mbi]=OC_MODE_INTER_MV;
+  }
+  _pipe->ncoded_fragis[0]=ncoded_fragis;
+  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
+  return ncoded;
+}
+
+static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  oc_sb_flags        *sb_flags;
+  oc_fr_state        *fr;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t          *uncoded_fragis;
+  ptrdiff_t           nuncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  fr=_pipe->fr+_pli;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  uncoded_fragis=_pipe->uncoded_fragis[_pli];
+  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    /*Worst case token stack usage for 1 fragment.*/
+    oc_token_checkpoint stack[64];
+    oc_rd_metric        mo;
+    int                 quadi;
+    int                 bi;
+    memset(&mo,0,sizeof(mo));
+    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
+      ptrdiff_t fragi;
+      fragi=sb_maps[sbi][quadi][bi];
+      if(fragi>=0){
+        oc_token_checkpoint *stackptr;
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
+        stackptr=stack;
+        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,&mo,fr,&stackptr)){
+          coded_fragis[ncoded_fragis++]=fragi;
+        }
+        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+      }
+    }
+    oc_fr_state_flush_sb(fr);
+    sb_flags[sbi].coded_fully=fr->sb_full;
+    sb_flags[sbi].coded_partially=fr->sb_partial;
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
+}
+
+/*Mode decision is done by exhaustively examining all potential choices.
+  Obviously, doing the motion compensation, fDCT, tokenization, and then
+   counting the bits each token uses is computationally expensive.
+  Theora's EOB runs can also split the cost of these tokens across multiple
+   fragments, and naturally we don't know what the optimal choice of Huffman
+   codes will be until we know all the tokens we're going to encode in all the
+   fragments.
+  So we use a simple approach to estimating the bit cost and distortion of each
+   mode based upon the SATD value of the residual before coding.
+  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
+   the process (modified somewhat from that of the paper) is very simple.
+  We build a non-linear regression of the mappings from
+   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
+   SSD for each qi.
+  A separate set of mappings is kept for each quantization type and color
+   plane.
+  The mappings are constructed by partitioning the SATD values into a small
+   number of bins (currently 24) and using a linear regression in each bin
+   (as opposed to the 0th-order regression used by Kim).
+  The bit counts and SSD measurements are obtained by examining actual encoded
+   frames, with appropriate lambda values and optimal Huffman codes selected.
+  EOB bits are assigned to the fragment that started the EOB run (as opposed to
+   dividing them among all the blocks in the run; the latter approach seems
+   more theoretically correct, but Monty's testing showed a small improvement
+   with the former, though that may have been merely statistical noise).
+
+  @ARTICLE{Kim03,
+    author="Hyun Mun Kim",
+    title="Adaptive Rate Control Using Nonlinear Regression",
+    journal="IEEE Transactions on Circuits and Systems for Video Technology",
+    volume=13,
+    number=5,
+    pages="432--439",
+    month=May,
+    year=2003
+  }*/
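+
+/*In the implementation below, OC_MODE_RD_SATD and OC_MODE_RD_SAD store one
+   such mapping per log-quantizer row, color plane, quantization type, and
+   SATD (or SAD) bin.
+  oc_enc_mode_rd_init() interpolates between the two log-quantizer rows that
+   bracket each active qi, and oc_dct_cost2() then interpolates between
+   adjacent SATD bins at lookup time.*/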
+
+/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
+   overflow for large lambda values.*/
+#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
+ ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
+ +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
+ +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
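+/*Writing S=1<<OC_BIT_SCALE, the macro above evaluates
+   floor(_ssd/S)+floor(_rate/S)*_lambda
+    +floor((_ssd%S+(_rate%S)*_lambda+S/2)/S),
+  which equals floor((_ssd+_rate*_lambda+S/2)/S) exactly, but with
+   intermediate products roughly OC_BIT_SCALE bits smaller than forming
+   _ssd+_rate*_lambda directly.*/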
+
+static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
+#if !defined(OC_COLLECT_METRICS)
+  const
+#endif
+  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
+   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
+  int qii;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_load(_enc);
+#endif
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    int qi;
+    int pli;
+    qi=_enc->state.qis[qii];
+    for(pli=0;pli<3;pli++){
+      int qti;
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        int bin;
+        int dx;
+        int dq;
+        log_plq=_enc->log_plq[qi][pli][qti];
+        /*Find the pair of rows in the mode table that bracket this quantizer.
+          If it falls outside the range the table covers, then we just use a
+           pair on the edge for linear extrapolation.*/
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        /*Interpolate a row for this quantizer.*/
+        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
+        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
+        if(dq==0)dq=1;
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          int y0;
+          int z0;
+          int dy;
+          int dz;
+          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
+          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
+          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
+          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
+          _enc->mode_rd[qii][pli][qti][bin].rate=
+           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
+          _enc->mode_rd[qii][pli][qti][bin].rmse=
+           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
+        }
+      }
+    }
+  }
+}
+
+/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
+   prediction.*/
+static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
+ int _qii,int _pli,int _qti,int _satd){
+  unsigned rmse;
+  int      shift;
+  int      bin;
+  int      dx;
+  int      y0;
+  int      z0;
+  int      dy;
+  int      dz;
+  /*SATD metrics for chroma planes vary much less than luma, so we scale them
+     by 4 to distribute them into the mode decision bins more evenly.*/
+  _satd<<=_pli+1&2;
+  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
+  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
+  dx=_satd-(bin<<shift);
+  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
+  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
+  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
+  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
+  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
+  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
+  return OC_MAXI(y0+(dy*dx>>shift),0);
+}
+
+/*activity_avg must be positive, or flat regions could get a zero weight, which
+   confounds analysis.
+  We set the minimum to this value so that it also avoids the need for divide
+   by zero checks in oc_mb_masking().*/
+# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
+
+static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4]){
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *sb_map;
+  unsigned             luma;
+  int                  ystride;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){
+    const unsigned char *s;
+    unsigned             x;
+    unsigned             x2;
+    unsigned             act;
+    int                  i;
+    int                  j;
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    /*TODO: This could be replaced with SATD^2, since we already have to
+       compute SATD.*/
+    x=x2=0;
+    s=src+frag_offs;
+    for(i=0;i<8;i++){
+      for(j=0;j<8;j++){
+        unsigned c;
+        c=s[j];
+        x+=c;
+        x2+=c*c;
+      }
+      s+=ystride;
+    }
+    luma+=x;
+    act=(x2<<6)-x*x;
+    if(act<8<<12){
+      /*The region is flat.*/
+      act=OC_MINI(act,5<<12);
+    }
+    else{
+      unsigned e1;
+      unsigned e2;
+      unsigned e3;
+      unsigned e4;
+      /*Test for an edge.
+        TODO: There are probably much simpler ways to do this (e.g., it could
+         probably be combined with the SATD calculation).
+        Alternatively, we could split the block around the mean and compute the
+         reduction in variance in each half.
+        For a Gaussian source the reduction should be
+         (1-2/pi) ~= 0.36338022763241865692446494650994.
+        Significantly more reduction is a good indication of a bi-level image.
+        This has the advantage of identifying, in addition to straight edges,
+         small text regions, which would otherwise be classified as "texture".*/
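+      /*e1..e4 below accumulate [1,2,1]-weighted (Sobel-style) gradient
+         magnitudes over the block along the horizontal, vertical, and two
+         diagonal directions, respectively.*/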
+      e1=e2=e3=e4=0;
+      s=src+frag_offs-1;
+      for(i=0;i<8;i++){
+        for(j=0;j<8;j++){
+          e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
+           +(s+ystride)[j+2]-(s+ystride)[j]);
+          e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
+           +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
+          e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
+           +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
+          e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
+           +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
+        }
+        s+=ystride;
+      }
+      /*If the largest component of the edge energy is at least 40% of the
+         total, then classify the block as an edge block.*/
+      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
+         /*act=act_th*(act/act_th)**0.7
+              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
+           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
+         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
+      }
+    }
+    _activity[bi]=act;
+  }
+  return luma;
+}
+
+static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4],const unsigned _intra_satd[12]){
+  int bi;
+  for(bi=0;bi<4;bi++){
+    unsigned act;
+    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
+    if(act<8<<12){
+      /*The region is flat.*/
+      act=OC_MINI(act,5<<12);
+    }
+    _activity[bi]=act;
+  }
+}
+
+/*Compute the masking scales for the blocks in a macro block.
+  All masking is computed from the luma blocks.
+  We derive scaling factors for the chroma blocks from these, and use the same
+   ones for all chroma blocks, regardless of the subsampling.
+  It's possible for luma to be perfectly flat and yet have high chroma energy,
+   but this is unlikely in non-artificial images, and not a case that has been
+   addressed by any research to my knowledge.
+  The output of the masking process is two scale factors, which are fed into
+   the various R-D optimizations.
+  The first, rd_scale, is applied to D in the equation
+    D*rd_scale+lambda*R.
+  This is the form that must be used to properly combine scores from multiple
+   blocks, and can be interpreted as scaling distortions by their visibility.
+  The inverse, rd_iscale, is applied to lambda in the equation
+    D+rd_iscale*lambda*R.
+  This is equivalent to the first form within a single block, but much faster
+   to use when evaluating many possible distortions (e.g., during actual
+   quantization, where separate distortions are evaluated for every
+   coefficient).
+  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
+   used to perform the multiplications with the proper re-scaling for the range
+   of the scaling factors.
+  Many researchers apply masking values directly to the quantizers used, and
+   not to the R-D cost.
+  Since we generally use MSE for D, rd_scale must use the square of their
+   values to generate an equivalent effect.*/
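+/*As a rough sketch, ignoring the luminance term (disabled below) and the
+   OC_RD_SCALE_BITS/OC_RD_ISCALE_BITS fixed-point scaling, rd_scale[bi] tracks
+    (activity+4*activity_avg)/(4*activity+activity_avg)
+   and rd_iscale[bi] its reciprocal.
+  A block four times as active as the frame average thus gets its distortion
+   weighted by roughly 8/17~=0.47, while a block with a quarter of the average
+   activity gets roughly 17/8~=2.1.*/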
+static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
+ const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
+ unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
+  unsigned activity_sum;
+  unsigned la;
+  unsigned lb;
+  unsigned d;
+  int      bi;
+  int      bi_min;
+  int      bi_min2;
+  /*The ratio lb/la is meant to approximate
+     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
+     effective luminance masking from~\cite{LKW06} (including the self-masking
+     deflator).
+    The following actually turns out to be a pretty good approximation for
+     _luma>75 or so.
+    For smaller values luminance does not really follow Weber's Law anyway, and
+     this approximation gives a much less aggressive bitrate boost in this
+     region.
+    Though some researchers claim that contrast sensitivity actually decreases
+     for very low luminance values, in my experience excessive brightness on
+     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
+     of the CCIR 601 range) make artifacts in such regions extremely visible.
+    We substitute _luma_avg for 128 to allow the strength of the masking to
+     vary with the actual average image luminance, within certain limits (the
+     caller has clamped _luma_avg to the range [90,160], inclusive).
+    @ARTICLE{LKW06,
+      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
+      title="{JPEG2000} Encoding With Perceptual Distortion Control",
+      journal="{IEEE} Transactions on Image Processing",
+      volume=15,
+      number=7,
+      pages="1763--1778",
+      month=Jul,
+      year=2006
+    }*/
+#if 0
+  la=_luma+4*_luma_avg;
+  lb=4*_luma+_luma_avg;
+#else
+  /*Disable luminance masking.*/
+  la=lb=1;
+#endif
+  activity_sum=0;
+  for(bi=0;bi<4;bi++){
+    unsigned a;
+    unsigned b;
+    activity_sum+=_activity[bi];
+    /*Apply activity masking.*/
+    a=_activity[bi]+4*_activity_avg;
+    b=4*_activity[bi]+_activity_avg;
+    d=OC_RD_SCALE(b,1);
+    /*And luminance masking.*/
+    d=(a+(d>>1))/d;
+    _rd_scale[bi]=(d*la+(lb>>1))/lb;
+    /*And now the inverse.*/
+    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
+    d=(b+(d>>1))/d;
+    _rd_iscale[bi]=(d*lb+(la>>1))/la;
+  }
+  /*Now compute scaling factors for chroma blocks.
+    We start by finding the two smallest iscales from the luma blocks.*/
+  bi_min=_rd_iscale[1]<_rd_iscale[0];
+  bi_min2=1-bi_min;
+  for(bi=2;bi<4;bi++){
+    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
+      bi_min2=bi_min;
+      bi_min=bi;
+    }
+    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
+  }
+  /*If the minimum iscale is less than 1.0, use the second smallest instead,
+     and force the value to at least 1.0 (inflating chroma is a waste).*/
+  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
+  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
+  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
+  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
+  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
+  return activity_sum;
+}
+
+static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _frag_satd[12]){
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    ystride;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  unsigned               luma;
+  int                    dc;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  luma=0;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    luma+=dc;
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  return luma;
+}
+
+/*Select luma block-level quantizers for a MB in an INTRA frame.*/
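+/*This is a small Viterbi search: cost[bi][qii], ssd[bi][qii], rate[bi][qii],
+   and qs[bi][qii] track the best way to reach luma block bi with quantizer
+   list index qii, prev[][] stores the back-pointers, and the final loop walks
+   them backwards to assign each fragment's qii.*/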
+static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const oc_sb_map     *sb_maps;
+  oc_fragment         *frags;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  oc_qii_state         qs[4][3];
+  unsigned             cost[4][3];
+  unsigned             ssd[4][3];
+  unsigned             rate[4][3];
+  int                  prev[3][3];
+  unsigned             satd;
+  int                  dc;
+  unsigned             best_cost;
+  unsigned             best_ssd;
+  unsigned             best_rate;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  fragi=sb_maps[_mbi>>2][_mbi&3][0];
+  frag_offs=frag_buf_offs[fragi];
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
+  nqis=_enc->state.nqis;
+  lambda=_enc->lambda;
+  for(qii=0;qii<nqis;qii++){
+    oc_qii_state_advance(qs[0]+qii,_qs,qii);
+    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
+     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
+    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
+    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
+  }
+  for(bi=1;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frag_offs=frag_buf_offs[fragi];
+    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    }
+    else{
+      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+    }
+    for(qii=0;qii<nqis;qii++){
+      oc_qii_state qt[3];
+      unsigned     cur_ssd;
+      unsigned     cur_rate;
+      int          best_qij;
+      int          qij;
+      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
+      best_ssd=ssd[bi-1][0]+cur_ssd;
+      best_rate=rate[bi-1][0]+cur_rate
+       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
+      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
+      best_qij=0;
+      for(qij=1;qij<nqis;qij++){
+        unsigned chain_ssd;
+        unsigned chain_rate;
+        unsigned chain_cost;
+        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
+        chain_ssd=ssd[bi-1][qij]+cur_ssd;
+        chain_rate=rate[bi-1][qij]+cur_rate
+         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
+        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
+        if(chain_cost<best_cost){
+          best_cost=chain_cost;
+          best_ssd=chain_ssd;
+          best_rate=chain_rate;
+          best_qij=qij;
+        }
+      }
+      *(qs[bi]+qii)=*(qt+best_qij);
+      cost[bi][qii]=best_cost;
+      ssd[bi][qii]=best_ssd;
+      rate[bi][qii]=best_rate;
+      prev[bi-1][qii]=best_qij;
+    }
+  }
+  best_qii=0;
+  best_cost=cost[3][0];
+  for(qii=1;qii<nqis;qii++){
+    if(cost[3][qii]<best_cost){
+      best_cost=cost[3][qii];
+      best_qii=qii;
+    }
+  }
+  frags=_enc->state.frags;
+  for(bi=3;;){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].qii=best_qii;
+    if(bi--<=0)break;
+    best_qii=prev[bi][best_qii];
+  }
+  return best_cost;
+}
+
+/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
+static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
+  const unsigned char *src;
+  oc_fragment         *frags;
+  ptrdiff_t            frag_offs;
+  oc_qii_state         qt[3];
+  unsigned             cost[3];
+  unsigned             satd;
+  int                  dc;
+  unsigned             best_cost;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[_pli];
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  else{
+    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
+  }
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
+  nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
+  lambda=_enc->lambda;
+  best_qii=0;
+  for(qii=0;qii<nqis;qii++){
+    unsigned cur_rate;
+    unsigned cur_ssd;
+    oc_qii_state_advance(qt+qii,_qs,qii);
+    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
+     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
+    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
+    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
+  }
+  best_cost=cost[0];
+  for(qii=1;qii<nqis;qii++){
+    if(cost[qii]<best_cost){
+      best_cost=cost[qii];
+      best_qii=qii;
+    }
+  }
+  frags=_enc->state.frags;
+  frags[_fragi].qii=best_qii;
+  return best_cost;
+}
+
+static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,
+ const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragi;
+  int                  bi;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  frags=_enc->state.frags;
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  stackptr=stack;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].refi=OC_FRAME_SELF;
+    frags[fragi].mb_mode=OC_MODE_INTRA;
+    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
+     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
+    coded_fragis[ncoded_fragis++]=fragi;
+  }
+  _pipe->ncoded_fragis[0]=ncoded_fragis;
+}
+
+static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  const ogg_uint16_t *mcu_rd_scale;
+  const ogg_uint16_t *mcu_rd_iscale;
+  const oc_sb_map    *sb_maps;
+  ptrdiff_t          *coded_fragis;
+  ptrdiff_t           ncoded_fragis;
+  ptrdiff_t           froffset;
+  int                 sbi;
+  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
+  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  froffset=_pipe->froffset[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    /*Worst case token stack usage for 1 fragment.*/
+    oc_token_checkpoint stack[64];
+    int                 quadi;
+    int                 bi;
+    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
+      ptrdiff_t fragi;
+      fragi=sb_maps[sbi][quadi][bi];
+      if(fragi>=0){
+        oc_token_checkpoint *stackptr;
+        unsigned             rd_scale;
+        unsigned             rd_iscale;
+        rd_scale=mcu_rd_scale[fragi-froffset];
+        rd_iscale=mcu_rd_iscale[fragi-froffset];
+        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
+        stackptr=stack;
+        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
+         rd_scale,rd_iscale,NULL,NULL,&stackptr);
+        coded_fragis[ncoded_fragis++]=fragi;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+}
+
+/*Analysis stage for an INTRA frame.*/
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_mb_map        *mb_maps;
+  const oc_sb_map        *sb_maps;
+  oc_fragment            *frags;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  int                     refi;
+  int                     pli;
+  _enc->state.frame_type=OC_INTRA_FRAME;
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  frags=_enc->state.frags;
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    ptrdiff_t cfroffset;
+    unsigned  sbi;
+    unsigned  sbi_end;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        unsigned  activity[4];
+        unsigned  rd_scale[5];
+        unsigned  rd_iscale[5];
+        unsigned  luma;
+        unsigned  mbi;
+        int       mapii;
+        int       mapi;
+        int       bi;
+        ptrdiff_t fragi;
+        mbi=sbi<<2|quadi;
+        /*Activity masking.*/
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          unsigned intra_satd[12];
+          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
+        }
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
+        luma_sum+=luma;
+        /*Motion estimation:
+          We do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not, unless we aren't using motion estimation at all.*/
+        if(!_recode&&_enc->state.curframe_num>0&&
+         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
+          oc_mcenc_search(_enc,mbi);
+        }
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
+        }
+        mb_modes[mbi]=OC_MODE_INTRA;
+        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
+         mbi,rd_scale,rd_iscale);
+        /*Propagate final MB mode and MVs to the chroma blocks.*/
+        for(mapii=4;mapii<nmap_idxs;mapii++){
+          mapi=map_idxs[mapii];
+          pli=mapi>>2;
+          bi=mapi&3;
+          fragi=mb_maps[mbi][pli][bi];
+          frags[fragi].refi=OC_FRAME_SELF;
+          frags[fragi].mb_mode=OC_MODE_INTRA;
+        }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
+      }
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Compute the average block activity and MB luma score for the frame.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
+}
+
+
+
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+  unsigned      cost;
+  unsigned      ssd;
+  unsigned      rate;
+  unsigned      overhead;
+  unsigned char qii[12];
+};
+
+
+
+static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
+  _modec->cost=OC_MODE_RD_COST(_modec->ssd,
+   _modec->rate+_modec->overhead,_lambda);
+}
+
+/*A set of skip SSD's to use to disable early skipping.*/
+static const unsigned OC_NOSKIP[12]={
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
+};
+
+/*The estimated number of bits used by a coded chroma block to specify the AC
+   quantizer.
+  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
+   measurements suggest this is in the right ballpark, but it varies somewhat
+   with lambda.*/
+#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
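+/*0xCAE00D1D is log2(3) in Q31; shifting it down to Q(OC_BIT_SCALE) and
+   halving with rounding gives 0.5*log2(3)~=0.79 bits in the same fixed-point
+   format as the other rate estimates.*/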
+
+static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4],int _qti){
+  oc_fr_state  fr;
+  oc_qii_state qs;
+  unsigned     ssd;
+  unsigned     rate;
+  unsigned     satd;
+  unsigned     best_ssd;
+  unsigned     best_rate;
+  int          best_fri;
+  int          best_qii;
+  int          lambda;
+  int          nqis;
+  int          nskipped;
+  int          bi;
+  lambda=_enc->lambda;
+  nqis=_enc->state.nqis;
+  /*We could do a trellis optimization here, but we don't make final skip
+     decisions until after transform+quantization, so the result wouldn't be
+     optimal anyway.
+    Instead we just use a greedy approach; for most SATD values, the
+     differences between the qiis are large enough to drown out the cost to
+     code the flags, anyway.*/
+  *&fr=*_fr;
+  *&qs=*_qs;
+  ssd=rate=nskipped=0;
+  for(bi=0;bi<4;bi++){
+    oc_fr_state  ft[2];
+    oc_qii_state qt[3];
+    unsigned     best_cost;
+    unsigned     cur_cost;
+    unsigned     cur_ssd;
+    unsigned     cur_rate;
+    unsigned     cur_overhead;
+    int          qii;
+    satd=_frag_satd[bi];
+    *(ft+0)=*&fr;
+    oc_fr_code_block(ft+0);
+    cur_overhead=ft[0].bits-fr.bits;
+    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
+     +(cur_overhead<<OC_BIT_SCALE);
+    if(nqis>1){
+      oc_qii_state_advance(qt+0,&qs,0);
+      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
+    }
+    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
+    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
+    best_fri=0;
+    best_qii=0;
+    for(qii=1;qii<nqis;qii++){
+      oc_qii_state_advance(qt+qii,&qs,qii);
+      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
+       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
+      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
+      if(cur_cost<best_cost){
+        best_cost=cur_cost;
+        best_ssd=cur_ssd;
+        best_rate=cur_rate;
+        best_qii=qii;
+      }
+    }
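+    /*Consider skipping this block instead, provided its skip SSD is usable
+       and fewer than three blocks have already been skipped (so at least one
+       luma block stays coded); a skip is recorded by adding 4 to the block's
+       qii.*/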
+    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
+      *(ft+1)=*&fr;
+      oc_fr_skip_block(ft+1);
+      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
+      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
+      if(cur_cost<=best_cost){
+        best_ssd=cur_ssd;
+        best_rate=cur_overhead;
+        best_fri=1;
+        best_qii+=4;
+      }
+    }
+    rate+=best_rate;
+    ssd+=best_ssd;
+    *&fr=*(ft+best_fri);
+    if(best_fri==0)*&qs=*(qt+best_qii);
+    else nskipped++;
+    _modec->qii[bi]=best_qii;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+}
+
+static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ unsigned _rd_scale,int _qti){
+  unsigned ssd;
+  unsigned rate;
+  unsigned satd;
+  unsigned best_ssd;
+  unsigned best_rate;
+  int      best_qii;
+  unsigned cur_cost;
+  unsigned cur_ssd;
+  unsigned cur_rate;
+  int      lambda;
+  int      nblocks;
+  int      nqis;
+  int      pli;
+  int      bi;
+  int      qii;
+  lambda=_enc->lambda;
+  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
+     worth spending the bits to change the AC quantizer.
+    TODO: This may be worth revisiting when we separate out DC and AC
+     predictions from SATD.*/
+#if 0
+  nqis=_enc->state.nqis;
+#else
+  nqis=1;
+#endif
+  ssd=_modec->ssd;
+  rate=_modec->rate;
+  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
+     order, we assume a constant overhead for coded block and qii flags.*/
+  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  nblocks=(nblocks-4>>1)+4;
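+  /*bi is not reset between planes: the first pass of the outer loop (pli==1)
+     covers the Cb blocks, and the nblocks update at the bottom extends the
+     range so the second pass (pli==2) covers the Cr blocks.*/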
+  bi=4;
+  for(pli=1;pli<3;pli++){
+    for(;bi<nblocks;bi++){
+      unsigned best_cost;
+      satd=_frag_satd[bi];
+      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
+       +OC_CHROMA_QII_RATE;
+      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
+      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
+      best_qii=0;
+      for(qii=1;qii<nqis;qii++){
+        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
+         +OC_CHROMA_QII_RATE;
+        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
+        if(cur_cost<best_cost){
+          best_cost=cur_cost;
+          best_ssd=cur_ssd;
+          best_rate=cur_rate;
+          best_qii=qii;
+        }
+      }
+      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
+        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
+        if(cur_cost<=best_cost){
+          best_ssd=cur_ssd;
+          best_rate=0;
+          best_qii+=4;
+        }
+      }
+      rate+=best_rate;
+      ssd+=best_ssd;
+      _modec->qii[bi]=best_qii;
+    }
+    nblocks=(nblocks-4<<1)+4;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+}
+
+static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
+ unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const oc_fragment     *frags;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  oc_mv                 *mvs;
+  int                    map_nidxs;
+  unsigned               uncoded_ssd;
+  int                    mapii;
+  int                    mapi;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    borderi;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  ystride=_enc->state.ref_ystride[0];
+  frags=_enc->state.frags;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  mvs=_enc->mb_info[_mbi].block_mv;
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    borderi=frags[fragi].borderi;
+    frag_offs=frag_buf_offs[fragi];
+    if(borderi<0){
+      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
+    }
+    else{
+      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
+    }
+    /*Scale to match DCT domain and RD.*/
+    uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
+    /*Motion is a special case; if there is more than a full-pixel motion
+       against the prior frame, penalize skipping.
+      TODO: The factor of two here is a kludge, but it tested out better than a
+       hard limit.*/
+    if(mvs[bi]!=0)uncoded_ssd*=2;
+    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=(map_nidxs-4>>1)+4;
+  mapii=4;
+  mvs=_enc->mb_info[_mbi].unref_mv;
+  for(pli=1;pli<3;pli++){
+    ystride=_enc->state.ref_ystride[pli];
+    for(;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      borderi=frags[fragi].borderi;
+      frag_offs=frag_buf_offs[fragi];
+      if(borderi<0){
+        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
+      }
+      else{
+        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
+      }
+      /*Scale to match DCT domain and RD.*/
+      uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
+      /*Motion is a special case; if there is more than a full-pixel motion
+         against the prior frame, penalize skipping.
+        TODO: The factor of two here is a kludge, but it tested out better than
+         a hard limit.*/
+      if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
+      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
+    }
+    map_nidxs=(map_nidxs-4<<1)+4;
+  }
+}
+
+
+static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[5]){
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   _frag_satd,_skip_ssd,_rd_scale[4],0);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
+  unsigned               frag_satd[12];
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    mv_offs[2];
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    dc;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  _modec->rate=_modec->ssd=0;
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
+    for(bi=0;bi<4;bi++){
+      fragi=sb_map[bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
+    }
+  }
+  else{
+    for(bi=0;bi<4;bi++){
+      fragi=sb_map[bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[bi]+=abs(dc);
+      }
+      else{
+        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
+    }
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+      }
+    }
+  }
+  else{
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
+        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+        frag_satd[mapii]+=abs(dc);
+      }
+      else{
+        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
+         ref+frag_offs+mv_offs[0],ystride);
+      }
+    }
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
+}
+
+static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,oc_mv _mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
+ const unsigned _rd_scale[4]){
+  int bits0;
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
+  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
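+  /*The extra overhead is the increase in the cheaper of the two MV coding
+     schemes' running totals when this MV is added: index 0 codes each
+     component with the OC_MV_BITS VLC, index 1 spends a flat 12 bits per MV.*/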
+  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+  return bits0;
+}
+
+/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
+static const unsigned char OC_MB_PHASE[4][4]={
+  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
+};
+
+static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
+  unsigned               frag_satd[12];
+  oc_mv                  lbmvs[4];
+  oc_mv                  cbmvs[4];
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const ptrdiff_t       *frag_buf_offs;
+  oc_mv                 *frag_mvs;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    nqis;
+  int                    mapii;
+  int                    mapi;
+  int                    mv_offs[2];
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    bits0;
+  int                    bits1;
+  unsigned               satd;
+  int                    dc;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  frag_mvs=_enc->state.frag_mvs;
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  _modec->rate=_modec->ssd=0;
+  for(bi=0;bi<4;bi++){
+    fragi=mb_map[0][bi];
+    /*Save the block MVs as the current ones while we're here; we'll replace
+       them if we don't ultimately choose 4MV mode.*/
+    frag_mvs[fragi]=_mv[bi];
+    frag_offs=frag_buf_offs[fragi];
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+    }
+    else{
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+    }
+    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
+   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
+  /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
+  bits0=0;
+  bits1=0;
+  nqis=_enc->state.nqis;
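+  /*oc_analyze_mb_mode_luma adds 4 to the qii of blocks it decided to skip, so
+     a qii of nqis or more marks a skipped block.*/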
+  for(bi=0;bi<4;bi++){
+    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
+    else{
+      lbmvs[bi]=_mv[bi];
+      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
+       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
+      bits1+=12;
+    }
+  }
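+  /*Derive the chroma-plane MVs from the (possibly zeroed) luma block MVs
+     using the pixel-format specific helper (an average of the luma MVs for
+     the subsampled formats).*/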
+  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    /*TODO: We could save half these calls by re-using the results for the Cb
+       and Cr planes; is it worth it?*/
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
+      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
+    }
+    else{
+      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride);
+    }
+    frag_satd[mapii]=satd+abs(dc);
+  }
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
+   frag_satd,_skip_ssd,_rd_scale[4],1);
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
+   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  oc_qii_state            intra_luma_qs;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  ogg_int64_t             interbits;
+  ogg_int64_t             intrabits;
+  ogg_int64_t             activity_sum;
+  ogg_int64_t             luma_sum;
+  unsigned                activity_avg;
+  unsigned                luma_avg;
+  const ogg_uint16_t     *chroma_rd_scale;
+  ogg_uint16_t           *mcu_rd_scale;
+  ogg_uint16_t           *mcu_rd_iscale;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  unsigned               *coded_mbis;
+  unsigned               *uncoded_mbis;
+  size_t                  ncoded_mbis;
+  size_t                  nuncoded_mbis;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_sb_map        *sb_maps;
+  const oc_mb_map        *mb_maps;
+  oc_mb_enc_info         *embs;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  unsigned                sbi;
+  unsigned                sbi_end;
+  int                     refi;
+  int                     pli;
+  int                     sp_level;
+  sp_level=_enc->sp_level;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
+  _enc->state.frame_type=OC_INTER_FRAME;
+  oc_mode_scheme_chooser_reset(&_enc->chooser);
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&_enc->pipe);
+  oc_enc_mode_rd_init(_enc);
+  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
+  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
+  interbits=intrabits=0;
+  activity_sum=luma_sum=0;
+  activity_avg=_enc->activity_avg;
+  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
+  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
+  mcu_rd_scale=_enc->mcu_rd_scale;
+  mcu_rd_iscale=_enc->mcu_rd_iscale;
+  last_mv=prior_mv=0;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  coded_mbis=_enc->coded_mbis;
+  uncoded_mbis=coded_mbis+_enc->state.nmbs;
+  ncoded_mbis=0;
+  nuncoded_mbis=0;
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  embs=_enc->mb_info;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    ptrdiff_t cfroffset;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+    sbi_end=_enc->pipe.sbi_end[0];
+    cfroffset=_enc->pipe.froffset[1];
+    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        oc_mode_choice modes[8];
+        unsigned       activity[4];
+        unsigned       rd_scale[5];
+        unsigned       rd_iscale[5];
+        unsigned       skip_ssd[12];
+        unsigned       intra_satd[12];
+        unsigned       luma;
+        int            mb_mv_bits_0;
+        int            mb_gmv_bits_0;
+        int            inter_mv_pref;
+        int            mb_mode;
+        int            refi;
+        int            mv;
+        unsigned       mbi;
+        int            mapii;
+        int            mapi;
+        int            bi;
+        ptrdiff_t      fragi;
+        mbi=sbi<<2|quadi;
+        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+        /*Activity masking.*/
+        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          oc_mb_activity(_enc,mbi,activity);
+        }
+        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+        luma_sum+=luma;
+        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
+         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
+        /*Motion estimation:
+          We always do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not.*/
+        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
+        mv=0;
+        /*Find the block choice with the lowest estimated coding cost.
+          If a Cb or Cr block is coded but no Y' block from a macro block then
+           the mode MUST be OC_MODE_INTER_NOMV.
+          This is the default state to which the mode data structure is
+           initialised in encoder and decoder at the start of each frame.*/
+        /*Block coding cost is estimated from correlated SATD metrics.*/
+        /*At this point, all blocks that are in frame are still marked coded.*/
+        if(!_recode){
+          embs[mbi].unref_mv[OC_FRAME_GOLD]=
+           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
+          embs[mbi].unref_mv[OC_FRAME_PREV]=
+           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+          embs[mbi].refined=0;
+        }
+        /*Estimate the cost of coding this MB in a keyframe.*/
+        if(_allow_keyframe){
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
+          intrabits+=modes[OC_MODE_INTRA].rate;
+          for(bi=0;bi<4;bi++){
+            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
+             modes[OC_MODE_INTRA].qii[bi]);
+          }
+        }
+        /*Estimate the cost in a delta frame for various modes.*/
+        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
+        if(sp_level<OC_SP_LEVEL_NOMC){
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+             refinement.
+            We choose the explicit MV mode that's already furthest ahead on
+             R-D cost and refine only that one.
+            We have to be careful to remember which ones we've refined so that
+             we don't refine it again if we re-encode this frame.*/
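+          /*Bias the final decision towards OC_MODE_INTER_MV: it wins unless
+             the best other mode beats it by this lambda-scaled margin (the
+             margin is dropped below when LAST or LAST2 is the best mode).*/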
+          inter_mv_pref=_enc->lambda*3;
+          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
+          }
+          else{
+            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
+          }
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+            if(!(embs[mbi].refined&0x80)){
+              oc_mcenc_refine4mv(_enc,mbi);
+              embs[mbi].refined|=0x80;
+            }
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+             skip_ssd,rd_scale);
+          }
+          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+           modes[OC_MODE_INTER_MV].cost){
+            if(!(embs[mbi].refined&0x40)){
+              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+              embs[mbi].refined|=0x40;
+            }
+            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
+             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          }
+          if(!(embs[mbi].refined&0x04)){
+            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+            embs[mbi].refined|=0x04;
+          }
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
+           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
+          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST2;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_MV;
+          }
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_FOUR;
+          }
+          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+            inter_mv_pref=0;
+          }
+          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
+            mb_mode=OC_MODE_INTER_MV;
+          }
+        }
+        else{
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+           skip_ssd,rd_scale);
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          mb_mv_bits_0=mb_gmv_bits_0=0;
+        }
+        mb_modes[mbi]=mb_mode;
+        /*Propagate the MVs to the luma blocks.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+            }break;
+            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
+            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
+            case OC_MODE_GOLDEN_MV:{
+              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
+            }break;
+          }
+          for(bi=0;bi<4;bi++){
+            fragi=mb_maps[mbi][0][bi];
+            frag_mvs[fragi]=mv;
+          }
+        }
+        for(bi=0;bi<4;bi++){
+          fragi=sb_maps[mbi>>2][mbi&3][bi];
+          frags[fragi].qii=modes[mb_mode].qii[bi];
+        }
+        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
+         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
+          int orig_mb_mode;
+          orig_mb_mode=mb_mode;
+          mb_mode=mb_modes[mbi];
+          refi=OC_FRAME_FOR_MODE(mb_mode);
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              prior_mv=last_mv;
+              /*If we're backing out from 4MV, find the MV we're actually
+                 using.*/
+              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+                for(bi=0;;bi++){
+                  fragi=mb_maps[mbi][0][bi];
+                  if(frags[fragi].coded){
+                    mv=last_mv=frag_mvs[fragi];
+                    break;
+                  }
+                }
+                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
+                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
+              }
+              /*Otherwise we used the original analysis MV.*/
+              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
+              _enc->mv_bits[0]+=mb_mv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              oc_mv tmp_mv;
+              tmp_mv=prior_mv;
+              prior_mv=last_mv;
+              last_mv=tmp_mv;
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              _enc->mv_bits[0]+=mb_gmv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              oc_mv lbmvs[4];
+              oc_mv cbmvs[4];
+              prior_mv=last_mv;
+              for(bi=0;bi<4;bi++){
+                fragi=mb_maps[mbi][0][bi];
+                if(frags[fragi].coded){
+                  lbmvs[bi]=last_mv=frag_mvs[fragi];
+                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
+                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
+                  _enc->mv_bits[1]+=12;
+                }
+                /*Replace the block MVs for not-coded blocks with (0,0).*/
+                else lbmvs[bi]=0;
+              }
+              (*set_chroma_mvs)(cbmvs,lbmvs);
+              for(mapii=4;mapii<nmap_idxs;mapii++){
+                mapi=map_idxs[mapii];
+                pli=mapi>>2;
+                bi=mapi&3;
+                fragi=mb_maps[mbi][pli][bi];
+                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
+                frags[fragi].refi=refi;
+                frags[fragi].mb_mode=mb_mode;
+                frag_mvs[fragi]=cbmvs[bi];
+              }
+            }break;
+          }
+          coded_mbis[ncoded_mbis++]=mbi;
+          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+        }
+        else{
+          *(uncoded_mbis-++nuncoded_mbis)=mbi;
+          mb_mode=OC_MODE_INTER_NOMV;
+          refi=OC_FRAME_PREV;
+          mv=0;
+        }
+        /*Propagate final MB mode and MVs to the chroma blocks.
+          This has already been done for 4MV mode, since it requires individual
+           block motion vectors.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          for(mapii=4;mapii<nmap_idxs;mapii++){
+            mapi=map_idxs[mapii];
+            pli=mapi>>2;
+            bi=mapi&3;
+            fragi=mb_maps[mbi][pli][bi];
+            /*If we switched from 4MV mode to INTER_MV mode, then the qii
+               values won't have been chosen with the right MV, but it's
+               probably not worth re-estimating them.*/
+            frags[fragi].qii=modes[mb_mode].qii[mapii];
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mv;
+          }
+        }
+        /*Save masking scale factors for chroma blocks.*/
+        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][1][bi];
+          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
+          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
+        }
+      }
+      oc_fr_state_flush_sb(_enc->pipe.fr+0);
+      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
+      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
+       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Update the average block activity and MB luma score for the frame.
+    We could use a Bessel follower here, but fast reaction is probably almost
+     always best.*/
+  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
+   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
+   _enc->state.fplanes[0].nfrags));
+  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  /*Finish adding flagging overhead costs to inter bit counts to determine if
+     we should have coded a key frame instead.*/
+  if(_allow_keyframe){
+    /*Technically the chroma plane counts are over-estimations, because they
+       don't account for continuing runs from the luma planes, but the
+       inaccuracy is small.
+      We don't need to add the luma plane coding flag costs, because they are
+       already included in the MB rate estimates.*/
+    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
+    if(interbits>intrabits)return 1;
+  }
+  _enc->ncoded_mbis=ncoded_mbis;
+  /*Compact the coded fragment list.*/
+  {
+    ptrdiff_t ncoded_fragis;
+    ncoded_fragis=_enc->state.ncoded_fragis[0];
+    for(pli=1;pli<3;pli++){
+      memmove(_enc->state.coded_fragis+ncoded_fragis,
+       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
+       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
+      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    }
+    _enc->state.ntotal_coded_fragis=ncoded_fragis;
+  }
+  return 0;
+}

+ 166 - 0
modules/theoraplayer/native/theora/lib/apiwrapper.c

@@ -0,0 +1,166 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+
+
+
+const char *theora_version_string(void){
+  return th_version_string();
+}
+
+ogg_uint32_t theora_version_number(void){
+  return th_version_number();
+}
+
+void theora_info_init(theora_info *_ci){
+  memset(_ci,0,sizeof(*_ci));
+}
+
+void theora_info_clear(theora_info *_ci){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  memset(_ci,0,sizeof(*_ci));
+  if(api!=NULL){
+    if(api->clear!=NULL)(*api->clear)(api);
+    _ogg_free(api);
+  }
+}
+
+void theora_clear(theora_state *_th){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    (*((oc_state_dispatch_vtable *)_th->internal_decode)->clear)(_th);
+  }
+  if(_th->internal_encode!=NULL){
+    (*((oc_state_dispatch_vtable *)_th->internal_encode)->clear)(_th);
+  }
+  if(_th->i!=NULL)theora_info_clear(_th->i);
+  memset(_th,0,sizeof(*_th));
+}
+
+int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->control)(_th,
+     _req,_buf,_buf_sz);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->control)(_th,
+     _req,_buf,_buf_sz);
+  }
+  else return TH_EINVAL;
+}
+
+ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_frame)(
+     _th,_gp);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_frame)(
+     _th,_gp);
+  }
+  else return -1;
+}
+
+double theora_granule_time(theora_state *_th, ogg_int64_t _gp){
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  if(_th->internal_decode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_time)(
+     _th,_gp);
+  }
+  else if(_th->internal_encode!=NULL){
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_time)(
+     _th,_gp);
+  }
+  else return -1;
+}
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){
+  _info->version_major=_ci->version_major;
+  _info->version_minor=_ci->version_minor;
+  _info->version_subminor=_ci->version_subminor;
+  _info->frame_width=_ci->width;
+  _info->frame_height=_ci->height;
+  _info->pic_width=_ci->frame_width;
+  _info->pic_height=_ci->frame_height;
+  _info->pic_x=_ci->offset_x;
+  _info->pic_y=_ci->offset_y;
+  _info->fps_numerator=_ci->fps_numerator;
+  _info->fps_denominator=_ci->fps_denominator;
+  _info->aspect_numerator=_ci->aspect_numerator;
+  _info->aspect_denominator=_ci->aspect_denominator;
+  switch(_ci->colorspace){
+    case OC_CS_ITU_REC_470M:_info->colorspace=TH_CS_ITU_REC_470M;break;
+    case OC_CS_ITU_REC_470BG:_info->colorspace=TH_CS_ITU_REC_470BG;break;
+    default:_info->colorspace=TH_CS_UNSPECIFIED;break;
+  }
+  switch(_ci->pixelformat){
+    case OC_PF_420:_info->pixel_fmt=TH_PF_420;break;
+    case OC_PF_422:_info->pixel_fmt=TH_PF_422;break;
+    case OC_PF_444:_info->pixel_fmt=TH_PF_444;break;
+    default:_info->pixel_fmt=TH_PF_RSVD;
+  }
+  _info->target_bitrate=_ci->target_bitrate;
+  _info->quality=_ci->quality;
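+  /*The granule shift is the number of low granule position bits used to count
+     frames since the last keyframe; e.g., a forced keyframe frequency of 64
+     should yield a shift of 6 (oc_ilog(63)==6), i.e. up to 64 frames per
+     keyframe interval.*/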
+  _info->keyframe_granule_shift=_ci->keyframe_frequency_force>0?
+   OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1)):0;
+}
+
+int theora_packet_isheader(ogg_packet *_op){
+  return th_packet_isheader(_op);
+}
+
+int theora_packet_iskeyframe(ogg_packet *_op){
+  return th_packet_iskeyframe(_op);
+}
+
+int theora_granule_shift(theora_info *_ci){
+  /*This breaks when keyframe_frequency_force is not positive or is larger than
+     2**31 (if your int is more than 32 bits), but that's what the original
+     function does.*/
+  return oc_ilog(_ci->keyframe_frequency_force-1);
+}
+
+void theora_comment_init(theora_comment *_tc){
+  th_comment_init((th_comment *)_tc);
+}
+
+char *theora_comment_query(theora_comment *_tc,char *_tag,int _count){
+  return th_comment_query((th_comment *)_tc,_tag,_count);
+}
+
+int theora_comment_query_count(theora_comment *_tc,char *_tag){
+  return th_comment_query_count((th_comment *)_tc,_tag);
+}
+
+void theora_comment_clear(theora_comment *_tc){
+  th_comment_clear((th_comment *)_tc);
+}
+
+void theora_comment_add(theora_comment *_tc,char *_comment){
+  th_comment_add((th_comment *)_tc,_comment);
+}
+
+void theora_comment_add_tag(theora_comment *_tc, char *_tag, char *_value){
+  th_comment_add_tag((th_comment *)_tc,_tag,_value);
+}

+ 54 - 0
modules/theoraplayer/native/theora/lib/apiwrapper.h

@@ -0,0 +1,54 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.h 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_apiwrapper_H)
+# define _apiwrapper_H (1)
+# include <ogg/ogg.h>
+# include <theora/theora.h>
+# include "theora/theoradec.h"
+# include "theora/theoraenc.h"
+# include "state.h"
+
+typedef struct th_api_wrapper th_api_wrapper;
+typedef struct th_api_info    th_api_info;
+
+/*Provide an entry point for the codec setup to clear itself in case we ever
+   want to break pieces off into a common base library shared by encoder and
+   decoder.
+  In addition, this makes several other pieces of the API wrapper cleaner.*/
+typedef void (*oc_setup_clear_func)(void *_ts);
+
+/*Generally only one of these pointers will be non-NULL in any given instance.
+  Technically we do not even really need this struct, since we should be able
+   to figure out which one from "context", but doing it this way makes sure we
+   don't flub it up.*/
+struct th_api_wrapper{
+  oc_setup_clear_func  clear;
+  th_setup_info       *setup;
+  th_dec_ctx          *decode;
+  th_enc_ctx          *encode;
+};
+
+struct th_api_info{
+  th_api_wrapper api;
+  theora_info    info;
+};
+
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci);
+
+#endif

+ 304 - 0
modules/theoraplayer/native/theora/lib/arm/arm2gnu.pl

@@ -0,0 +1,304 @@
+#!/usr/bin/perl
+
+my $bigend;  # little/big endian
+my $nxstack;
+
+$nxstack = 0;
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+    if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+    $_ = shift;
+    last if /^--/;
+    if (/^-n/) {
+        $nflag++;
+        next;
+    }
+    die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n";      # automatically add newline on print
+$n=0;
+
+$thumb = 0;     # ARM mode by default, not Thumb.
+@proc_stack = ();
+
+LINE:
+while (<>) {
+
+    # For ADRLs we need to add a new line after the substituted one.
+    $addPadding = 0;
+
+    # First, we do not dare to touch *anything* inside double quotes, do we?
+    # Second, if you want a dollar character in the string,
+    # insert two of them -- that's how ARM C and assembler treat strings.
+    s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1:   .ascii \"/   && do { s/\$\$/\$/g; next };
+    s/\bDCB\b[ \t]*\"/.ascii \"/                          && do { s/\$\$/\$/g; next };
+    s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/                    && do { s/\$\$/\$/g; next };
+    # If there's nothing on a line but a comment, don't try to apply any further
+    #  substitutions (this is a cheap hack to avoid mucking up the license header)
+    s/^([ \t]*);/$1@/                                     && do { s/\$\$/\$/g; next };
+    # If substituted -- leave immediately !
+
+    s/@/,:/;
+    s/;/@/;
+    while ( /@.*'/ ) {
+      s/(@.*)'/$1/g;
+    }
+    s/\{FALSE\}/0/g;
+    s/\{TRUE\}/1/g;
+    s/\{(\w\w\w\w+)\}/$1/g;
+    s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+    s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+    s/\bIMPORT\b/.extern/;
+    s/\bEXPORT\b/.global/;
+    s/^(\s+)\[/$1IF/;
+    s/^(\s+)\|/$1ELSE/;
+    s/^(\s+)\]/$1ENDIF/;
+    s/IF *:DEF:/ .ifdef/;
+    s/IF *:LNOT: *:DEF:/ .ifndef/;
+    s/ELSE/ .else/;
+    s/ENDIF/ .endif/;
+
+    if( /\bIF\b/ ) {
+      s/\bIF\b/ .if/;
+      s/=/==/;
+    }
+    if ( $n == 2) {
+        s/\$/\\/g;
+    }
+    if ($n == 1) {
+        s/\$//g;
+        s/label//g;
+        $n = 2;
+    }
+    if ( /MACRO/ ) {
+      s/MACRO *\n/.macro/;
+      $n=1;
+    }
+    if ( /\bMEND\b/ ) {
+      s/\bMEND\b/.endm/;
+      $n=0;
+    }
+
+    # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+    #
+    if ( /\bAREA\b/ ) {
+        my $align;
+        $align = "2";
+        if ( /ALIGN=(\d+)/ ) {
+            $align = $1;
+        }
+        if ( /CODE/ ) {
+            $nxstack = 1;
+        }
+        s/^(.+)CODE(.+)READONLY(.*)/    .text/;
+        s/^(.+)DATA(.+)READONLY(.*)/    .section .rdata/;
+        s/^(.+)\|\|\.data\|\|(.+)/    .data/;
+        s/^(.+)\|\|\.bss\|\|(.+)/    .bss/;
+        s/$/;   .p2align $align/;
+    }
+
+    s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/;       # ||.constdata$3||
+    s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/;               # ||.bss$2||
+    s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/;             # ||.data$2||
+    s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+    s/^(\s+)\%(\s)/    .space $1/;
+
+    s/\|(.+)\.(\d+)\|/\.$1_$2/;                     # |L80.123| -> .L80_123
+    s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+    s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+    if (/\bPROC\b/)
+    {
+        my $prefix;
+        my $proc;
+        /^([A-Za-z_\.]\w+)\b/;
+        $proc = $1;
+        $prefix = "";
+        if ($proc)
+        {
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc);
+            push(@proc_stack, $proc);
+            s/^[A-Za-z_\.]\w+/$&:/;
+        }
+        $prefix = $prefix."\t.thumb_func; " if ($thumb);
+        s/\bPROC\b/@ $&/;
+        $_ = $prefix.$_;
+    }
+    s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+    s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
+    s/\bSUBT\b/@ $&/;
+    s/\bDATA\b/@ $&/;   # DATA directive is deprecated -- Asm guide, p.7-25
+    s/\bKEEP\b/@ $&/;
+    s/\bEXPORTAS\b/@ $&/;
+    s/\|\|(.)+\bEQU\b/@ $&/;
+    s/\|\|([\w\$]+)\|\|/$1/;
+    s/\bENTRY\b/@ $&/;
+    s/\bASSERT\b/@ $&/;
+    s/\bGBLL\b/@ $&/;
+    s/\bGBLA\b/@ $&/;
+    s/^\W+OPT\b/@ $&/;
+    s/:OR:/|/g;
+    s/:SHL:/<</g;
+    s/:SHR:/>>/g;
+    s/:AND:/&/g;
+    s/:LAND:/&&/g;
+    s/CPSR/cpsr/;
+    s/SPSR/spsr/;
+    s/ALIGN$/.balign 4/;
+    s/ALIGN\s+([0-9x]+)$/.balign $1/;
+    s/psr_cxsf/psr_all/;
+    s/LTORG/.ltorg/;
+    s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+    s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+    #  {PC} + 0xdeadfeed  -->  . + 0xdeadfeed
+    s/\{PC\} \+/ \. +/;
+
+    # Single hex constant on the line !
+    #
+    # >>> NOTE <<<
+    #   Double-precision floats in gcc are always mixed-endian, which means
+    #   bytes in two words are little-endian, but words are big-endian.
+    #   So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+    #   and 0xfeed0000 at high address.
+    #
+    s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+    s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+    # Single hex constant on the line !
+#    s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+    # Only decimal constants on the line, no hex !
+#    s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+    s/\bDCFS[ \t]+0x/.word 0x/;
+    s/\bDCFS\b/.float/;
+
+    s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+    s/\bDCD\b/.word/;
+    s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+    s/\bDCW\b/.short/;
+    s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+    s/\bDCB\b/.byte/;
+    s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+    s/^[A-Za-z_\.]\w+/$&:/;
+    s/^(\d+)/$1:/;
+    s/\%(\d+)/$1b_or_f/;
+    s/\%[Bb](\d+)/$1b/;
+    s/\%[Ff](\d+)/$1f/;
+    s/\%[Ff][Tt](\d+)/$1f/;
+    s/&([\dA-Fa-f]+)/0x$1/;
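+    # Convert ARM-style binary literals (2_0101...) to hex: the bits are
+    #  consumed four at a time from the least-significant end and re-emitted
+    #  as 0x... digits.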
+    if ( /\b2_[01]+\b/ ) {
+      s/\b2_([01]+)\b/conv$1&&&&/g;
+      while ( /[01][01][01][01]&&&&/ ) {
+        s/0000&&&&/&&&&0/g;
+        s/0001&&&&/&&&&1/g;
+        s/0010&&&&/&&&&2/g;
+        s/0011&&&&/&&&&3/g;
+        s/0100&&&&/&&&&4/g;
+        s/0101&&&&/&&&&5/g;
+        s/0110&&&&/&&&&6/g;
+        s/0111&&&&/&&&&7/g;
+        s/1000&&&&/&&&&8/g;
+        s/1001&&&&/&&&&9/g;
+        s/1010&&&&/&&&&A/g;
+        s/1011&&&&/&&&&B/g;
+        s/1100&&&&/&&&&C/g;
+        s/1101&&&&/&&&&D/g;
+        s/1110&&&&/&&&&E/g;
+        s/1111&&&&/&&&&F/g;
+      }
+      s/000&&&&/&&&&0/g;
+      s/001&&&&/&&&&1/g;
+      s/010&&&&/&&&&2/g;
+      s/011&&&&/&&&&3/g;
+      s/100&&&&/&&&&4/g;
+      s/101&&&&/&&&&5/g;
+      s/110&&&&/&&&&6/g;
+      s/111&&&&/&&&&7/g;
+      s/00&&&&/&&&&0/g;
+      s/01&&&&/&&&&1/g;
+      s/10&&&&/&&&&2/g;
+      s/11&&&&/&&&&3/g;
+      s/0&&&&/&&&&0/g;
+      s/1&&&&/&&&&1/g;
+      s/conv&&&&/0x/g;
+    }
+
+    if ( /commandline/)
+    {
+        if( /-bigend/)
+        {
+            $bigend=1;
+        }
+    }
+
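+    # DCDU (a word constant with no alignment requirement): re-emit the 32-bit
+    #  value as four .byte directives in the endianness implied by the -bigend
+    #  flag detected above.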
+    if ( /\bDCDU\b/ )
+    {
+        my $cmd=$_;
+        my $value;
+        my $prefix;
+        my $w1;
+        my $w2;
+        my $w3;
+        my $w4;
+
+        s/\s+DCDU\b/@ $&/;
+
+        $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+        $value = $1;
+        $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+        $w1 = $1;
+        $w2 = $2;
+        $w3 = $3;
+        $w4 = $4;
+
+        if( $bigend ne "")
+        {
+            # big endian
+            $prefix = "\t.byte\t0x".$w1.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w4."; ";
+        }
+        else
+        {
+            # little endian
+            $prefix = "\t.byte\t0x".$w4.";".
+                      "\t.byte\t0x".$w3.";".
+                      "\t.byte\t0x".$w2.";".
+                      "\t.byte\t0x".$w1."; ";
+        }
+        $_=$prefix.$_;
+    }
+
+    if ( /\badrl\b/i )
+    {
+        s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+        $addPadding = 1;
+    }
+    s/\bEND\b/@ END/;
+} continue {
+    printf ("%s", $_) if $printit;
+    if ($addPadding != 0)
+    {
+        printf ("   mov r0,r0\n");
+        $addPadding = 0;
+    }
+}
+#If we had a code section, mark that this object doesn't need an executable
+# stack.
+if ($nxstack) {
+    printf ("    .section\t.note.GNU-stack,\"\",\%\%progbits\n");
+}

+ 231 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.asm

@@ -0,0 +1,231 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@
+@ function:
+@   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+@
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global oc_pack_read_arm
+	.global oc_pack_read1_arm
+	.global oc_huff_token_decode_arm
+
+	.type	oc_pack_read1_arm, %function; oc_pack_read1_arm: @ PROC
+	@ r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,#1          @ r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      @ r0 = window>>31
+	MOV r2,r2,LSL #1       @ r2 = window<<=1
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+	.size oc_pack_read1_arm, .-oc_pack_read1_arm	@ ENDP
+
+	.type	oc_pack_read_arm, %function; oc_pack_read_arm: @ PROC
+	@ r0 = oc_pack_buf *_b
+	@ r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,r1          @ r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+
+@ We need to refill window.
+oc_pack_read1_refill:
+	MOV r1,#1
+oc_pack_read_refill:
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     @ r10 = stop
+	                       @ r11 = ptr
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	RSB r3,r3,r0           @ r3 = 32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
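+@ The refill below is unrolled four times; each step pulls in one more byte as
+@  long as ptr<stop and there is still room for it (available<=24).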
+	CMP r10,r11            @ ptr<stop => HI
+	CMPHI r3,#7            @   available<=24 => HI
+	LDRHIB r14,[r11],#1    @     r14 = *ptr++
+	SUBHI r3,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @     r2 = window|=r14<<32-available
+	CMPHI r10,r11          @     ptr<stop => HI
+	CMPHI r3,#7            @       available<=24 => HI
+	LDRHIB r14,[r11],#1    @         r14 = *ptr++
+	SUBHI r3,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @         r2 = window|=r14<<32-available
+	CMPHI r10,r11          @         ptr<stop => HI
+	CMPHI r3,#7            @           available<=24 => HI
+	LDRHIB r14,[r11],#1    @             r14 = *ptr++
+	SUBHI r3,#8            @             available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @             r2 = window|=r14<<32-available
+	CMPHI r10,r11          @             ptr<stop => HI
+	CMPHI r3,#7            @               available<=24 => HI
+	LDRHIB r14,[r11],#1    @                 r14 = *ptr++
+	SUBHI r3,#8            @                 available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          @ r3 = available-=_bits, available<bits => GT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+@ Either we wanted to read more than 24 bits and didn't have enough room to
+@  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last:
+	CMP r11,r10            @ ptr<stop => LO
+@ If we didn't hit the end of the packet, then pull enough of the next byte
+@  to fill up the window.
+	LDRLOB r14,[r11]       @ (LO) r14 = *ptr
+@ Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           @ (HS) r14 = 1
+	ADDLO r10,r3,r1        @ (LO) r10 = available
+	STRHS r14,[r12,#8]     @ (HS) eof = 1
+	ANDLO r10,r10,#7       @ (LO) r10 = available&0x7
+	MOVHS r3,#1<<30        @ (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   @ (LO) r2 = window|=*ptr>>(available&0x7)
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+	.size oc_pack_read_arm, .-oc_pack_read_arm	@ ENDP
+
+
+
+	.type	oc_huff_token_decode_arm, %function; oc_huff_token_decode_arm: @ PROC
+	@ r0 = oc_pack_buf       *_b
+	@ r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         @ r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       @ r2 = stop
+	@ Stall...             ; r3 = ptr
+	@ Stall...             ; r4 = window
+	                       @ r5 = available
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  @ r14 = _tree+bits
+	LDRSH r12,[r14,#2]     @ r12 = node=_tree[1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+@ The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue:
+	ADD r12,r1,r12,LSL #1  @ r12 = _tree+node
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r5,r10          @ r5 = available-=n
+	LDRSH r10,[r12],#2     @ r10 = n=_tree[node]
+	@ Stall...             ; r12 = _tree+node+1
+	@ Stall...
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0:
+	ADD r12,r1,#2          @ r12 = _tree+1
+oc_huff_token_decode_refill:
+@ We can't possibly need more than 15 bits, so available must be <= 15.
+@ Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              @ ptr<stop => HI
+	LDRHIB r14,[r3],#1     @   r14 = *ptr++
+	RSBHI r5,r5,#24        @ (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        @ (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    @   r4 = window|=r14<<32-available
+	CMPHI r2,r3            @   ptr<stop => HI
+	LDRHIB r14,[r3],#1     @     r14 = *ptr++
+	SUBHI r5,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @     r4 = window|=r14<<32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMPHI r2,r3            @     ptr<stop => HI
+	CMPHI r5,#7            @       available<=24 => HI
+	LDRHIB r14,[r3],#1     @         r14 = *ptr++
+	SUBHI r5,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @         r4 = window|=r14<<32-available
+	CMP r2,r3              @ ptr<stop => HI
+	MOVLS r5,#-1<<30       @ (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            @ (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     @ (HI)   r14 = *ptr++
+	SUBHI r5,#8            @ (HI)   available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @ (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          @ r5 = available
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	.size oc_huff_token_decode_arm, .-oc_huff_token_decode_arm	@ ENDP
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
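oc_pack_read_arm and oc_huff_token_decode_arm above both work from a 32-bit window of buffered bits kept MSB-aligned, with available counting how many of them are valid: reads take bits off the top, refills append whole bytes underneath until the packet's stop pointer is reached, and at the end of the packet the eof flag is set and available is faked to OC_LOTS_OF_BITS so further reads just return zero bits. Below is a minimal C sketch of that scheme, assuming bit counts of at most 24; the struct and function names are made up for illustration, and the real oc_pack_buf lives in libtheora's bitpack.h.

#include <stdio.h>

typedef struct {
  const unsigned char *stop;       /* one past the last byte of the packet  */
  const unsigned char *ptr;        /* next byte to pull into the window     */
  unsigned long        window;     /* up to 32 buffered bits, MSB-aligned   */
  int                  available;  /* number of valid bits in window        */
  int                  eof;        /* set once we run off the packet's end  */
} pack_buf_sketch;

/* Read 1..24 bits MSB-first, refilling the window one byte at a time. */
static long pack_read_sketch(pack_buf_sketch *b, int bits) {
  while (b->available < bits) {
    if (b->ptr >= b->stop) {
      b->eof = 1;
      b->available = 1 << 30;   /* "lots of bits": later reads yield zeros */
      break;
    }
    /* The new byte slots in just below the bits already in the window. */
    b->window |= (unsigned long)*b->ptr++ << (32 - b->available - 8);
    b->available += 8;
  }
  long ret = (long)((b->window >> (32 - bits)) & ((1UL << bits) - 1));
  b->window = (b->window << bits) & 0xFFFFFFFFUL;
  b->available -= bits;
  return ret;
}

int main(void) {
  static const unsigned char pkt[2] = { 0xA5, 0x3C };  /* bits: 10100101 00111100 */
  pack_buf_sketch b = { pkt + 2, pkt, 0, 0, 0 };
  long a = pack_read_sketch(&b, 4);   /* 1010      -> 10 */
  long c = pack_read_sketch(&b, 8);   /* 01010011  -> 83 */
  long d = pack_read_sketch(&b, 4);   /* 1100      -> 12 */
  printf("%ld %ld %ld eof=%d\n", a, c, d, b.eof);
  return 0;
}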

+ 32 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.h

@@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
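armbits.h wires the assembly routines in through simple macro aliases: when OC_ARM_ASM is defined, code written against the generic bit-packer entry points compiles directly against the _arm versions declared here. A hypothetical call site (the helper name is invented for illustration):

/* With OC_ARM_ASM defined, the generic names below expand, via the
 * #define aliases in armbits.h, to the _arm routines. */
#include "armbits.h"

static int read_flag_and_3_bits(oc_pack_buf *b) {
  int  flag = oc_pack_read1(b);    /* -> oc_pack_read1_arm(b)   */
  long val  = oc_pack_read(b, 3);  /* -> oc_pack_read_arm(b, 3) */
  return flag ? (int)val : -1;
}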

+ 230 - 0
modules/theoraplayer/native/theora/lib/arm/armbits.s

@@ -0,0 +1,230 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	EXPORT oc_pack_read_arm
+	EXPORT oc_pack_read1_arm
+	EXPORT oc_huff_token_decode_arm
+
+oc_pack_read1_arm PROC
+	; r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+	ENDP
+
+oc_pack_read_arm PROC
+	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      ; r2 = window
+	; Stall...             ; r3 = available
+	; Stall...
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+	MOV r1,#1
+oc_pack_read_refill
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     ; r10 = stop
+	                       ; r11 = ptr
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	RSB r3,r3,r0           ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMP r10,r11            ; ptr<stop => HI
+	CMPHI r3,#7            ;   available<=24 => HI
+	LDRHIB r14,[r11],#1    ;     r14 = *ptr++
+	SUBHI r3,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;     r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;     ptr<stop => HI
+	CMPHI r3,#7            ;       available<=24 => HI
+	LDRHIB r14,[r11],#1    ;         r14 = *ptr++
+	SUBHI r3,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;         r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;         ptr<stop => HI
+	CMPHI r3,#7            ;           available<=24 => HI
+	LDRHIB r14,[r11],#1    ;             r14 = *ptr++
+	SUBHI r3,#8            ;             available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;             r2 = window|=r14<<32-available
+	CMPHI r10,r11          ;             ptr<stop => HI
+	CMPHI r3,#7            ;               available<=24 => HI
+	LDRHIB r14,[r11],#1    ;                 r14 = *ptr++
+	SUBHI r3,#8            ;                 available += 8
+	; (HI) Stall...
+	ORRHI r2,r14,LSL r3    ;                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          ; r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+;  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+	CMP r11,r10            ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte
+;  to fill up the window.
+	LDRLOB r14,[r11]       ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           ; (HS) r14 = 1
+	ADDLO r10,r3,r1        ; (LO) r10 = available
+	STRHS r14,[r12,#8]     ; (HS) eof = 1
+	ANDLO r10,r10,#7       ; (LO) r10 = available&7
+	MOVHS r3,#1<<30        ; (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   ; (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	STR r11,[r12,#-4]      ; ptr = r11
+	STMIA r12,{r2,r3}      ; window = r2
+	                       ; available = r3
+	LDMFD r13!,{r10,r11,PC}
+	ENDP
+
+
+
+oc_huff_token_decode_arm PROC
+	; r0 = oc_pack_buf       *_b
+	; r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         ; r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       ; r2 = stop
+	; Stall...             ; r3 = ptr
+	; Stall...             ; r4 = window
+	                       ; r5 = available
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  ; r14 = _tree+bits
+	LDRSH r12,[r14,#2]     ; r12 = node=_tree[1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+	ADD r12,r1,r12,LSL #1  ; r12 = _tree+node
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r5,r10          ; r5 = available-=n
+	LDRSH r10,[r12],#2     ; r10 = n=_tree[node]
+	; Stall...             ; r12 = _tree+node+1
+	; Stall...
+	CMP r10,r5             ; n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	; Stall...
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+	ADD r12,r1,#2          ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              ; ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;   r14 = *ptr++
+	RSBHI r5,r5,#24        ; (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        ; (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    ;   r4 = window|=r14<<32-available
+	CMPHI r2,r3            ;   ptr<stop => HI
+	LDRHIB r14,[r3],#1     ;     r14 = *ptr++
+	SUBHI r5,#8            ;     available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;     r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+;  (allowing us to chain condition codes) because available will never be
+;  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+;  negative.
+	CMPHI r2,r3            ;     ptr<stop => HI
+	CMPHI r5,#7            ;       available<=24 => HI
+	LDRHIB r14,[r3],#1     ;         r14 = *ptr++
+	SUBHI r5,#8            ;         available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ;         r4 = window|=r14<<32-available
+	CMP r2,r3              ; ptr<stop => HI
+	MOVLS r5,#-1<<30       ; (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            ; (HI) available<=24 => HI
+	LDRHIB r14,[r3],#1     ; (HI)   r14 = *ptr++
+	SUBHI r5,#8            ; (HI)   available += 8
+	; (HI) Stall...
+	ORRHI r4,r14,LSL r5    ; (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        ; r14 = 32-n
+	MOV r14,r4,LSR r14     ; r14 = bits=window>>32-n
+	ADD r12,r12,r14        ;
+	LDRSH r12,[r12,r14]    ; r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          ; r5 = available
+	; Stall...
+	RSBS r14,r12,#0        ; r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     ; r10 = n=node>>8
+	MOV r4,r4,LSL r10      ; r4 = window<<=n
+	SUB r5,r10             ; r5 = available-=n
+	STMIB r0,{r3-r5}       ; ptr = r3
+	                       ; window = r4
+	                       ; available = r5
+	AND r0,r14,#255        ; r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+	ENDP
+
+	END

+ 116 - 0
modules/theoraplayer/native/theora/lib/arm/armcpu.c

@@ -0,0 +1,116 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,511,fin)!=NULL){
+      if(memcmp(buf,"Features",8)==0){
+        char *p;
+        p=strstr(buf," edsp");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+        p=strstr(buf," neon");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+      }
+      if(memcmp(buf,"CPU architecture:",17)==0){
+        int version;
+        version=atoi(buf+17);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in privileged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif

+ 29 - 0
modules/theoraplayer/native/theora/lib/arm/armcpu.h

@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
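armcpu.c probes the processor at run time: on MSVC it __emits candidate instructions and traps EXCEPTION_ILLEGAL_INSTRUCTION, on Linux it parses /proc/cpuinfo, and elsewhere it fails the build. The result is a bitmask of the OC_CPU_ARM_* flags defined above. A hedged usage sketch, assuming the libtheora headers are on the include path; the reporting function and strings are invented for illustration, only oc_cpu_flags_get() and the flag names come from the code.

#include <stdio.h>
#include "armcpu.h"

static void report_arm_features(void) {
  ogg_uint32_t flags = oc_cpu_flags_get();
  if (flags & OC_CPU_ARM_EDSP)  printf("EDSP instructions available\n");
  if (flags & OC_CPU_ARM_MEDIA) printf("ARMv6 media instructions available\n");
  if (flags & OC_CPU_ARM_NEON)  printf("NEON available\n");
}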

+ 57 - 0
modules/theoraplayer/native/theora/lib/arm/armenc.c

@@ -0,0 +1,57 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif

+ 51 - 0
modules/theoraplayer/native/theora/lib/arm/armenc.h

@@ -0,0 +1,51 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+unsigned oc_enc_frag_satd_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_neon(int *_dc,
+ const unsigned char *_src,int _ystride);
+
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 220 - 0
modules/theoraplayer/native/theora/lib/arm/armencfrag.s

@@ -0,0 +1,220 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_enc_frag_satd_neon
+	EXPORT	oc_enc_frag_satd2_neon
+	EXPORT	oc_enc_frag_intra_satd_neon
+
+oc_enc_frag_satd_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = const unsigned char *_ref
+	; r3 = int                  _ystride
+	; Load src and subtract ref, expanding to 16 bits.
+	VLD1.64		{D16},[r1@64],r3
+	VLD1.64		{D0}, [r2],r3
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r3
+	VLD1.64		{D1}, [r2],r3
+	VSUBL.U8	Q9, D18,D1
+	VLD1.64		{D20},[r1@64],r3
+	VLD1.64		{D2}, [r2],r3
+	VSUBL.U8	Q10,D20,D2
+	VLD1.64		{D22},[r1@64],r3
+	VLD1.64		{D3}, [r2],r3
+	VSUBL.U8	Q11,D22,D3
+	VLD1.64		{D24},[r1@64],r3
+	VLD1.64		{D4}, [r2],r3
+	VSUBL.U8	Q12,D24,D4
+	VLD1.64		{D26},[r1@64],r3
+	VLD1.64		{D5}, [r2],r3
+	VSUBL.U8	Q13,D26,D5
+	VLD1.64		{D28},[r1@64],r3
+	VLD1.64		{D6}, [r2],r3
+	VSUBL.U8	Q14,D28,D6
+	VLD1.64		{D30},[r1@64]
+	VLD1.64		{D7}, [r2]
+	VSUBL.U8	Q15,D30,D7
+oc_int_frag_satd_neon
+	; Hadamard Stage A
+	VADD.I16	Q0, Q8, Q12
+	VSUB.I16	Q12,Q8, Q12
+	VSUB.I16	Q1, Q9, Q13
+	VADD.I16	Q9, Q9, Q13
+	VSUB.I16	Q2, Q10,Q14
+	VADD.I16	Q10,Q10,Q14
+	VADD.I16	Q3, Q11,Q15
+	VSUB.I16	Q15,Q11,Q15
+	; Hadamard Stage B
+	VADD.I16	Q8, Q0, Q10
+	VSUB.I16	Q0, Q0, Q10
+	VSUB.I16	Q11,Q9, Q3
+	VADD.I16	Q3, Q9, Q3
+	VSUB.I16	Q14,Q12,Q2
+	VADD.I16	Q2, Q12,Q2
+	VADD.I16	Q13,Q1, Q15
+	VSUB.I16	Q1, Q1, Q15
+	; Hadamard Stage C & Start 8x8 Transpose
+	VSUB.I16	Q9, Q8, Q3
+	VADD.I16	Q8, Q8, Q3
+	VTRN.16		Q8, Q9
+	VADD.I16	Q10,Q0, Q11
+	VSUB.I16	Q11,Q0, Q11
+	VTRN.16		Q10,Q11
+	VADD.I16	Q12,Q2, Q13
+	VTRN.32		Q8, Q10
+	VSUB.I16	Q13,Q2, Q13
+	VTRN.32		Q9, Q11
+	VSUB.I16	Q15,Q14,Q1
+	VTRN.16		Q12,Q13
+	VADD.I16	Q14,Q14,Q1
+	VTRN.16		Q14,Q15
+	VTRN.32		Q12,Q14
+	VSWP		D17,D24
+	; Hadamard Stage A & Finish 8x8 Transpose
+	VADD.I16	Q0, Q8, Q12
+	VTRN.32		Q13,Q15
+	VSUB.I16	Q12,Q8, Q12
+	VSWP		D19,D26
+	VSUB.I16	Q1, Q9, Q13
+	VSWP		D21,D28
+	VADD.I16	Q9, Q9, Q13
+	VSWP		D23,D30
+	VSUB.I16	Q2, Q10,Q14
+	VADD.I16	Q10,Q10,Q14
+	VADD.I16	Q3, Q11,Q15
+	VSUB.I16	Q15,Q11,Q15
+	; Hadamard Stage B
+	VADD.I16	Q8, Q0, Q10
+	VSUB.I16	Q0, Q0, Q10
+	VSUB.I16	Q11,Q9, Q3
+	VADD.I16	Q3, Q9, Q3
+	VSUB.I16	Q14,Q12,Q2
+	VADD.I16	Q2, Q12,Q2
+	VADD.I16	Q13,Q1, Q15
+	VSUB.I16	Q1, Q1, Q15
+	; Hadamard Stage C & abs & accum
+	VNEG.S16	Q9, Q3
+	; Compute the (signed) DC component and save it off.
+	VADDL.S16	Q10,D16,D6
+	VABD.S16	Q12,Q8, Q9
+	VABD.S16	Q15,Q11,Q0
+	VST1.32		D20[0],[r0]
+	; Remove the (abs) DC component from the total.
+	MOV	r3,#0
+	VMOV.I16	D24[0],r3
+	VABA.S16	Q12,Q13,Q2
+	VABA.S16	Q15,Q14,Q1
+	VNEG.S16	Q0, Q0
+	VNEG.S16	Q2, Q2
+	VNEG.S16	Q1, Q1
+	VABA.S16	Q12,Q8, Q3
+	VABA.S16	Q15,Q11,Q0
+	VABA.S16	Q12,Q13,Q2
+	VABA.S16	Q15,Q14,Q1
+	; We're now using all 16 bits of each value.
+	VPADDL.U16	Q12,Q12
+	VPADAL.U16	Q12,Q15
+	VADD.U32	D24,D24,D25
+	VPADDL.U32	D24,D24
+	VMOV.U32	r0, D24[0]
+	MOV	PC, r14
+	ENDP
+
+oc_enc_frag_satd2_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = const unsigned char *_ref1
+	; r3 = const unsigned char *_ref2
+	; r12= int                  _ystride
+	LDR	r12,[r13]
+	; Load src and subtract (ref1+ref2>>1), expanding to 16 bits.
+	VLD1.64		{D0}, [r2],r12
+	VLD1.64		{D1}, [r3],r12
+	VLD1.64		{D16},[r1@64],r12
+	VHADD.U8	D0, D0, D1
+	VLD1.64		{D2}, [r2],r12
+	VLD1.64		{D3}, [r3],r12
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r12
+	VHADD.U8	D2, D2, D3
+	VLD1.64		{D4}, [r2],r12
+	VLD1.64		{D5}, [r3],r12
+	VSUBL.U8	Q9, D18,D2
+	VLD1.64		{D20},[r1@64],r12
+	VHADD.U8	D4, D4, D5
+	VLD1.64		{D6}, [r2],r12
+	VLD1.64		{D7}, [r3],r12
+	VSUBL.U8	Q10,D20,D4
+	VLD1.64		{D22},[r1@64],r12
+	VHADD.U8	D6, D6, D7
+	VLD1.64		{D0}, [r2],r12
+	VLD1.64		{D1}, [r3],r12
+	VSUBL.U8	Q11,D22,D6
+	VLD1.64		{D24},[r1@64],r12
+	VHADD.U8	D0, D0, D1
+	VLD1.64		{D2}, [r2],r12
+	VLD1.64		{D3}, [r3],r12
+	VSUBL.U8	Q12,D24,D0
+	VLD1.64		{D26},[r1@64],r12
+	VHADD.U8	D2, D2, D3
+	VLD1.64		{D4}, [r2],r12
+	VLD1.64		{D5}, [r3],r12
+	VSUBL.U8	Q13,D26,D2
+	VLD1.64		{D28},[r1@64],r12
+	VHADD.U8	D4, D4, D5
+	VLD1.64		{D6}, [r2]
+	VSUBL.U8	Q14,D28,D4
+	VLD1.64		{D7}, [r3]
+	VHADD.U8	D6, D6, D7
+	VLD1.64		{D30},[r1@64]
+	VSUBL.U8	Q15,D30,D6
+	B	oc_int_frag_satd_neon
+	ENDP
+
+oc_enc_frag_intra_satd_neon PROC
+	; r0 = int                 *_dc
+	; r1 = const unsigned char *_src
+	; r2 = int                  _ystride
+	; Load and subtract 128 from src, expanding to 16 bits.
+	VMOV.I8		D0,#128
+	VLD1.64		{D16},[r1@64],r2
+	VSUBL.U8	Q8, D16,D0
+	VLD1.64		{D18},[r1@64],r2
+	VSUBL.U8	Q9, D18,D0
+	VLD1.64		{D20},[r1@64],r2
+	VSUBL.U8	Q10,D20,D0
+	VLD1.64		{D22},[r1@64],r2
+	VSUBL.U8	Q11,D22,D0
+	VLD1.64		{D24},[r1@64],r2
+	VSUBL.U8	Q12,D24,D0
+	VLD1.64		{D26},[r1@64],r2
+	VSUBL.U8	Q13,D26,D0
+	VLD1.64		{D28},[r1@64],r2
+	VSUBL.U8	Q14,D28,D0
+	VLD1.64		{D30},[r1@64]
+	VSUBL.U8	Q15,D30,D0
+	B	oc_int_frag_satd_neon
+	ENDP
+ ]
+
+	END
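oc_enc_frag_satd_neon and its companions above compute a SATD score: subtract the prediction (or the constant 128 for intra), run an 8x8 Hadamard transform one dimension at a time, then sum the absolute coefficients, storing the signed DC term through the _dc pointer and excluding its absolute value from the returned total. Below is a plain-C sketch of the same measure, with made-up names and none of the assembly's transpose tricks; it mirrors the structure, not the exact scaling or return convention.

#include <stdlib.h>

static unsigned satd_8x8_sketch(int *dc, const unsigned char *src,
                                const unsigned char *ref, int ystride) {
  int buf[64];
  int i, j;
  /* Difference block. */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      buf[i * 8 + j] = src[i * ystride + j] - ref[i * ystride + j];
  /* 1-D Hadamard on rows, then on columns (butterflies only, no scaling). */
  for (int pass = 0; pass < 2; pass++) {
    for (i = 0; i < 8; i++) {
      int v[8];
      for (j = 0; j < 8; j++)
        v[j] = pass ? buf[j * 8 + i] : buf[i * 8 + j];
      for (int len = 1; len < 8; len <<= 1)
        for (j = 0; j < 8; j += len << 1)
          for (int k = 0; k < len; k++) {
            int a = v[j + k], b = v[j + k + len];
            v[j + k] = a + b;
            v[j + k + len] = a - b;
          }
      for (j = 0; j < 8; j++)
        if (pass) buf[j * 8 + i] = v[j]; else buf[i * 8 + j] = v[j];
    }
  }
  unsigned total = 0;
  for (i = 0; i < 64; i++) total += (unsigned)abs(buf[i]);
  *dc = buf[0];                          /* signed DC, reported separately  */
  return total - (unsigned)abs(buf[0]);  /* drop the (abs) DC from the sum  */
}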

+ 162 - 0
modules/theoraplayer/native/theora/lib/arm/armenquant.s

@@ -0,0 +1,162 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_enc_enquant_table_init_neon
+	EXPORT	oc_enc_enquant_table_fixup_neon
+	EXPORT	oc_enc_quantize_neon
+
+oc_enc_enquant_table_init_neon PROC
+	; r0 = void               *_enquant
+	; r1 = const ogg_uint16_t  _dequant[64]
+	STMFD r13!,{r0,r14}
+	; Initialize the table using the C routine
+	BLX	oc_enc_enquant_table_init_c
+	LDR	r0, [r13],#4
+	MOV	r1, #2
+	; Now partially de-interleave it, so that the first row is all
+	;  multipliers, the second row is all shift factors, etc.
+	; Also, negate the shifts for use by VSHL.
+oeeti_neon_lp
+	SUBS	r1, r1, #1
+	VLDMIA		r0, {D16-D31}
+	VUZP.16		Q8, Q9
+	VNEG.S16	Q9, Q9
+	VUZP.16		Q10,Q11
+	VNEG.S16	Q11,Q11
+	VUZP.16		Q12,Q13
+	VNEG.S16	Q13,Q13
+	VUZP.16		Q14,Q15
+	VNEG.S16	Q15,Q15
+	VSTMIA		r0!,{D16-D31}
+	BNE	oeeti_neon_lp
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_enquant_table_fixup_neon PROC
+	; r0 = void *_enquant[3][3][2]
+	; r1 = int   _nqis
+	STR	r14,[r13,#-4]!
+oeetf_neon_lp1
+	SUBS	r1, r1, #1
+	BEQ	oeetf_neon_end1
+	MOV	r14,#3
+oeetf_neon_lp2
+	LDR	r2, [r0]
+	SUBS	r14,r14,#1
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#16]
+	LDR	r2, [r0,#8]
+	STRH	r3, [r2]
+	STRH	r12,[r2,#16]
+	LDR	r2, [r0,#4]
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#16]
+	LDR	r2, [r0,#12]
+	ADD	r0, r0, #24
+	STRH	r3, [r2]
+	STRH	r12,[r2,#16]
+	BNE	oeetf_neon_lp2
+	SUB	r0, r0, #64
+	B	oeetf_neon_lp1
+oeetf_neon_end1
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_quantize_neon PROC
+	; r0 = ogg_int16_t        _qdct[64]
+	; r1 = const ogg_int16_t  _dct[64]
+	; r2 = const ogg_int16_t  _dequant[64]
+	; r3 = const void        *_enquant
+	STMFD	r13!,{r4,r5,r14}
+	; The loop counter goes in the high half of r14.
+	MOV	r14,#0xFFFCFFFF
+oeq_neon_lp
+	; Load the next two rows of the data and the quant matrices.
+	VLD1.64		{D16,D17,D18,D19},[r1@128]!
+	VLD1.64		{D20,D21,D22,D23},[r2@128]!
+	; Add in the signed rounding bias from the quantizers.
+	; Note that the VHADD relies on the fact that the quantizers are all
+	;  even (they're in fact multiples of four) in order to round correctly
+	;  on the entries being negated.
+	VSHR.S16	Q0, Q8, #15
+	VSHR.S16	Q1, Q9, #15
+	VLD1.64		{D24,D25,D26,D27},[r3@128]!
+	VHADD.S16	Q10,Q0, Q10
+	VHADD.S16	Q11,Q1, Q11
+	VLD1.64		{D28,D29,D30,D31},[r3@128]!
+	ADDS	r14,r14,#1<<16
+	VEOR.S16	Q10,Q0, Q10
+	VEOR.S16	Q11,Q1, Q11
+	VADD.S16	Q8, Q8, Q10
+	VADD.S16	Q9, Q9, Q11
+	; Perform the actual division and save the result.
+	VQDMULH.S16	Q12,Q8, Q12
+	VQDMULH.S16	Q14,Q9, Q14
+	VADD.S16	Q8, Q8, Q8
+	VADD.S16	Q9, Q9, Q9
+	VADD.S16	Q8, Q8, Q12
+	VADD.S16	Q9, Q9, Q14
+	VSHL.S16	Q8, Q13
+	VSHL.S16	Q9, Q15
+	VSUB.S16	Q8, Q8, Q0
+	VSUB.S16	Q9, Q9, Q1
+	VST1.64		{D16,D17,D18,D19},[r0@128]!
+	; Now pull out a bitfield marking the non-zero coefficients.
+	VQMOVN.S16	D16,Q8
+	VQMOVN.S16	D17,Q9
+	VCEQ.S8		Q8, #0
+	; Sadly, NEON has no PMOVMSKB; emulating it requires 6 instructions.
+	VNEG.S8		Q8, Q8          ; D16=.......3.......2.......1.......0
+	                                ;     .......7.......6.......5.......4
+	                                ; D17=.......B.......A.......9.......8
+	                                ;     .......F.......E.......D.......C
+	VZIP.8		D16,D17         ; D16=.......9.......1.......8.......0
+	                                ;     .......B.......3.......A.......2
+	                                ; D17=.......D.......5.......C.......4
+	                                ;     .......F.......7.......E.......6
+	VSLI.8		D16,D17,#4      ; D16=...D...9...5...1...C...8...4...0
+	                                ;     ...F...B...7...3...E...A...6...2
+	; Shift over the bitfields from previous iterations and
+	;  finish compacting the bitfield from the last iteration.
+	ORR	r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHTB	r14,r14,r12,ASR #16     ; r14=i|A
+	PKHBT	r12,r4, r12,LSL #16     ; r12=B|C
+	VMOV		r4, r5, D16
+	BLT	oeq_neon_lp
+	; Start with the low half while the NEON register transfers.
+	PKHBT	r0, r14,r12             ; r0 =B|A
+	MVNS	r0, r0
+	CLZNE	r0, r0
+	RSBNE	r0, r0, #31
+	; Stall 8-10 more cycles waiting for the last transfer.
+	ORR	r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHBT	r1, r12,r4, LSL #16     ; r1 = D|C
+	MVNS	r1, r1
+	CLZNE	r1, r1
+	RSBNE	r0, r1, #63
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+ ]
+
+	END
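oc_enc_quantize_neon above divides each coefficient by its quantizer without a divide instruction: the table built by oc_enc_enquant_table_init_neon holds, per coefficient, a multiplier and a negated shift (negated for VSHL), and VQDMULH performs the multiply-high after a sign-aware rounding bias is folded in. The sketch below only shows why a precomputed multiplier and shift can replace an exact unsigned divide; all names in it are invented, and it omits libtheora's signed handling and exact table layout.

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t mul; int shift; } recip_t;

/* Build a multiplier/shift pair for a quantizer q (1 <= q <= 65535):
 * pick the smallest s with 2^s >= q and m = ceil(2^(16+s)/q). */
static recip_t recip_make(uint32_t q) {
  recip_t r = { 0, 0 };
  while ((1u << r.shift) < q) r.shift++;
  r.mul = (uint32_t)(((1ull << (16 + r.shift)) + q - 1) / q);
  return r;
}

/* floor(x/q) for any 16-bit x, using only a multiply-high and a shift. */
static uint32_t recip_div(uint32_t x, recip_t r) {
  return (uint32_t)(((uint64_t)x * r.mul) >> (16 + r.shift));
}

int main(void) {
  static const uint32_t qs[] = { 2, 3, 24, 255, 65535 };
  for (int i = 0; i < 5; i++) {
    recip_t r = recip_make(qs[i]);
    for (uint32_t x = 0; x < 65536; x++)
      if (recip_div(x, r) != x / qs[i]) { printf("mismatch\n"); return 1; }
  }
  printf("reciprocal divide matches plain division for all tested cases\n");
  return 0;
}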

+ 656 - 0
modules/theoraplayer/native/theora/lib/arm/armfrag.asm

@@ -0,0 +1,656 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.s"
+
+@ Vanilla ARM v4 versions
+	.global	oc_frag_copy_list_arm
+	.global	oc_frag_recon_intra_arm
+	.global	oc_frag_recon_inter_arm
+	.global	oc_frag_recon_inter2_arm
+
+	.type	oc_frag_copy_list_arm, %function; oc_frag_copy_list_arm: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		@ r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp:
+	LDR	r11,[r14,r4,LSL #2]	@ r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	ADD	r4, r1, r11		@ r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		@ r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		@ r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end:
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm:
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	.size oc_frag_copy_list_arm, .-oc_frag_copy_list_arm	@ ENDP
+
+	.type	oc_frag_recon_inter_arm, %function; oc_frag_recon_inter_arm: @ PROC
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src
+	@ r2 =       int            ystride
+	@ r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm:
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	.size oc_frag_recon_inter_arm, .-oc_frag_recon_inter_arm	@ ENDP
+
+	.type	oc_frag_recon_inter2_arm, %function; oc_frag_recon_inter2_arm: @ PROC
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src1
+	@ r2 = const unsigned char *src2
+	@ r3 =       int            ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm:
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	.size oc_frag_recon_inter2_arm, .-oc_frag_recon_inter2_arm	@ ENDP
+
+  .if OC_ARM_ASM_EDSP
+	.global	oc_frag_copy_list_edsp
+
+	.type	oc_frag_copy_list_edsp, %function; oc_frag_copy_list_edsp: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		@ r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp:
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	@ r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	LDRD	r6, [r4, r5]!		@ r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	@ Stall
+	STRD	r6, [r5, r0]!		@ r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	@ Stall
+	LDRD	r6, [r4, r2]!	@ On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	@ loads causes a stall, but thats no worse
+	LDRD	r10,[r4, r2]!	@ than us only doing 2, and having to do
+				@ another pair of LDRD/STRD later on.
+	@ Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end:
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_frag_copy_list_edsp, .-oc_frag_copy_list_edsp	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_frag_recon_intra_v6
+	.global	oc_frag_recon_inter_v6
+	.global	oc_frag_recon_inter2_v6
+
+	.type	oc_frag_recon_intra_v6, %function; oc_frag_recon_intra_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp:
+	LDRD	r2, [r12],#8	@ r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	@ r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		@ r2 = __11__00
+	USAT16	r3, #8, r3		@ r3 = __33__22
+	USAT16	r4, #8, r4		@ r4 = __55__44
+	USAT16	r5, #8, r5		@ r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	@ r2 = __111100
+	ORR	r3, r3, r3, LSR #8	@ r3 = __333322
+	ORR	r4, r4, r4, LSR #8	@ r4 = __555544
+	ORR	r5, r5, r5, LSR #8	@ r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     @ r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     @ r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	.size oc_frag_recon_intra_v6, .-oc_frag_recon_intra_v6	@ ENDP
+
+	.type	oc_frag_recon_inter_v6, %function; oc_frag_recon_inter_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp:
+	LDRD	r6, [r3], #8		@ r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+  .if OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	@ Unaligned ; r4 = 33221100 r5 = 77665544
+  .else
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+  .endif
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r6,r4			@ r6 = __22__00
+	UXTB16	r4,r4, ROR #8		@ r4 = __33__11
+	QADD16	r12,r12,r6		@ r12= xx22xx00
+	QADD16	r4, r7, r4		@ r4 = xx33xx11
+	LDRD	r6, [r3], #8		@ r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		@ r4 = __33__11
+	USAT16	r12,#8,r12		@ r12= __22__00
+	ORR	r4, r12,r4, LSL #8	@ r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 77775555
+	UXTB16	r6,r5			@ r6 = __66__44
+	UXTB16	r5,r5, ROR #8		@ r5 = __77__55
+	QADD16	r12,r12,r6		@ r12= xx66xx44
+	QADD16	r5, r7, r5		@ r5 = xx77xx55
+	USAT16	r12,#8, r12		@ r12= __66__44
+	USAT16	r5, #8, r5		@ r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	@ r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	.size oc_frag_recon_inter_v6, .-oc_frag_recon_inter_v6	@ ENDP
+
+	.type	oc_frag_recon_inter2_v6, %function; oc_frag_recon_inter2_v6: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp:
+	LDRD	r6, [r12,#8]	@ r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	@ Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	@ Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	@ r9 = 77775555
+	UHADD8	r4, r4, r5	@ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			@ r5 = __66__44
+	UXTB16	r4, r4, ROR #8		@ r4 = __77__55
+	QADD16	r8, r8, r5		@ r8 = xx66xx44
+	QADD16	r9, r9, r4		@ r9 = xx77xx55
+	LDRD	r6,[r12],#16	@ r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		@ r8 = __66__44
+	LDR	r4, [r1], r3	@ Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		@ r9 = __77__55
+	LDR	r5, [r2], r3	@ Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	@ r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 22220000
+	UHADD8	r4, r4, r5	@ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r5, r4			@ r5 = __22__00
+	UXTB16	r4, r4, ROR #8		@ r4 = __33__11
+	QADD16	r8, r8, r5		@ r8 = xx22xx00
+	QADD16	r7, r7, r4		@ r7 = xx33xx11
+	USAT16	r8, #8, r8		@ r8 = __22__00
+	USAT16	r7, #8, r7		@ r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	@ r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	.size oc_frag_recon_inter2_v6, .-oc_frag_recon_inter2_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_frag_copy_list_neon
+	.global	oc_frag_recon_intra_neon
+	.global	oc_frag_recon_inter_neon
+	.global	oc_frag_recon_inter2_neon
+
+	.type	oc_frag_copy_list_neon, %function; oc_frag_copy_list_neon: @ PROC
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		@ r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	@ Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	@ Stall (on XScale)
+	MOV	r7, r6			@ Guarantee PLD points somewhere valid.
+ofcl_neon_lp:
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4,:64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4,:64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4,:64], r2
+	LDRGT	r6, [r3,#4]!		@ r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4,:64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4,:64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4,:64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4,:64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4,:64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5,:64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5,:64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5,:64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5,:64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end:
+	LDMFD	r13!,{r4-r7,PC}
+	.size oc_frag_copy_list_neon, .-oc_frag_copy_list_neon	@ ENDP
+
+	.type	oc_frag_recon_intra_neon, %function; oc_frag_recon_intra_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	@ D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	@ D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	@ D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0,:64], r1
+	VQMOVUN.S16	D19,Q11	@ D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0,:64], r1
+	VQMOVUN.S16	D20,Q12	@ D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0,:64], r1
+	VQMOVUN.S16	D21,Q13	@ D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0,:64], r1
+	VQMOVUN.S16	D22,Q14	@ D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0,:64], r1
+	VQMOVUN.S16	D23,Q15	@ D23= !!,:@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0,:64], r1
+	VST1.64	{D22},[r0,:64], r1
+	VST1.64	{D23},[r0,:64], r1
+	MOV	PC,R14
+	.size oc_frag_recon_intra_neon, .-oc_frag_recon_intra_neon	@ ENDP
+
+	.type	oc_frag_recon_inter_neon, %function; oc_frag_recon_inter_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	@ etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r2
+	VST1.64	{D22},[r0,:64], r2
+	VST1.64	{D23},[r0,:64], r2
+	MOV	PC,R14
+	.size oc_frag_recon_inter_neon, .-oc_frag_recon_inter_neon	@ ENDP
+
+	.type	oc_frag_recon_inter2_neon, %function; oc_frag_recon_inter2_neon: @ PROC
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	@ Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		@ etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r3
+	VST1.64	{D22},[r0,:64], r3
+	VST1.64	{D23},[r0,:64], r3
+	MOV	PC,R14
+	.size oc_frag_recon_inter2_neon, .-oc_frag_recon_inter2_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
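The reconstruction routines in this file all follow the pattern spelled out in their register comments: add the 16-bit residue block to a predictor (a flat 128 for intra, one reference for inter, the rounded-down average of two references for inter2) and clamp each result to 0..255 before storing. A hedged scalar sketch with hypothetical names; the assembly's types are ogg_int16_t, shown here as plain short, and this is illustrative rather than libtheora's C fallback.

static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void frag_recon_intra_sketch(unsigned char *dst, int ystride,
                                    const short residue[64]) {
  for (int i = 0; i < 8; i++, dst += ystride)
    for (int j = 0; j < 8; j++)
      dst[j] = clamp255(residue[i * 8 + j] + 128);
}

static void frag_recon_inter2_sketch(unsigned char *dst,
                                     const unsigned char *src1,
                                     const unsigned char *src2,
                                     int ystride, const short residue[64]) {
  for (int i = 0; i < 8; i++, dst += ystride, src1 += ystride, src2 += ystride)
    for (int j = 0; j < 8; j++)
      dst[j] = clamp255(residue[i * 8 + j] + ((src1[j] + src2[j]) >> 1));
}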

+ 655 - 0
modules/theoraplayer/native/theora/lib/arm/armfrag.s

@@ -0,0 +1,655 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+; Vanilla ARM v4 versions
+	EXPORT	oc_frag_copy_list_arm
+	EXPORT	oc_frag_recon_intra_arm
+	EXPORT	oc_frag_recon_inter_arm
+	EXPORT	oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		; r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp
+	LDR	r11,[r14,r4,LSL #2]	; r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	ADD	r4, r1, r11		; r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		; r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		; r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end
+	LDMFD	r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+
+oc_frag_recon_inter_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src
+	; r2 =       int            ystride
+	; r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	ENDP
+
+oc_frag_recon_inter2_arm PROC
+	; r0 =       unsigned char *dst
+	; r1 = const unsigned char *src1
+	; r2 = const unsigned char *src2
+	; r3 =       int            ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
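Taken together, the three vanilla routines above implement the usual Theora fragment reconstruction over an 8x8 block: intra adds 128 to each residual, inter adds the residual to one predictor, and inter2 adds it to the truncated average of two predictors, with every result clamped to a byte. A compact C sketch of the inter2 case (illustrative only; clamp255 is the helper sketched earlier):

    #include <stdint.h>

    static void frag_recon_inter2_sketch(unsigned char *dst,
                                         const unsigned char *src1,
                                         const unsigned char *src2,
                                         int ystride, const int16_t *residue) {
      int i, j;
      for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
          /* UHADD8 in the v6 path below computes this truncated byte average. */
          dst[j] = clamp255(((src1[j] + src2[j]) >> 1) + residue[8 * i + j]);
        dst += ystride; src1 += ystride; src2 += ystride;
      }
    }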
+ [ OC_ARM_ASM_EDSP
+	EXPORT	oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		; r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	; Stall (on XScale)
+	LDRD	r6, [r4, r5]!		; r4 = _src_frame+frag_buf_off
+	LDRD	r8, [r4, r2]!
+	; Stall
+	STRD	r6, [r5, r0]!		; r5 = _dst_frame+frag_buf_off
+	STRD	r8, [r5, r2]!
+	; Stall
+	LDRD	r6, [r4, r2]!	; On Xscale at least, doing 3 consecutive
+	LDRD	r8, [r4, r2]!	; loads causes a stall, but that's no worse
+	LDRD	r10,[r4, r2]!	; than us only doing 2, and having to do
+				; another pair of LDRD/STRD later on.
+	; Stall
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRD	r6, [r4, r2]!
+	LDRD	r8, [r4, r2]!
+	LDRD	r10,[r4, r2]!
+	STRD	r6, [r5, r2]!
+	STRD	r8, [r5, r2]!
+	STRD	r10,[r5, r2]!
+	LDRGE	r5, [r3],#4		; r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_frag_recon_intra_v6
+	EXPORT	oc_frag_recon_inter_v6
+	EXPORT	oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	LDR	r6, =0x00800080
+ofrintra_v6_lp
+	LDRD	r2, [r12],#8	; r2 = 11110000 r3 = 33332222
+	LDRD	r4, [r12],#8	; r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		; r2 = __11__00
+	USAT16	r3, #8, r3		; r3 = __33__22
+	USAT16	r4, #8, r4		; r4 = __55__44
+	USAT16	r5, #8, r5		; r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	; r2 = __111100
+	ORR	r3, r3, r3, LSR #8	; r3 = __333322
+	ORR	r4, r4, r4, LSR #8	; r4 = __555544
+	ORR	r5, r5, r5, LSR #8	; r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     ; r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     ; r3 = 77665544
+	STRD	r2, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	ENDP
+
+oc_frag_recon_inter_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp
+	LDRD	r6, [r3], #8		; r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, [r1], r2	; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+ ]
+	PKHBT	r12,r6, r7, LSL #16	; r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r6,r4			; r6 = __22__00
+	UXTB16	r4,r4, ROR #8		; r4 = __33__11
+	QADD16	r12,r12,r6		; r12= xx22xx00
+	QADD16	r4, r7, r4		; r4 = xx33xx11
+	LDRD	r6, [r3], #8		; r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		; r4 = __33__11
+	USAT16	r12,#8,r12		; r12= __22__00
+	ORR	r4, r12,r4, LSL #8	; r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	; r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 77775555
+	UXTB16	r6,r5			; r6 = __66__44
+	UXTB16	r5,r5, ROR #8		; r5 = __77__55
+	QADD16	r12,r12,r6		; r12= xx66xx44
+	QADD16	r5, r7, r5		; r5 = xx77xx55
+	USAT16	r12,#8, r12		; r12= __66__44
+	USAT16	r5, #8, r5		; r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	; r5 = 77665544
+	STRD	r4, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_inter2_v6 PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp
+	LDRD	r6, [r12,#8]	; r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	; Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	; Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	; r9 = 77775555
+	UHADD8	r4, r4, r5	; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			; r5 = __66__44
+	UXTB16	r4, r4, ROR #8		; r4 = __77__55
+	QADD16	r8, r8, r5		; r8 = xx66xx44
+	QADD16	r9, r9, r4		; r9 = xx77xx55
+	LDRD	r6,[r12],#16	; r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		; r8 = __66__44
+	LDR	r4, [r1], r3	; Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		; r9 = __77__55
+	LDR	r5, [r2], r3	; Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	; r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	; r8 = 22220000
+	UHADD8	r4, r4, r5	; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	; r7 = 33331111
+	UXTB16	r5, r4			; r5 = __22__00
+	UXTB16	r4, r4, ROR #8		; r4 = __33__11
+	QADD16	r8, r8, r5		; r8 = xx22xx00
+	QADD16	r7, r7, r4		; r7 = xx33xx11
+	USAT16	r8, #8, r8		; r8 = __22__00
+	USAT16	r7, #8, r7		; r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	; r8 = 33221100
+	STRD	r8, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_frag_copy_list_neon
+	EXPORT	oc_frag_recon_intra_neon
+	EXPORT	oc_frag_recon_inter_neon
+	EXPORT	oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon PROC
+	; r0 = _dst_frame
+	; r1 = _src_frame
+	; r2 = _ystride
+	; r3 = _fragis
+	; <> = _nfragis
+	; <> = _frag_buf_offs
+	LDR	r12,[r13]		; r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	; Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
+ofcl_neon_lp
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4@64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4@64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4@64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+	ENDP
+
+oc_frag_recon_intra_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 =       int            _ystride
+	; r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	; D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	; D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	; D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0@64], r1
+	VQMOVUN.S16	D19,Q11	; D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0@64], r1
+	VQMOVUN.S16	D20,Q12	; D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0@64], r1
+	VQMOVUN.S16	D21,Q13	; D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0@64], r1
+	VQMOVUN.S16	D22,Q14	; D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0@64], r1
+	VQMOVUN.S16	D23,Q15	; D23= !!@@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0@64], r1
+	VST1.64	{D22},[r0@64], r1
+	VST1.64	{D23},[r0@64], r1
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src
+	; r2 =       int            _ystride
+	; r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	; D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	; etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r2
+	VST1.64	{D22},[r0@64], r2
+	VST1.64	{D23},[r0@64], r2
+	MOV	PC,R14
+	ENDP
+
+oc_frag_recon_inter2_neon PROC
+	; r0 =       unsigned char *_dst
+	; r1 = const unsigned char *_src1
+	; r2 = const unsigned char *_src2
+	; r3 =       int            _ystride
+	LDR	r12,[r13]
+	; r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	; Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		; Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		; etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0@64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0@64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0@64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0@64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0@64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0@64], r3
+	VST1.64	{D22},[r0@64], r3
+	VST1.64	{D23},[r0@64], r3
+	MOV	PC,R14
+	ENDP
+ ]
+
+	END

+ 1854 - 0
modules/theoraplayer/native/theora/lib/arm/armidct.asm

@@ -0,0 +1,1854 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.S"
+
+	.global	oc_idct8x8_1_arm
+	.global	oc_idct8x8_arm
+
+	.type	oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	.size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm	@ ENDP
+
+	.type	oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	SUB	r2, r1, #8*16
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_arm, .-oc_idct8x8_arm	@ ENDP
+
+	.type	oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+oc_idct8x8_10_arm_cols:
+@ Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm	@ ENDP
+
+	.type	oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	.size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm	@ ENDP
+
+	.type	oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	.size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm	@ ENDP
+
+	.type	idct1core_arm, %function; idct1core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	.size idct1core_arm, .-idct1core_arm	@ ENDP
+
+	.type	idct2core_arm, %function; idct2core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		@ y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	@ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	@ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	@ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	@ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2core_arm, .-idct2core_arm	@ ENDP
+
+	.type	idct2core_down_arm, %function; idct2core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	@r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	@r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	@r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	@r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		@ y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2core_down_arm, .-idct2core_down_arm	@ ENDP
+
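The "TODO: This is wrong" notes in the *_down cores above and below all describe the same discrepancy: the reference C code stores each intermediate into a 16-bit temporary array (truncating it) and shifts on the next pass, whereas this code keeps the value in a 32-bit register and shifts it directly, so up to 4 extra high bits survive. A minimal C illustration of the difference (variable names are made up here; only the arithmetic is the point):

    #include <stdint.h>

    /* v stands for an intermediate such as t[0]+t[7]+8. */
    static void truncation_difference(int32_t v, int16_t *out_c, int16_t *out_asm) {
      /* C path: the value is stored to 16-bit RAM first, then shifted later. */
      *out_c = (int16_t)((int16_t)v >> 4);
      /* ARM path above: the full 32-bit value is shifted, then stored. */
      *out_asm = (int16_t)(v >> 4);
      /* The two only differ when v overflows the int16 range. */
    }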
+	.type	idct3core_arm, %function; idct3core_arm: @ PROC
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2] = t[0]-t[2]
+					@ r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7]
+	ADD	r5, r10,r5		@ r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		@ r12= t[2]+t2[5]
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	@ r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	@ r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	@ r12= t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	@ r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	.size idct3core_arm, .-idct3core_arm	@ ENDP
+
+	.type	idct3core_down_arm, %function; idct3core_down_arm: @ PROC
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2]+8 = t[0]-t[2]+8
+					@ r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		@ r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		@ r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	@ r3 = t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	@ r10= t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	@ r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	@ r9 = t2[3] - t[4]  + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	.size idct3core_down_arm, .-idct3core_down_arm	@ ENDP
+
+	.type	idct4core_arm, %function; idct4core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		@ r11= t[0]+t2[7]
+	ADD	r6, r4, r6		@ r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		@ r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		@ r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	@ r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	@ r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	@ r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	@ r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	.size idct4core_arm, .-idct4core_arm	@ ENDP
+
+	.type	idct4core_down_arm, %function; idct4core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		@ r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		@ r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		@ r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		@ r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	@ r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	@ r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	@ r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	@ r7 = t[3]-t2[4]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	.size idct4core_down_arm, .-idct4core_down_arm	@ ENDP
+
+	.type	idct8core_arm, %function; idct8core_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ Stage 2
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3]
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2]
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7]
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6]
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5]
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	.size idct8core_arm, .-idct8core_arm	@ ENDP
+
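The register comments in idct8core_arm spell out the classic four-stage 8-point Theora transform. Written out in C it is roughly the following (a sketch built from those comments, not the libtheora reference code; it assumes 32-bit int and an arithmetic right shift of negative values, and it ignores the transposed store order and the +8 / >>4 rounding that the *_down variant adds):

    #include <stdint.h>

    #define OC_C1S7 64277
    #define OC_C2S6 60547
    #define OC_C3S5 54491
    #define OC_C4S4 46341
    #define OC_C5S3 36410
    #define OC_C6S2 25080
    #define OC_C7S1 12785

    static void idct8_sketch(int16_t *y, const int16_t *x) {
      /* Stage 1: scaled rotations (MUL followed by ASR #16 above). */
      int32_t t0 = OC_C4S4 * (int16_t)(x[0] + x[4]) >> 16;
      int32_t t1 = OC_C4S4 * (int16_t)(x[0] - x[4]) >> 16;
      int32_t t2 = (OC_C6S2 * x[2] >> 16) - (OC_C2S6 * x[6] >> 16);
      int32_t t3 = (OC_C2S6 * x[2] >> 16) + (OC_C6S2 * x[6] >> 16);
      int32_t t4 = (OC_C7S1 * x[1] >> 16) - (OC_C1S7 * x[7] >> 16);
      int32_t t7 = (OC_C1S7 * x[1] >> 16) + (OC_C7S1 * x[7] >> 16);
      int32_t t5 = (OC_C3S5 * x[5] >> 16) - (OC_C5S3 * x[3] >> 16);
      int32_t t6 = (OC_C5S3 * x[5] >> 16) + (OC_C3S5 * x[3] >> 16);
      /* Stage 2: 4-5 and 7-6 butterflies.  The spec truncates the differences
       * to 16 bits before the multiply; the asm above skips that (its TODO). */
      int32_t u4 = t4 + t5;
      int32_t u5 = OC_C4S4 * (int16_t)(t4 - t5) >> 16;
      int32_t u7 = t7 + t6;
      int32_t u6 = OC_C4S4 * (int16_t)(t7 - t6) >> 16;
      /* Stage 3. */
      int32_t v0 = t0 + t3, v3 = t0 - t3;
      int32_t v1 = t1 + t2, v2 = t1 - t2;
      int32_t w6 = u6 + u5, w5 = u6 - u5;
      /* Stage 4: final butterflies. */
      y[0] = (int16_t)(v0 + u7);  y[7] = (int16_t)(v0 - u7);
      y[1] = (int16_t)(v1 + w6);  y[6] = (int16_t)(v1 - w6);
      y[2] = (int16_t)(v2 + w5);  y[5] = (int16_t)(v2 - w5);
      y[3] = (int16_t)(v3 + u4);  y[4] = (int16_t)(v3 - u4);
    }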
+	.type	idct8core_down_arm, %function; idct8core_down_arm: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+	@ Stage 2
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	ADD	r2, r2, #8<<16		@ r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		@ r6 = t[1]+8<<16
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3] + 8
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2] + 8
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4] + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	.size idct8core_down_arm, .-idct8core_down_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_idct8x8_1_v6
+	.global	oc_idct8x8_v6
+
+	.type	oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
+	.size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6	@ ENDP
+
+	.type	oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	@CMP	r2, #6
+	@BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-8*16]!
+	STRD	r4, [r1,#8]
+	STRD	r4, [r1,#16]
+	STRD	r4, [r1,#24]
+	STRD	r4, [r1,#32]
+	STRD	r4, [r1,#40]
+	STRD	r4, [r1,#48]
+	STRD	r4, [r1,#56]
+	STRD	r4, [r1,#64]
+	STRD	r4, [r1,#72]
+	STRD	r4, [r1,#80]
+	STRD	r4, [r1,#88]
+	STRD	r4, [r1,#96]
+	STRD	r4, [r1,#104]
+	STRD	r4, [r1,#112]
+	STRD	r4, [r1,#120]
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_v6, .-oc_idct8x8_v6	@ ENDP
+
+	.type	oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	@ Align the stack.
+	ADD	r0, r0, r2	@ Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-4*16]!
+	STRD	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	@ Align the stack.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	.size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6	@ ENDP
+
+	.type	oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2_1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r8		@ Write to the final destination.
+@ Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	.size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6	@ ENDP
+
+	.type	idct2_1core_v6, %function; idct2_1core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		@ r6 = x[1,0]
+	SMULWB	r12,r3, r2		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		@ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		@ r7 = <0|t[0,7]>
+@ Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	@ r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		@ r4 = <0|t[0,4]>
+	SADDSUBX	r5, r5, r5		@ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+@ Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	@ r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		@ r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		@ r3 = t[0]+t[7]
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		@ r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		@ r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		@ r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct2_1core_v6, .-idct2_1core_v6	@ ENDP
+  .endif
+
+	.balign 8
+OC_C7S1:
+	.word	12785 @ 31F1
+OC_C1S7:
+	.word	64277 @ FB15
+OC_C6S2:
+	.word	25080 @ 61F8
+OC_C2S6:
+	.word	60547 @ EC83
+OC_C5S3:
+	.word	36410 @ 8E3A
+OC_C3S5:
+	.word	54491 @ D4DB
+OC_C4S4:
+	.word	46341 @ B505
+
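The seven .word constants above are the Theora DCT factors in Q16 fixed point: OC_CkS(8-k) is round(2^16 * cos(k*pi/16)), so OC_C4S4 = 46341 is roughly 2^16/sqrt(2), and MUL followed by ASR #16 approximates a multiply by that cosine. A quick self-contained check (illustrative only, compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Prints 64277 60547 54491 46341 36410 25080 12785 for k = 1..7,
       * matching OC_C1S7, OC_C2S6, OC_C3S5, OC_C4S4, OC_C5S3, OC_C6S2, OC_C7S1. */
      const double pi = 3.14159265358979323846;
      int k;
      for (k = 1; k <= 7; k++)
        printf("%d\n", (int)floor(65536.0 * cos(k * pi / 16.0) + 0.5));
      return 0;
    }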
+  .if OC_ARM_ASM_MEDIA
+	.type	idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			@ r7  = 8
+	LDR	r6, [r1], #16		@ r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		@ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		@ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	@ r7 = <t[0,7]|t[0,7]>
+@ Stage 2:
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	@ r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		@ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		@ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	@ r2 = <t[1,5]|t[0,5]>
+@ Stage 3:
+	SSUB16	r5, r6, r2		@ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		@ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+@ Stage 4:
+	SADD16	r2, r12,r7		@ r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		@ r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		@ r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		@ r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	@ r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		@ y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	@ r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		@ y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	@ r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		@ y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	@ r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	.size idct2_2core_down_v6, .-idct2_2core_down_v6	@ ENDP
+
+@ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+@  pay for increased branch mis-prediction to get here, but in practice it
+@  doesn't seem to slow anything down to take it out, and it's less code this
+@  way.
+  .if 0
+	.type	oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	@ Align the stack.
+	ADD	r0, r0, r13	@ Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	@ Align the stack.
+	MOV	r0, r8		@ Write to the final destination.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	.size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6	@ ENDP
+
+	.type	idct1core_v6, %function; idct1core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	@ Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	.size idct1core_v6, .-idct1core_v6	@ ENDP
+
+	.type	idct3_2core_v6, %function; idct3_2core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r4, [r1], #16		@ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	@ r10= OC_C6S2; r11= OC_C2S6
+	@ Stall
+	SMULWB	r3, r11,r5		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		@ r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		@ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		@ r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		@ r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		@ r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		@ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_3core_stage3_v6
+	.size idct3_2core_v6, .-idct3_2core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_3_v6:
+	.word	12785 @ 31F1
+OC_C1S7_3_v6:
+	.word	64277 @ FB15
+OC_C6S2_3_v6:
+	.word	25080 @ 61F8
+OC_C2S6_3_v6:
+	.word	60547 @ EC83
+
+	.type	idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16		@ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+@ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	@ r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	@ r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_4core_down_stage3_v6
+	.size idct3_3core_down_v6, .-idct3_3core_down_v6	@ ENDP
+  .endif
+
+	.type	idct4_3core_v6, %function; idct4_3core_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		@ r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		@ r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		@ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_3core_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]=t[0]+t[3]
+	SSUB16	r3, r12,r3		@ r3 = t[3]=t[0]-t[3]
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]
+	STR	r12,[r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]
+	STR	r12,[r0, #12]		@ y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]
+	STR	r12,[r0, #28]		@ y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]
+	STR	r12,[r0, #44]		@ y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	.size idct4_3core_v6, .-idct4_3core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_4_v6:
+	.word	12785 @ 31F1
+OC_C1S7_4_v6:
+	.word	64277 @ FB15
+OC_C6S2_4_v6:
+	.word	25080 @ 61F8
+OC_C2S6_4_v6:
+	.word	60547 @ EC83
+OC_C5S3_4_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_4_v6:
+	.word	54491 @ D4DB
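+@ The .word constants above are the Theora iDCT cosines cos(k*pi/16), scaled by
+@  65536 and rounded to 16 bits (e.g. OC_C1S7 = 64277 ~= cos(pi/16)*65536,
+@  OC_C7S1 = 12785 ~= cos(7*pi/16)*65536); the hex in each comment is the same
+@  value.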
+
+	.type	idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+@ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	@ r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	@ r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_4core_down_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	.size idct4_4core_down_v6, .-idct4_4core_down_v6	@ ENDP
+
+	.type	idct8_8core_v6, %function; idct8_8core_v6: @ PROC
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		@ r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		@ r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		@ r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		@ r8 = t[1,1]=OC_C4S4*r4T>>16
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	.size idct8_8core_v6, .-idct8_8core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_8_v6:
+	.word	12785 @ 31F1
+OC_C1S7_8_v6:
+	.word	64277 @ FB15
+OC_C6S2_8_v6:
+	.word	25080 @ 61F8
+OC_C2S6_8_v6:
+	.word	60547 @ EC83
+OC_C5S3_8_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_8_v6:
+	.word	54491 @ D4DB
+
+	.type	idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		@ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		@ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		@ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		@ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		@ r3 = t[3]+8=t[0]-t[3]+8
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		@ y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		@ y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		@ y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	@ r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		@ y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	@ r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		@ y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	@ r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		@ y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	@ r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	.size idct8_8core_down_v6, .-idct8_8core_down_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_idct8x8_1_neon
+	.global	oc_idct8x8_neon
+
+	.balign 16
+OC_IDCT_CONSTS_NEON:
+	.short	    8
+	.short	64277 @ FB15 (C1S7)
+	.short	60547 @ EC83 (C2S6)
+	.short	54491 @ D4DB (C3S5)
+	.short	46341 @ B505 (C4S4)
+	.short	36410 @ 8E3A (C5S3)
+	.short	25080 @ 61F8 (C6S2)
+	.short	12785 @ 31F1 (C7S1)
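+@ The NEON code keeps the same cos(k*pi/16) constants packed into D0/D1 so they
+@  can be used as scalar operands (D0[1]..D1[3]); values >= 0x8000 read back as
+@  negative in VMULL.S16, which the stage code compensates for by adding the
+@  unscaled input back after the narrowing shift (see the "-x[n]" comments).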
+
+	.type	oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
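+	@ DC-only case: replicate _dc into all eight lanes of Q0/Q1 and write it
+	@  to all 64 output coefficients with four 32-byte stores.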
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]
+	MOV	PC, r14
+	.size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon	@ ENDP
+
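+@ Full iDCT entry point: if _last_zzi (the index of the last nonzero
+@  coefficient in zig-zag order) is at most 10, the reduced oc_idct8x8_10_neon
+@  path below is taken; otherwise the full row pass runs on the pre-transposed
+@  input, followed by an in-register 8x8 transpose, the column pass, and the
+@  final +8>>4 rounding via VRSHR.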
+	.type	oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon:
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	@ Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2,:128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2,:128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2,:128]!
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2,:128]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3,:128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	@ 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	@ Column transforms
+	BL	oc_idct8x8_stage123_neon
+	@ We have to put the return address back in the LR, or the branch
+	@  predictor will not recognize the function return and mis-predict the
+	@  entire call stack.
+	MOV	r14, r12
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	.size oc_idct8x8_neon, .-oc_idct8x8_neon	@ ENDP
+
+	.type	oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
+@ Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	@ Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	@ Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	@ Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	@ Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	@ Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	@ Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	@ Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	@ Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	@ Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	@ Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	@ Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	@ Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	@ Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	@ Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	@ Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	@ Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	@ Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	@ Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	@ Q4 = t[4]=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	@ Q7 = t[7]=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	@ Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	@ Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	@ Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	@ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	@ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
+@ Stage 3
+	VSUB.S16	Q11,Q8, Q3	@ Q11 = t[3]=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	@ Q8  = t[0]=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	@ Q9  = t[1]=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	@ Q3  = t[6]=t[6]+t[5]
+	VSUB.S16	Q10,Q1, Q2	@ Q10 = t[2]=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	@ Q5  = t[5]=t[6]-t[5]
+	MOV	PC, r14
+	.size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon	@ ENDP
+
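+@ Reduced path for _last_zzi<=10: only the low-frequency corner of the
+@  (pre-transposed) input is loaded, the row pass is followed by an 8x4
+@  transpose, and the input block is zeroed on the way out through the D2
+@  stores.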
+	.type	oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3,:128]
+	MOV	r2, r1
+	@ Row transforms (input is pre-transposed)
+@ Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2,:128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	@ Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2,:64], r12
+	VMULL.S16	Q2, D18,D0[1]	@ Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2,:64]
+	VMULL.S16	Q14,D17,D0[2]	@ Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	@ Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	@ Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	@ Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	@ Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	@ D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	@ D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	@ D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	@ D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	@ D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	@ D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	@ D2 = t[2]
+	VADD.S16	D4, D4, D18	@ D4 = t[7]
+	VADD.S16	D6, D6, D19	@ D6 = t[6]
+	VADD.S16	D7, D7, D19	@ D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	@ D30= t[0]
+					@ D31= t[3]
+@ Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	@ D24= t[7]-t[6]
+					@ D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	@ D26= t[7]=t[7]+t[6]
+					@ D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	@ Q11= OC_C4S4*(t[7]-t[6])
+					@       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	@ Q14= OC_C4S4*(t[4]-t[5])
+					@       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	@ D16= t[0]=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	@ D17= t[2]=t[0]-t[2]
+	VADD.S16	D18,D30,D2	@ D18= t[1]=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	@ D22= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	@ D23= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	@ D19= t[3]=t[0]-t[3]
+	VADD.S16	D22,D22,D24	@ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	@ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	@ D27= t[5]=t[6]-t[5]
+	VADD.S16	D24,D22,D23	@ D24= t[6]=t[6]+t[5]
+@ Stage 4
+	VSUB.S16	Q11,Q8, Q13	@ D22= y[7]=t[0]-t[7]
+					@ D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	@ D20= y[6]=t[1]-t[6]
+					@ D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	@ D16= y[0]=t[0]+t[7]
+					@ D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	@ D18= y[1]=t[1]+t[6]
+					@ D19= y[3]=t[3]'+t[4]''
+	@ 8x4 transpose
+	VTRN.16		Q10,Q11		@ Q10= c5c4a5a4 c7c6a7a6
+					@ Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		@ Q8 = c3c2a3a2 c1c0a1a0
+					@ Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		@ Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		@ Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		@ Q9 = b7b6b5b4 b3b2b1b0
+					@ Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		@ Q8 = a7a6a5a4 a3a2a1a0
+					@ Q10= c7c6c5c4 c3c2c1c0
+	@ Column transforms
+@ Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	@ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	@ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	@  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	@ Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	@ Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	@ Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	@ Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	@ Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	@ Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	@  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	@  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	@ Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	@ Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	@  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	@ Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	@ Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	@ Q15= t[7]=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	@ Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	@ Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	@ Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	@ Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q10= OC_C4S4*(t[4]-t[5])
+					@           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	@ Q12= t[4]=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	@ Q13:Q14= OC_C4S4*(t[7]-t[6])
+					@           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	@ Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	@ Q11= t[0]=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	@ Q3 = t[3]=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	@ Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	@ Q10=t[5]
+	VADD.S16	Q14,Q14,Q9	@ Q14=t[6]
+	VSUB.S16	Q13,Q14,Q10	@ Q13=t[5]=t[6]-t[5]
+	VADD.S16	Q14,Q14,Q10	@ Q14=t[6]=t[6]+t[5]
+	VADD.S16	Q10,Q1, Q2	@ Q10= t[1]=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	@ Q2 = t[2]=t[0]-t[2]
+@ Stage 4
+	VADD.S16	Q8, Q11,Q15	@ Q8  = y[0]=t[0]+t[7]
+	VADD.S16	Q9, Q10,Q14	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q15,Q11,Q15	@ Q15 = y[7]=t[0]-t[7]
+	VSUB.S16	Q14,Q10,Q14	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q10,Q2, Q13	@ Q10 = y[2]=t[2]+t[5]
+	VADD.S16	Q11,Q3, Q12	@ Q11 = y[3]=t[3]+t[4]
+	VSUB.S16	Q12,Q3, Q12	@ Q12 = y[4]=t[3]-t[4]
+	VSUB.S16	Q13,Q2, Q13	@ Q13 = y[5]=t[2]-t[5]
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1,:64]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	.size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits

+ 1853 - 0
modules/theoraplayer/native/theora/lib/arm/armidct.s

@@ -0,0 +1,1853 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_idct8x8_1_arm
+	EXPORT	oc_idct8x8_arm
+
+oc_idct8x8_1_arm PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	ENDP
+
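+; Dispatch on _last_zzi: blocks with at most 3, 6 or 10 nonzero coefficients
+;  use progressively larger sparse row kernels (idct2/idct3/idct4 cores);
+;  everything else takes the full 8x8 path below.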
+oc_idct8x8_arm PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	; Write to the final destination.
+	SUB	r2, r1, #8*16
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		; And read from temp storage.
+; Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_arm PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_6_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+oc_idct8x8_3_arm PROC
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r2		; Write to the final destination
+; Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
+
+idct1core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
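+	; r12 = 0xB505 = 46341 = OC_C4S4, built inline rather than loaded from
+	;  the literal pool.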
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	ENDP
+
+idct2core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]
+	ADD	r11,r11,r9		; r11= t[0]+t[7]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		; y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	; r1 = t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct2core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		; r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r10,r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MOV	r3, r3, ASR #16		; r3 = t[4]
+	MUL	r10,r12,r3		; r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		; r10= t[5]
+	ADD	r12,r9,r12,ASR #16	; r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		; r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		; r11= t[0]+t[7]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		; y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		; y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		; y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		; y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct3core_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2] = t[0]-t[2]
+					; r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7]
+	ADD	r5, r10,r5		; r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		; r12= t[2]+t2[5]
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	; r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	; r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	; r12= t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	; r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct3core_down_arm PROC
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r12,OC_C4S4		; r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		; r3 = x[2]
+	LDR	r10,OC_C6S2		; r10= OC_C6S2
+	MUL	r9, r12,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		; r4 = OC_C2S6
+	MUL	r10,r3, r10		; r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		; r11= x[1]
+	MUL	r3, r4, r3		; r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		; r4 = OC_C7S1
+	LDR	r5, OC_C1S7		; r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r4, r11,r4		; r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r11,r5, r11		; r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	; r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		; r4 = t[4]
+	MUL	r5, r12,r4		; r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		; r11= t[7]
+	MUL	r12,r11,r12		; r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	; r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	; r6 = t[2]+8 = t[0]-t[2]+8
+					; r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	; r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		; r12= t[6]
+	ADD	r5, r12,r5, ASR #16	; r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	; r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		; r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		; r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		; r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		; r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	; r3 = t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	; r10= t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	; r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	; r9 = t2[3] - t[4]  + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	ENDP
+
+idct4core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		; r11= t[0]+t2[7]
+	ADD	r6, r4, r6		; r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		; r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		; r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		; y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	; r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	; r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	; r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	; r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct4core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		; r9 = x[0]
+	LDR	r10,OC_C4S4		; r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		; r12= x[2]
+	LDR	r4, OC_C6S2		; r4 = OC_C6S2
+	MUL	r9, r10,r9		; r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		; r5 = OC_C2S6
+	MUL	r4, r12,r4		; r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		; r3 = x[1]
+	MUL	r5, r12,r5		; r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		; r6 = OC_C7S1
+	LDR	r12,OC_C1S7		; r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		; r11= x[3]
+	MUL	r6, r3, r6		; r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		; r7 = OC_C5S3
+	MUL	r3, r12,r3		; r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		; r8 = OC_C3S5
+	MUL	r7, r11,r7		; r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		; r9 = t[0]
+	MUL	r11,r8, r11		; r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	; r6 = t[4]-t[5]
+	MUL	r6, r10,r6		; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		; r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	; r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	; r3 = t[7]-t[6]
+	ADD	r9, r9, #8		; r9 = t[0]+8
+	MUL	r3, r10,r3		; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	; r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	; r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	; r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	; r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		; r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	; r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	; r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		; r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		; r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		; r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		; r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	; r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	; r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	; r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	; r7 = t[3]-t2[4]+8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		; y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		; y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		; y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		; y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		; y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		; y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
+idct8core_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; Stage 2
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3]
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2]
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7]
+	ADD	r6, r6, r10		; r6 = t[1] + t[6]
+	ADD	r3, r3, r14		; r3 = t[2] + t[5]
+	ADD	r4, r4, r9		; r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+idct8core_down_arm PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		; r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		; r6 = x[4]
+	LDR	r12,OC_C4S4		; r12= C4S4
+	LDRSH	r4, [r1, #-12]		; r4 = x[2]
+	ADD	r2, r2, r6		; r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	; r6 = x[0] - x[4]
+	; For spec compliance, these sums must be truncated to 16-bit precision
+	; _before_ the multiply (not after).
+	; Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		; r8 = x[6]
+	LDR	r7, OC_C6S2		; r7 = OC_C6S2
+	MUL	r6, r12,r6		; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		; r14= OC_C2S6
+	MUL	r3, r4, r7		; r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		; r5 = OC_C7S1
+	MUL	r4, r14,r4		; r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		; r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		; r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		; r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		; r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		; r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		; r14= x[1]
+	ADD	r4, r4, r8, ASR #16	; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		; r8 = x[7]
+	MUL	r9, r5, r14		; r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		; r10= x[5]
+	MUL	r14,r7, r14		; r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		; r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		; r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		; r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		; r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		; r1 = x[3]
+	LDR	r5, OC_C3S5		; r5 = OC_C3S5
+	LDR	r11,OC_C5S3		; r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		; r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		; r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		; r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		; r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		; r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		; r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	; r10=t[6] r12=C4S4 r14=t[5]
+	; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+	; 4-5 butterfly
+	ADD	r9, r9, r14		; r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	; r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	; 7-6 butterfly
+	ADD	r8, r8, r10		; r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	; r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	; Stage 3
+	ADD	r2, r2, #8<<16		; r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		; r6 = t[1]+8<<16
+	; 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	; r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	; r4 = t2[3] = t[0] - t[3] + 8
+	; 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	; r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	; r3 = t2[2] = t[1] - t[2] + 8
+	; 6-5 butterfly
+	MOV	r14,r14,ASR #16		; r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	; r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	; r14= t3[5] = t[6] - t[5]
+	; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	; r10=t3[6] r14=t3[5]
+	; Stage 4
+	ADD	r2, r2, r8		; r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		; r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		; r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		; r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	; r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	; r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	; r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	; r9 = t[3] - t[4] + 8
+	; TODO: This is wrong.
+	; The C code truncates to 16 bits by storing to RAM and doing the
+	;  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		; y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		; y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		; y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		; y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		; y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		; y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_idct8x8_1_v6
+	EXPORT	oc_idct8x8_v6
+
+oc_idct8x8_1_v6 PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
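+	; r2 = r3 = <_dc|_dc>, so each STRD below writes four output coefficients
+	;  (two registers, eight bytes); sixteen stores cover the whole block.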
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_v6 PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	;CMP	r2, #6
+	;BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	; Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-8*16]!
+	STRD	r4, [r1,#8]
+	STRD	r4, [r1,#16]
+	STRD	r4, [r1,#24]
+	STRD	r4, [r1,#32]
+	STRD	r4, [r1,#40]
+	STRD	r4, [r1,#48]
+	STRD	r4, [r1,#56]
+	STRD	r4, [r1,#64]
+	STRD	r4, [r1,#72]
+	STRD	r4, [r1,#80]
+	STRD	r4, [r1,#88]
+	STRD	r4, [r1,#96]
+	STRD	r4, [r1,#104]
+	STRD	r4, [r1,#112]
+	STRD	r4, [r1,#120]
+	MOV	r1, r13		; And read from temp storage.
+; Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_10_v6 PROC
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	; Align the stack.
+	ADD	r0, r0, r2	; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	; Write to the final destination.
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-4*16]!
+	STRD	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	; Align the stack.
+	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_idct8x8_3_v6 PROC
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		; Read from temp storage.
+	MOV	r0, r8		; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	ENDP
+
+idct2_1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]>
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+ ]
+
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
+
+ [ OC_ARM_ASM_MEDIA
+idct2_2core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			; r7  = 8
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+;  pay for increased branch mis-prediction to get here, but in practice it
+;  doesn't seem to slow anything down to take it out, and it's less code this
+;  way.
+ [ 0
+oc_idct8x8_6_v6 PROC
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	; Align the stack.
+	ADD	r0, r0, r13	; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	; Align the stack.
+	MOV	r0, r8		; Write to the final destination.
+	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	ENDP
+
+idct1core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	ENDP
+
+idct3_2core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+idct3_3core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+	ENDP
+ ]
+
+idct4_3core_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	ENDP
+
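For readers following the t[] bookkeeping in the comments, the four stages above are the usual Theora 1-D inverse transform. A scalar C sketch of the same butterfly (one 8-sample vector; the real code interleaves two rows per pass, stores the row pass transposed, and adds the +8>>4 rounding in the column pass, all omitted here):

    /* Scalar sketch of the 4-stage 8-point transform the stage comments
       describe.  M() is the SMULW-style 0.16 multiply; constants as in the
       OC_CxSy tables above.  Illustrative only. */
    #include <stdint.h>
    #define M(c,x) ((int)(((int64_t)(c)*(x))>>16))

    static void idct8_sketch(int16_t y[8], const int16_t x[8]){
      int t0,t1,t2,t3,t4,t5,t6,t7,r;
      /* Stage 1 */
      t0=M(46341,x[0]+x[4]);            t1=M(46341,x[0]-x[4]);
      t2=M(25080,x[2])-M(60547,x[6]);   t3=M(60547,x[2])+M(25080,x[6]);
      t4=M(12785,x[1])-M(64277,x[7]);   t7=M(64277,x[1])+M(12785,x[7]);
      t5=M(54491,x[5])-M(36410,x[3]);   t6=M(36410,x[5])+M(54491,x[3]);
      /* Stage 2 */
      r=t4+t5; t5=M(46341,t4-t5); t4=r;
      r=t7+t6; t6=M(46341,t7-t6); t7=r;
      /* Stage 3 */
      r=t0+t3; t3=t0-t3; t0=r;
      r=t1+t2; t2=t1-t2; t1=r;
      r=t6+t5; t5=t6-t5; t6=r;
      /* Stage 4 */
      y[0]=(int16_t)(t0+t7); y[1]=(int16_t)(t1+t6);
      y[2]=(int16_t)(t2+t5); y[3]=(int16_t)(t3+t4);
      y[4]=(int16_t)(t3-t4); y[5]=(int16_t)(t2-t5);
      y[6]=(int16_t)(t1-t6); y[7]=(int16_t)(t0-t7);
    }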
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+idct4_4core_down_v6 PROC
+	; r0 =       ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16	; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16	; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	ENDP
+
+idct8_8core_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	ENDP
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+idct8_8core_down_v6 PROC
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r12|r9>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_1_neon
+	EXPORT	oc_idct8x8_neon
+
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 8E3A (C5S3)
+	DCW	25080 ; 61F8 (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
+
+oc_idct8x8_1_neon PROC
+	; r0 = ogg_int16_t  *_y
+	; r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]
+	MOV	PC, r14
+	ENDP
+
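oc_idct8x8_1_neon covers the DC-only case: when only x[0] is non-zero the whole 8x8 output is a constant, so the routine just duplicates _dc into a Q register and stores it across all 64 coefficients. A trivial scalar equivalent, for reference:

    /* Scalar equivalent of the DC-only path: with only x[0] non-zero the
       output block is constant, so just replicate the caller-supplied dc. */
    #include <stdint.h>
    static void idct8x8_dc_sketch(int16_t y[64], uint16_t dc){
      for(int i=0;i<64;i++) y[i]=(int16_t)dc;
    }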
+oc_idct8x8_neon PROC
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2@128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2@128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2@128]!
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2@128]
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3@128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	; We have to put the return address back in the LR, or the branch
+	;  predictor will not recognize the function return and mis-predict the
+	;  entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1@128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_stage123_neon PROC
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+	ENDP
+
+oc_idct8x8_10_neon PROC
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3@128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2@128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2@64], r12
+	VMULL.S16	Q2, D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2@64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	; D2 = t[2]
+	VADD.S16	D4, D4, D18	; D4 = t[7]
+	VADD.S16	D6, D6, D19	; D6 = t[6]
+	VADD.S16	D7, D7, D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8, Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	; D20= y[6]=t[1]'-t[6]'
+					; D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	; D18= y[1]=t[1]'+t[6]'
+					; D19= y[3]=t[3]'+t[4]''
+	; 8x4 transpose
+	VTRN.16		Q10,Q11		; Q10= c5c4a5a4 c7c6a7a6
+					; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		; Q8 = c3c2a3a2 c1c0a1a0
+					; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		; Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		; Q9 = b7b6b5b4 b3b2b1b0
+					; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		; Q8 = a7a6a5a4 a3a2a1a0
+					; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	;  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	;  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	;  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	; Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	;  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	; Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
+					;           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2, Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1@64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1@64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	ENDP
+ ]
+
+	END

+ 126 - 0
modules/theoraplayer/native/theora/lib/arm/armint.h

@@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#  if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 677 - 0
modules/theoraplayer/native/theora/lib/arm/armloop.asm

@@ -0,0 +1,677 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.include "armopts-gnu.S"
+
+	.global	oc_loop_filter_frag_rows_arm
+
+@ Which bit this is depends on the order of packing within a bitfield.
+@ Hopefully that doesn't change among any of the relevant compilers.
+ .set OC_FRAG_CODED_FLAG,	1
+
+	@ Vanilla ARM v4 version
+	.type	loop_filter_h_arm, %function; loop_filter_h_arm: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp:
+	LDRB	r3, [r0, #-2]		@ r3 = _pix[0]
+	LDRB	r12,[r0, #1]		@ r12= _pix[3]
+	LDRB	r4, [r0, #-1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	.size loop_filter_h_arm, .-loop_filter_h_arm	@ ENDP
+
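For reference, the per-edge work done by loop_filter_h_arm written out in plain C (a sketch: bv is the caller's 256-entry lflim() table, which oc_loop_filter_frag_rows_arm biases by +127 so signed filter values can index it, and pix points just right of the edge, so the asm's _pix[0..3] are pix[-2..1] here):

    /* Scalar sketch of one call to loop_filter_h_arm (8 rows across a
       vertical edge).  bv must be indexable with negative R, i.e. it is the
       biased table pointer. */
    #include <stdint.h>
    static uint8_t clamp255(int v){ return (uint8_t)(v<0?0:v>255?255:v); }

    static void filter_h_sketch(uint8_t *pix,int ystride,const int8_t *bv){
      for(int y=0;y<8;y++){
        int r=(pix[-2]-pix[1]+3*(pix[0]-pix[-1])+4)>>3;  /* R, in [-127,128] */
        int f=bv[r];                                     /* f = lflim(R,L)   */
        pix[-1]=clamp255(pix[-1]+f);
        pix[ 0]=clamp255(pix[ 0]-f);
        pix+=ystride;
      }
    }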
+	.type	loop_filter_v_arm, %function; loop_filter_v_arm: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp:
+	LDRB	r3, [r0, -r1, LSL #1]	@ r3 = _pix[0]
+	LDRB	r12,[r0, r1]		@ r12= _pix[3]
+	LDRB	r4, [r0, -r1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	.size loop_filter_v_arm, .-loop_filter_v_arm	@ ENDP
+
+	.type	oc_loop_filter_frag_rows_arm, %function; oc_loop_filter_frag_rows_arm: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	@ _bv += 127
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_arm_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_arm, .-oc_loop_filter_frag_rows_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	oc_loop_filter_init_v6
+	.global	oc_loop_filter_frag_rows_v6
+
+	.type	oc_loop_filter_init_v6, %function; oc_loop_filter_init_v6: @ PROC
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		@ r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		@ r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	@ r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	@ r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	.size oc_loop_filter_init_v6, .-oc_loop_filter_init_v6	@ ENDP
+
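oc_loop_filter_init_v6 stores 255-2*L replicated into four bytes; the filter cores then recover lflim() purely from saturating byte adds and subtracts, as the comments further down note. A scalar sketch of that identity (assuming 0 <= L <= 127 and |R| <= 255; illustrative only):

    /* Sketch of the UQADD8/UQSUB8 trick used by the v6 cores: with
       ll = 255-2*L (the per-byte value stored above), saturating byte
       arithmetic gives lflim(|R|,L) = min(|R|, max(2*L-|R|, 0)). */
    #include <stdint.h>
    static uint8_t uqadd8(unsigned a,unsigned b){unsigned s=a+b; return (uint8_t)(s>255?255:s);}
    static uint8_t uqsub8(unsigned a,unsigned b){return (uint8_t)(a>b?a-b:0);}

    static int lflim_abs(int absR,int L){
      uint8_t ll=(uint8_t)(255-2*L);              /* what the init routine stores */
      uint8_t t =uqadd8((unsigned)absR,ll);       /* 255 - max(2*L-|R|, 0)        */
      return uqsub8(uqadd8((unsigned)absR,t),t);  /* min(|R|, max(2*L-|R|, 0))    */
    }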
+@ We could use the same strategy as the v filter below, but that would require
+@  40 instructions to load the data and transpose it into columns and another
+@  32 to write out the results at the end, plus the 52 instructions to do the
+@  filtering itself.
+@ This is slightly less, and less code, even assuming we could have shared the
+@  52 instructions in the middle with the other function.
+@ It executes slightly fewer instructions than the ARMv6 approach David Conrad
+@  proposed for FFmpeg, but not by much:
+@  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+@ His is a lot less code, though, because it only does two rows at once instead
+@  of four.
+	.type	loop_filter_h_v6, %function; loop_filter_h_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	.size loop_filter_h_v6, .-loop_filter_h_v6	@ ENDP
+
+	.type	loop_filter_h_core_v6, %function; loop_filter_h_core_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ r12= 0x10003
+	@ Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		@ r4 = <p3|p2|p1|p0>
+	@ Single issue
+	LDR	r5,[r0, r1]!		@ r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		@ r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		@ r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	@ r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	@ r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		@ r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		@ r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		@ r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		@ r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	@ r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		@ r6 = <r0|r2>
+	UXTB16	r11,r11			@ r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		@ r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	@ r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		@ r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		@ r7 = <r3-r0|r1-r2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	@ r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	@ r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	@ r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	@ r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			@ r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	@ r6 = <-R_s|-R_q|-R_r|-R_p>
+	@ Single issue
+	@ There's no min, max or abs instruction.
+	@ SSUB8 and SEL will work for abs, and we can do all the rest with
+	@  unsigned saturated adds, which means the GE flags are still all
+	@  set when we're done computing lflim(abs(R_i),L).
+	@ This allows us to both add and subtract, and split the results by
+	@  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	@ Single issue
+	SEL	r7, r7, r6		@ r7 = abs(R_i)
+	@ Single issue
+	UQADD8	r4, r7, r2		@ r4 = 255-max(2*L-abs(R_i),0)
+	@ Single issue
+	UQADD8	r7, r7, r4
+	@ Single issue
+	UQSUB8	r7, r7, r4		@ r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	@ Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		@ r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		@ r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		@ r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		@ r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		@ r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		@ r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		@ r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		@ r4 = q1
+	STRB	r4, [r0,#-1]
+	@ Single issue
+	STRB	r9, [r0,-r1]!
+	@ Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	.size loop_filter_h_core_v6, .-loop_filter_h_core_v6	@ ENDP
+
+@ This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+@  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+@ This works just as well, with the following procedure for computing the
+@  filter value, f:
+@   u = ~UHADD8(p1,~p2);
+@   v = UHADD8(~p1,p2);
+@   m = v-u;
+@   a = m^UHADD8(m^p0,m^~p3);
+@   f = UHADD8(UHADD8(a,u1),v1);
+@  where f = 127+R, with R in [-127,128] defined as in the spec.
+@ This is exactly the same amount of arithmetic as the version that uses PAVGB
+@  as the basic operator.
+@ It executes about 2/3 the number of instructions of David Conrad's approach,
+@  but requires more code, because it does all eight columns at once, instead
+@  of four at a time.
+	.type	loop_filter_v_v6, %function; loop_filter_v_v6: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		@ r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		@ r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		@ r9, r8 = <p6|p2>
+	MVN	r14,r6			@ r14= ~p1
+	LDRD	r10,[r0, r1]		@ r11,r10= <p7|p3>
+	@ Filter the first four columns.
+	MVN	r12,r8			@ r12= ~p2
+	UHADD8	r14,r14,r8		@ r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		@ r12= p1+~p2>>1
+	MVN	r10, r10		@ r10=~p3
+	MVN	r12,r12			@ r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		@ r14= m1=v1-u1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = m1^p0
+	EOR	r10,r10,r14		@ r10= m1^~p3
+	UHADD8	r4, r4, r10		@ r4 = (m1^p0)+(m1^~p3)>>1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		@ r14= v1=m1+u1
+	UHADD8	r4, r4, r12		@ r4 = a1+u1>>1
+	MVN	r12,r9			@ r12= ~p6
+	UHADD8	r4, r4, r14		@ r4 = f1=(a1+u1>>1)+v1>>1
+	@ Filter the second four columns.
+	MVN	r14,r7			@ r14= ~p5
+	UHADD8	r12,r12,r7		@ r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		@ r14= v2=~p5+p6>>1
+	MVN	r12,r12			@ r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			@ r11=~p7
+	SSUB8	r10,r14,r12		@ r10= m2=v2-u2
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = m2^p4
+	EOR	r11,r11,r10		@ r11= m2^~p7
+	UHADD8	r5, r5, r11		@ r5 = (m2^p4)+(m2^~p7)>>1
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	@ Single issue
+	UHADD8	r5, r5, r12		@ r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		@ r12 = {127}x4
+	UHADD8	r5, r5, r14		@ r5 = f2=(a2+u2>>1)+v2>>1
+	@ Now split f[i] by sign.
+	@ There's no min or max instruction.
+	@ We could use SSUB8 and SEL, but this is just as many instructions and
+	@  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		@ r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		@ r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		@ r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		@ r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		@ r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		@ r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		@ r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		@ r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		@ r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		@ r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		@ r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		@ r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		@ r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		@ r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		@ r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		@ [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		@ r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		@ [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	.size loop_filter_v_v6, .-loop_filter_v_v6	@ ENDP
+
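The comment block above loop_filter_v_v6 asserts a UHADD8-based identity for f = 127+R (u1/v1 there are the u and v of the first column group). A scalar brute-force checker for that claim, modelling UHADD8 as the truncating byte average and SSUB8/EOR as wrapping byte ops (illustrative; exhaustive over all four bytes, so it runs for a minute or two):

    /* Brute-force check of the identity claimed above: with per-byte ops,
       is uhadd8(uhadd8(a,u),v) == 127 + ((p0-p3+3*(p2-p1)+4)>>3) ? */
    #include <stdint.h>
    #include <stdio.h>
    static uint8_t uhadd8(uint8_t a,uint8_t b){ return (uint8_t)((a+b)>>1); }

    int main(void){
      long bad=0;
      for(int p0=0;p0<256;p0++)for(int p1=0;p1<256;p1++)
      for(int p2=0;p2<256;p2++)for(int p3=0;p3<256;p3++){
        uint8_t u=(uint8_t)~uhadd8((uint8_t)p1,(uint8_t)~p2);
        uint8_t v=uhadd8((uint8_t)~p1,(uint8_t)p2);
        uint8_t m=(uint8_t)(v-u);
        uint8_t a=(uint8_t)(m^uhadd8((uint8_t)(m^p0),(uint8_t)(m^~p3)));
        uint8_t f=uhadd8(uhadd8(a,u),v);
        int     r=(p0-p3+3*(p2-p1)+4)>>3;
        if(f!=(uint8_t)(127+r))bad++;
      }
      printf("mismatches: %ld\n",bad);
      return 0;
    }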
+	.type	oc_loop_filter_frag_rows_v6, %function; oc_loop_filter_frag_rows_v6: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	@ ll = *(int *)_bv
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_v6_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		@ if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_v6, .-oc_loop_filter_frag_rows_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	oc_loop_filter_init_neon
+	.global	oc_loop_filter_frag_rows_neon
+
+	.type	oc_loop_filter_init_neon, %function; oc_loop_filter_init_neon: @ PROC
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  @ r1 = 2*L
+	VDUP.S16	Q15, r1		@ Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0,:128]
+	MOV	PC,r14
+	.size oc_loop_filter_init_neon, .-oc_loop_filter_init_neon	@ ENDP
+
+	.type	loop_filter_h_neon, %function; loop_filter_h_neon: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	@ Doing a 2-element structure load saves doing two VTRN's below, at the
+	@  cost of using two more slower single-lane loads vs. the faster
+	@  all-lane loads.
+	@ It's less code this way, though, and benches a hair faster, but it
+	@  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		@ D0 = ____________1100     2,1
+						@ D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		@ D4 = ____________5544     2,1
+						@ D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		@ D0 = ________99881100     3,1
+						@ D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		@ D4 = ________DDCC5544     3,1
+						@ D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		@ D0 = ____GGHH99881100     3,1
+						@ D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		@ D4 = ____KKLLDDCC5544     3,1
+						@ D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		@ D0 = PPOOGGHH99881100     3,1
+						@ D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		@ D4 = TTSSKKLLDDCC5544     3,1
+						@ D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	@ D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	@ D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	@ Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   @ Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we need to negate the negative
+	@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 @ Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 @ Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		@ D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		@ D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		@ D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	.size loop_filter_h_neon, .-loop_filter_h_neon	@ ENDP
+
+	.type	loop_filter_v_neon, %function; loop_filter_v_neon: @ PROC
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12,:64], r1		@ D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12,:64], r1		@ D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12,:64], r1		@ D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12,:64]			@ D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	@ Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	@ Stall x2
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   @ Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	@ Stall x2
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we need to negate the negative
+	@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	@ Stall x3
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	@ Stall x2
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		@ D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		@ D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12,:64], r1
+	VST1.64	{D4}, [r12,:64], r1
+	MOV	PC,r14
+	.size loop_filter_v_neon, .-loop_filter_v_neon	@ ENDP
+
+	.type	oc_loop_filter_frag_rows_neon, %function; oc_loop_filter_frag_rows_neon: @ PROC
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	@		  bail
+	VLD1.64	{D30,D31}, [r2,:128]	@ Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_neon_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	.size oc_loop_filter_frag_rows_neon, .-oc_loop_filter_frag_rows_neon	@ ENDP
+  .endif
+
+	@ END
+    .section	.note.GNU-stack,"",%progbits
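
Aside: the clamp derivation repeated in the loop_filter_h_neon/loop_filter_v_neon comments above is easier to check in scalar form. Below is a minimal C sketch of the same identity (illustrative names, not code from the library); the brute-force main verifies that the clamped form and the sign-split form agree, which is what the chain of rewrites in the comments claims.

    #include <assert.h>
    #include <stdlib.h>   /* abs */

    /* Literal form of the first line of the comment:
       f is clamped to the range [MIN(-2L-f,0), MAX(2L-f,0)]. */
    static int lflim_clamp(int f, int L){
      int lo = -2*L - f; if(lo > 0) lo = 0;
      int hi =  2*L - f; if(hi < 0) hi = 0;
      return f < lo ? lo : f > hi ? hi : f;
    }

    /* The reduced form the vector code computes: split on the sign of f,
       take MIN(|f|, MAX(2L-|f|, 0)), then restore the sign. */
    static int lflim_split(int f, int L){
      int a = abs(f);
      int m = 2*L - a; if(m < 0) m = 0;
      if(a > m) a = m;
      return f < 0 ? -a : a;
    }

    int main(void){
      int f, L;
      for(L = 0; L <= 64; L++)for(f = -256; f <= 256; f++)
        assert(lflim_clamp(f, L) == lflim_split(f, L));
      return 0;
    }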

+ 676 - 0
modules/theoraplayer/native/theora/lib/arm/armloop.s

@@ -0,0 +1,676 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+	EXPORT	oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG	*	1
+
+	; Vanilla ARM v4 version
+loop_filter_h_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+loop_filter_v_arm PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_arm PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+
+ [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_loop_filter_init_v6
+	EXPORT	oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6 PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	ENDP
+
+; We could use the same strategy as the v filter below, but that would require
+;  40 instructions to load the data and transpose it into columns and another
+;  32 to write out the results at the end, plus the 52 instructions to do the
+;  filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+;  52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+;  proposed for FFmpeg, but not by much:
+;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+;  of four.
+loop_filter_h_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+loop_filter_h_core_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	;  unsigned saturated adds, which means the GE flags are still all
+	;  set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	;  the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	ENDP
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+;  filter value, f:
+;   u = ~UHADD8(p1,~p2);
+;   v = UHADD8(~p1,p2);
+;   m = v-u;
+;   a = m^UHADD8(m^p0,m^~p3);
+;   f = UHADD8(UHADD8(a,u1),v1);
+;  where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+;  as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+;  but requires more code, because it does all eight columns at once, instead
+;  of four at a time.
+loop_filter_v_v6 PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int            _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	;  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	ENDP
+
+oc_loop_filter_frag_rows_v6 PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;			  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_loop_filter_init_neon
+	EXPORT	oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon PROC
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  ; r1 = 2*L
+	VDUP.S16	Q15, r1		; Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0@128]
+	MOV	PC,r14
+	ENDP
+
+loop_filter_h_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		; D0 = ____________1100     2,1
+						; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		; D4 = ____________5544     2,1
+						; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		; D0 = ________99881100     3,1
+						; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		; D4 = ________DDCC5544     3,1
+						; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		; D0 = ____GGHH99881100     3,1
+						; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		; D4 = ____KKLLDDCC5544     3,1
+						; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		; D0 = PPOOGGHH99881100     3,1
+						; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		; D4 = TTSSKKLLDDCC5544     3,1
+						; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   ; Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 ; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 ; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		; D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	ENDP
+
+loop_filter_v_neon PROC
+	; r0 = unsigned char *_pix
+	; r1 = int            _ystride
+	; r2 = int           *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12@64], r1		; D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12@64], r1		; D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12@64], r1		; D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12@64]			; D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	;                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	;  We want to do
+	; f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign    1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   ; Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				    1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f              1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 ; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12@64], r1
+	VST1.64	{D4}, [r12@64], r1
+	MOV	PC,r14
+	ENDP
+
+oc_loop_filter_frag_rows_neon PROC
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2@128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
+ ]
+
+	END
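
The strategy comment ahead of loop_filter_v_v6 above builds the whole filter out of UHADD8, the ARMv6 per-byte halving add, and points out that it computes (a+b)>>1 per lane where x86 PAVGB would compute (a+b+1)>>1. A scalar C model of that primitive (a sketch, not library code), using the carry-free identity a+b = (a^b) + 2*(a&b):

    #include <assert.h>
    #include <stdint.h>

    /* Per-byte (a+b)>>1 across all four lanes of a 32-bit word, with no
       carries leaking between lanes. */
    static uint32_t uhadd8(uint32_t a, uint32_t b){
      return (a & b) + (((a ^ b) >> 1) & 0x7F7F7F7FU);
    }

    int main(void){
      /* 0x05 FF 02 03 and 0x03 01 02 05 average lane-wise to 0x04 80 02 04. */
      assert(uhadd8(0x05FF0203U, 0x03010205U) == 0x04800204U);
      return 0;
    }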

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts-gnu.s

@@ -0,0 +1,39 @@
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+@********************************************************************
+
+@ Set the following to 1 if we have EDSP instructions
+@  (LDRD/STRD, etc., ARMv5E and later).
+ .set OC_ARM_ASM_EDSP,	1
+
+@ Set the following to 1 if we have ARMv6 media instructions.
+ .set OC_ARM_ASM_MEDIA,	1
+
+@ Set the following to 1 if we have NEON (some ARMv7)
+ .set OC_ARM_ASM_NEON,	1
+
+@ Set the following to 1 if LDR/STR can work on unaligned addresses
+@ This is assumed to be true for ARMv6 and later code
+ .set OC_ARM_CAN_UNALIGN,	1
+
+@ Large unaligned loads and stores are often configured to cause an exception.
+@ They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+@  boundary, so it's usually a bad idea to use them anyway if they can be
+@  avoided.
+
+@ Set the following to 1 if LDRD/STRD can work on unaligned addresses
+ .set OC_ARM_CAN_UNALIGN_LDRD,	0
+
+@ END:

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts.s

@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	0
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	0
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	0
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END

+ 39 - 0
modules/theoraplayer/native/theora/lib/arm/armopts.s.in

@@ -0,0 +1,39 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+; Original implementation:
+;  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armopts.s.in 17430 2010-09-22 21:54:09Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP		*	@HAVE_ARM_ASM_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA	*	@HAVE_ARM_ASM_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON		*	@HAVE_ARM_ASM_NEON@
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN	*	0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+;  boundary, so it's usually a bad idea to use them anyway if they can be
+;  avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD	*	0
+
+	END

+ 219 - 0
modules/theoraplayer/native/theora/lib/arm/armstate.c

@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif
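
One detail of the DC-only path shared by the three oc_state_frag_recon_* variants above: because shifts bind more loosely than addition, _dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5 parses as (coeff*quant+15)>>5, i.e. the product divided by 32 and rounded to nearest, with exact halves toward minus infinity. A throwaway check (not library code; like the surrounding sources it assumes an arithmetic right shift of negative values):

    #include <assert.h>

    /* (x*q + 15) >> 5 == floor((x*q + 15)/32). */
    static int dc_round(int x, int q){
      return (x*q + 15) >> 5;
    }

    int main(void){
      assert(dc_round( 1, 17) ==  1);  /*  17/32 ~  0.53 ->  1 */
      assert(dc_round(-1, 17) == -1);  /* -17/32 ~ -0.53 -> -1 */
      assert(dc_round( 1, 16) ==  0);  /*  16/32 =  0.5  ->  0 (half rounds down) */
      return 0;
    }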

+ 236 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armbits.asm

@@ -0,0 +1,236 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@
+@ function:
+@   last mod: $Id: armbits.s 17481 2010-10-03 22:49:42Z tterribe $
+@
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global _oc_pack_read_arm
+	.global _oc_pack_read1_arm
+	.global _oc_huff_token_decode_arm
+
+	@ .type oc_pack_read1_arm, %function; oc_pack_read1_arm: @ PROC
+_oc_pack_read1_arm:
+	@ r0 = oc_pack_buf *_b
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,#1          @ r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      @ r0 = window>>31
+	MOV r2,r2,LSL #1       @ r2 = window<<=1
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+	@ .size oc_pack_read1_arm, .-oc_pack_read1_arm	@ ENDP
+
+	@ .type oc_pack_read_arm, %function; oc_pack_read_arm: @ PROC
+_oc_pack_read_arm:
+	@ r0 = oc_pack_buf *_b
+	@ r1 = int          _bits
+	ADD r12,r0,#8
+	LDMIA r12,{r2,r3}      @ r2 = window
+	@ Stall...             ; r3 = available
+	@ Stall...
+	SUBS r3,r3,r1          @ r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	MOV PC,r14
+
+@ We need to refill window.
+oc_pack_read1_refill:
+	MOV r1,#1
+oc_pack_read_refill:
+	STMFD r13!,{r10,r11,r14}
+	LDMIA r0,{r10,r11}     @ r10 = stop
+	                       @ r11 = ptr
+	RSB r0,r1,#32          @ r0 = 32-_bits
+	RSB r3,r3,r0           @ r3 = 32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMP r10,r11            @ ptr<stop => HI
+	CMPHI r3,#7            @   available<=24 => HI
+	LDRBHI r14,[r11],#1    @     r14 = *ptr++
+	SUBHI r3,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @     r2 = window|=r14<<32-available
+	CMPHI r10,r11          @     ptr<stop => HI
+	CMPHI r3,#7            @       available<=24 => HI
+	LDRBHI r14,[r11],#1    @         r14 = *ptr++
+	SUBHI r3,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @         r2 = window|=r14<<32-available
+	CMPHI r10,r11          @         ptr<stop => HI
+	CMPHI r3,#7            @           available<=24 => HI
+	LDRBHI r14,[r11],#1    @             r14 = *ptr++
+	SUBHI r3,#8            @             available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @             r2 = window|=r14<<32-available
+	CMPHI r10,r11          @             ptr<stop => HI
+	CMPHI r3,#7            @               available<=24 => HI
+	LDRBHI r14,[r11],#1    @                 r14 = *ptr++
+	SUBHI r3,#8            @                 available += 8
+	@ (HI) Stall...
+	ORRHI r2,r14,LSL r3    @                 r2 = window|=r14<<32-available
+	SUBS r3,r0,r3          @ r3 = available-=_bits, available<_bits => LT
+	BLT oc_pack_read_refill_last
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+
+@ Either we wanted to read more than 24 bits and didn't have enough room to
+@  stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last:
+	CMP r11,r10            @ ptr<stop => LO
+@ If we didn't hit the end of the packet, then pull enough of the next byte to
+@  to fill up the window.
+	LDRBLO r14,[r11]       @ (LO) r14 = *ptr
+@ Otherwise, set the EOF flag and pretend we have lots of available bits.
+	MOVHS r14,#1           @ (HS) r14 = 1
+	ADDLO r10,r3,r1        @ (LO) r10 = available
+	STRHS r14,[r12,#8]     @ (HS) eof = 1
+	ANDLO r10,r10,#7       @ (LO) r10 = available&7
+	MOVHS r3,#1<<30        @ (HS) available = OC_LOTS_OF_BITS
+	ORRLO r2,r14,LSL r10   @ (LO) r2 = window|=*ptr>>(available&7)
+	MOV r0,r2,LSR r0       @ r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       @ r2 = window<<=_bits
+	STR r11,[r12,#-4]      @ ptr = r11
+	STMIA r12,{r2,r3}      @ window = r2
+	                       @ available = r3
+	LDMFD r13!,{r10,r11,PC}
+	@ .size oc_pack_read_arm, .-oc_pack_read_arm	@ ENDP
+
+
+
+	@ .type oc_huff_token_decode_arm, %function; oc_huff_token_decode_arm: @ PROC
+_oc_huff_token_decode_arm:
+	@ r0 = oc_pack_buf       *_b
+	@ r1 = const ogg_int16_t *_tree
+	STMFD r13!,{r4,r5,r10,r14}
+	LDRSH r10,[r1]         @ r10 = n=_tree[0]
+	LDMIA r0,{r2-r5}       @ r2 = stop
+	@ Stall...             ; r3 = ptr
+	@ Stall...             ; r4 = window
+	                       @ r5 = available
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill0
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r14,r1,r14,LSL #1  @ r14 = _tree+bits
+	LDRSH r12,[r14,#2]     @ r12 = node=_tree[1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+@ The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue:
+	ADD r12,r1,r12,LSL #1  @ r12 = _tree+node
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r5,r10          @ r5 = available-=n
+	LDRSH r10,[r12],#2     @ r10 = n=_tree[node]
+	@ Stall...             ; r12 = _tree+node+1
+	@ Stall...
+	CMP r10,r5             @ n>available => GT
+	BGT oc_huff_token_decode_refill
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	@ Stall...
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node&255
+	LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0:
+	ADD r12,r1,#2          @ r12 = _tree+1
+oc_huff_token_decode_refill:
+@ We can't possibly need more than 15 bits, so available must be <= 15.
+@ Therefore we can load at least two bytes without checking it.
+	CMP r2,r3              @ ptr<stop => HI
+	LDRBHI r14,[r3],#1     @   r14 = *ptr++
+	RSBHI r5,r5,#24        @ (HI) available = 32-(available+=8)
+	RSBLS r5,r5,#32        @ (LS) r5 = 32-available
+	ORRHI r4,r14,LSL r5    @   r4 = window|=r14<<32-available
+	CMPHI r2,r3            @   ptr<stop => HI
+	LDRBHI r14,[r3],#1     @     r14 = *ptr++
+	SUBHI r5,#8            @     available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @     r4 = window|=r14<<32-available
+@ We can use unsigned compares for both the pointers and for available
+@  (allowing us to chain condition codes) because available will never be
+@  larger than 32 (or we wouldn't be here), and thus 32-available will never be
+@  negative.
+	CMPHI r2,r3            @     ptr<stop => HI
+	CMPHI r5,#7            @       available<=24 => HI
+	LDRBHI r14,[r3],#1     @         r14 = *ptr++
+	SUBHI r5,#8            @         available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @         r4 = window|=r14<<32-available
+	CMP r2,r3              @ ptr<stop => HI
+	MOVLS r5,#-1<<30       @ (LS) available = OC_LOTS_OF_BITS+32
+	CMPHI r5,#7            @ (HI) available<=24 => HI
+	LDRBHI r14,[r3],#1     @ (HI)   r14 = *ptr++
+	SUBHI r5,#8            @ (HI)   available += 8
+	@ (HI) Stall...
+	ORRHI r4,r14,LSL r5    @ (HI)   r4 = window|=r14<<32-available
+	RSB r14,r10,#32        @ r14 = 32-n
+	MOV r14,r4,LSR r14     @ r14 = bits=window>>32-n
+	ADD r12,r12,r14        @
+	LDRSH r12,[r12,r14]    @ r12 = node=_tree[node+1+bits]
+	RSB r5,r5,#32          @ r5 = available
+	@ Stall...
+	RSBS r14,r12,#0        @ r14 = -node, node>0 => MI
+	BMI oc_huff_token_decode_continue
+	MOV r10,r14,LSR #8     @ r10 = n=node>>8
+	MOV r4,r4,LSL r10      @ r4 = window<<=n
+	SUB r5,r10             @ r5 = available-=n
+	STMIB r0,{r3-r5}       @ ptr = r3
+	                       @ window = r4
+	                       @ available = r5
+	AND r0,r14,#255        @ r0 = node0x255
+	AND r0,r14,#255        @ r0 = node&255
+	@ .size oc_huff_token_decode_arm, .-oc_huff_token_decode_arm	@ ENDP
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
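
The comments in _oc_pack_read_arm and _oc_pack_read1_arm above describe a left-aligned 32-bit bit window: bits are consumed from the top, and the window is refilled a byte at a time while fewer than 25 bits remain and the packet still has data; _oc_huff_token_decode_arm walks its tree against the same window. A compact C sketch of that scheme with illustrative names (it deliberately omits the end-of-packet / OC_LOTS_OF_BITS handling the assembly performs):

    #include <stdint.h>

    typedef struct{
      const unsigned char *stop;    /* one past the last byte of the packet */
      const unsigned char *ptr;     /* next unread byte                     */
      uint32_t             window;  /* upcoming bits, left-aligned          */
      int                  available;
    }bit_window;

    static void bw_refill(bit_window *b){
      while(b->ptr < b->stop && b->available <= 24){
        b->available += 8;
        b->window |= (uint32_t)*b->ptr++ << (32 - b->available);
      }
    }

    /* Read 1 <= n <= 24 bits from the top of the window. */
    static uint32_t bw_read(bit_window *b, int n){
      uint32_t v;
      if(b->available < n) bw_refill(b);
      v = b->window >> (32 - n);
      b->window <<= n;
      b->available -= n;
      return v;
    }

    int main(void){
      static const unsigned char pkt[] = {0xA5, 0x3C};
      bit_window b = {pkt + sizeof pkt, pkt, 0, 0};
      return bw_read(&b, 4) == 0xA && bw_read(&b, 8) == 0x53 ? 0 : 1;
    }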

+ 32 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armbits.h

@@ -0,0 +1,32 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_pack_read oc_pack_read_arm
+#  define oc_pack_read1 oc_pack_read1_arm
+#  define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
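
Since the header above remaps the generic names with #define when OC_ARM_ASM is set, the switch to the ARM bitpack routines happens in the preprocessor rather than through a function-pointer table. A hypothetical caller (assuming the libtheora tree on the include path; not from the library):

    #include "armbits.h"

    /* With OC_ARM_ASM defined, this compiles as a call to oc_pack_read1_arm(). */
    static int read_coded_flag(oc_pack_buf *_b){
      return oc_pack_read1(_b);
    }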

+ 127 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.c

@@ -0,0 +1,127 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+  last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_MEDIA)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  flags=0;
+  /*MSVC has no inline __asm support for ARM, but it does let you __emit
+     instructions via their assembled hex code.
+    All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OC_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OC_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OC_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t  flags;
+  FILE         *fin;
+  flags=0;
+  /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+     Android.
+    This also means that detection will fail in Scratchbox.*/
+  fin=fopen("/proc/cpuinfo","r");
+  if(fin!=NULL){
+    /*512 should be enough for anybody (it's even enough for all the flags that
+       x86 has accumulated... so far).*/
+    char buf[512];
+    while(fgets(buf,511,fin)!=NULL){
+      if(memcmp(buf,"Features",8)==0){
+        char *p;
+        p=strstr(buf," edsp");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+        p=strstr(buf," neon");
+        if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+      }
+      if(memcmp(buf,"CPU architecture:",17)==0){
+        int version;
+        version=atoi(buf+17);
+        if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+      }
+    }
+    fclose(fin);
+  }
+  return flags;
+}
+
+#elif defined(_IOS)
+
+ogg_uint32_t oc_cpu_flags_get(void){
+	ogg_uint32_t flags;
+	flags=0;
+	flags|=OC_CPU_ARM_EDSP;
+	flags|=OC_CPU_ARM_MEDIA;
+	flags|=OC_CPU_ARM_NEON;
+	return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+   accessible in privileged modes only, so we can't have a general user-space
+   detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform.  Reconfigure with --disable-asm (or send patches)."
+#endif
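
The " edsp" and " neon" lookups in the Linux path above deliberately include a leading space and then require the match to end in a space or newline, so only whole words in the Features line count. A small standalone demo of that check (the sample cpuinfo line is made up for illustration):

    #include <stdio.h>
    #include <string.h>

    int main(void){
      /*A made-up /proc/cpuinfo "Features" line, only to exercise the check.*/
      static const char buf[]=
       "Features\t: swp half thumb fastmult vfp edsp neon vfpv3\n";
      const char *p;
      p=strstr(buf," neon");
      if(p!=NULL&&(p[5]==' '||p[5]=='\n'))printf("neon reported\n");
      p=strstr(buf," edsp");
      if(p!=NULL&&(p[5]==' '||p[5]=='\n'))printf("edsp reported\n");
      return 0;
    }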

+ 29 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armcpu.h

@@ -0,0 +1,29 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA    (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP     (1<<7)
+#define OC_CPU_ARM_NEON     (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif

+ 57 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armenc.c

@@ -0,0 +1,57 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_neon;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_neon;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_neon;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif
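
The init routine above always starts from the portable C dispatch table and then, for each capability that both the build flags (OC_ARM_ASM_*) and the running CPU report, overwrites individual slots with accelerated versions. A stripped-down sketch of that pattern with a hypothetical one-entry table (the real oc_enc_ctx and opt_vtable layouts are not reproduced here):

    #include "armcpu.h"

    /*Hypothetical stand-ins, only to show the flag-gated vtable fill.*/
    typedef struct{
      void (*quantize)(short *_qdct,const short *_dct);
    }demo_vtable;

    static void demo_quantize_c(short *_qdct,const short *_dct){}
    static void demo_quantize_neon(short *_qdct,const short *_dct){}

    static void demo_accel_init(demo_vtable *_vt){
      ogg_uint32_t cpu_flags;
      cpu_flags=oc_cpu_flags_get();
      _vt->quantize=demo_quantize_c;        /*portable code is the baseline*/
      if(cpu_flags&OC_CPU_ARM_NEON)_vt->quantize=demo_quantize_neon;
    }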

+ 51 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armenc.h

@@ -0,0 +1,51 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+unsigned oc_enc_frag_satd_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_neon(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_neon(int *_dc,
+ const unsigned char *_src,int _ystride);
+
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 668 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armfrag.asm

@@ -0,0 +1,668 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+@ Vanilla ARM v4 versions
+	.global	_oc_frag_copy_list_arm
+	.global	_oc_frag_recon_intra_arm
+	.global	_oc_frag_recon_inter_arm
+	.global	_oc_frag_recon_inter2_arm
+
+	@ .type oc_frag_copy_list_arm, %function; oc_frag_copy_list_arm: @ PROC
+_oc_frag_copy_list_arm:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r6,r11,r14}
+	SUBS	r12, r12, #1
+	LDR	r4,[r3],#4		@ r4 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_arm_end
+	SUB	r2, r2, #4
+ofcl_arm_lp:
+	LDR	r11,[r14,r4,LSL #2]	@ r11 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	ADD	r4, r1, r11		@ r4 = _src_frame+frag_buf_off
+	LDR	r6, [r4], #4
+	ADD	r11,r0, r11		@ r11 = _dst_frame+frag_buf_off
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4], r2
+	STR	r6, [r11],#4
+	LDR	r6, [r4], #4
+	STR	r5, [r11],r2
+	LDR	r5, [r4]
+	LDRGE	r4,[r3],#4		@ r4 = _fragis[fragii]
+	STR	r6, [r11],#4
+	STR	r5, [r11]
+	BGE	ofcl_arm_lp
+ofcl_arm_end:
+	LDMFD	r13!,{r4-r6,r11,PC}
+_oc_frag_recon_intra_arm:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4,r5,r14}
+	MOV	r14,#8
+	MOV	r5, #255
+	SUB	r1, r1, #7
+ofrintra_lp_arm:
+	LDRSH	r3, [r2], #2
+	LDRSH	r4, [r2], #2
+	LDRSH	r12,[r2], #2
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	LDRSH	r12,[r2], #2
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	LDRSH	r3, [r2], #2
+	STRB	r4, [r0], #1
+	ADDS	r12,r12,#128
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	LDRSH	r4, [r2], #2
+	STRB	r12,[r0], #1
+	ADDS	r3, r3, #128
+	CMPGT	r5, r3
+	EORLT	r3, r5, r3, ASR #32
+	STRB	r3, [r0], #1
+	ADDS	r4, r4, #128
+	CMPGT	r5, r4
+	EORLT	r4, r5, r4, ASR #32
+	STRB	r4, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	ofrintra_lp_arm
+	LDMFD	r13!,{r4,r5,PC}
+	@ .size oc_frag_copy_list_arm, .-oc_frag_copy_list_arm	@ ENDP
+
+	@ .type oc_frag_recon_inter_arm, %function; oc_frag_recon_inter_arm: @ PROC
+_oc_frag_recon_inter_arm:
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src
+	@ r2 =       int            ystride
+	@ r3 = const ogg_int16_t    residue[64]
+	STMFD	r13!,{r5,r9-r11,r14}
+	MOV	r9, #8
+	MOV	r5, #255
+	SUB	r2, r2, #7
+ofrinter_lp_arm:
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], #1
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	LDRSH	r12,[r3], #2
+	LDRB	r14,[r1], #1
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], #1
+	ADDS	r12,r12,r14
+	CMPGT	r5, r12
+	LDRSH	r11,[r3], #2
+	LDRB	r10,[r1], r2
+	EORLT	r12,r5, r12,ASR #32
+	STRB	r12,[r0], #1
+	ADDS	r11,r11,r10
+	CMPGT	r5, r11
+	EORLT	r11,r5, r11,ASR #32
+	STRB	r11,[r0], r2
+	SUBS	r9, r9, #1
+	BGT	ofrinter_lp_arm
+	LDMFD	r13!,{r5,r9-r11,PC}
+	@ .size oc_frag_recon_inter_arm, .-oc_frag_recon_inter_arm	@ ENDP
+
+	@ .type oc_frag_recon_inter2_arm, %function; oc_frag_recon_inter2_arm: @ PROC
+_oc_frag_recon_inter2_arm:
+	@ r0 =       unsigned char *dst
+	@ r1 = const unsigned char *src1
+	@ r2 = const unsigned char *src2
+	@ r3 =       int            ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    residue[64]
+	STMFD	r13!,{r4-r8,r14}
+	MOV	r14,#8
+	MOV	r8, #255
+	SUB	r3, r3, #7
+ofrinter2_lp_arm:
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	LDRB	r7, [r1], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	LDRB	r5, [r1], #1
+	LDRB	r6, [r2], #1
+	LDRSH	r4, [r12],#2
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], #1
+	ADD	r5, r5, r6
+	ADDS	r5, r4, r5, LSR #1
+	CMPGT	r8, r5
+	LDRB	r7, [r1], r3
+	LDRB	r6, [r2], r3
+	LDRSH	r4, [r12],#2
+	EORLT	r5, r8, r5, ASR #32
+	STRB	r5, [r0], #1
+	ADD	r7, r7, r6
+	ADDS	r7, r4, r7, LSR #1
+	CMPGT	r8, r7
+	EORLT	r7, r8, r7, ASR #32
+	STRB	r7, [r0], r3
+	SUBS	r14,r14,#1
+	BGT	ofrinter2_lp_arm
+	LDMFD	r13!,{r4-r8,PC}
+	@ .size oc_frag_recon_inter2_arm, .-oc_frag_recon_inter2_arm	@ ENDP
+
+  .if OC_ARM_ASM_EDSP
+	.global	_oc_frag_copy_list_edsp
+
+	@ .type oc_frag_copy_list_edsp, %function; oc_frag_copy_list_edsp: @ PROC
+_oc_frag_copy_list_edsp:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r11,r14}
+	SUBS	r12, r12, #1
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*10]		@ r14 = _frag_buf_offs
+	BLT	ofcl_edsp_end
+ofcl_edsp_lp:
+	MOV	r4, r1
+	LDR	r5, [r14,r5, LSL #2]	@ r5 = _frag_buf_offs[_fragis[fragii]]
+	SUBS	r12, r12, #1
+	@ Stall (on XScale)
+	LDRD	r6, r7, [r4, r5]!		@ r4 = _src_frame+frag_buf_off
+	LDRD	r8, r9, [r4, r2]!
+	@ Stall
+	STRD	r6, r7, [r5, r0]!		@ r5 = _dst_frame+frag_buf_off
+	STRD	r8, r9, [r5, r2]!
+	@ Stall
+	LDRD	r6, r7, [r4, r2]!	@ On Xscale at least, doing 3 consecutive
+	LDRD	r8, r9, [r4, r2]!	@ loads causes a stall, but that's no worse
+	LDRD	r10,r11,[r4, r2]!	@ than us only doing 2, and having to do
+				@ another pair of LDRD/STRD later on.
+	@ Stall
+	STRD	r6, r7, [r5, r2]!
+	STRD	r8, r9, [r5, r2]!
+	STRD	r10,r11,[r5, r2]!
+	LDRD	r6, r7, [r4, r2]!
+	LDRD	r8, r9, [r4, r2]!
+	LDRD	r10,r11,[r4, r2]!
+	STRD	r6, r7, [r5, r2]!
+	STRD	r8, r9, [r5, r2]!
+	STRD	r10,r11,[r5, r2]!
+	LDRGE	r5, [r3],#4		@ r5 = _fragis[fragii]
+	BGE	ofcl_edsp_lp
+ofcl_edsp_end:
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_frag_copy_list_edsp, .-oc_frag_copy_list_edsp	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_frag_recon_intra_v6
+	.global	_oc_frag_recon_inter_v6
+	.global	_oc_frag_recon_inter2_v6
+
+	@ .type oc_frag_recon_intra_v6, %function; oc_frag_recon_intra_v6: @ PROC
+_oc_frag_recon_intra_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r6,r14}
+	MOV	r14,#8
+	MOV	r12,r2
+	MOV	r6, #0x0080
+	MOVT	r6, #0x0080
+ofrintra_v6_lp:
+	LDRD	r2, r3, [r12],#8	@ r2 = 11110000 r3 = 33332222
+	LDRD	r4, r5, [r12],#8	@ r4 = 55554444 r5 = 77776666
+	SUBS	r14,r14,#1
+	QADD16	r2, r2, r6
+	QADD16	r3, r3, r6
+	QADD16	r4, r4, r6
+	QADD16	r5, r5, r6
+	USAT16	r2, #8, r2		@ r2 = __11__00
+	USAT16	r3, #8, r3		@ r3 = __33__22
+	USAT16	r4, #8, r4		@ r4 = __55__44
+	USAT16	r5, #8, r5		@ r5 = __77__66
+	ORR	r2, r2, r2, LSR #8	@ r2 = __111100
+	ORR	r3, r3, r3, LSR #8	@ r3 = __333322
+	ORR	r4, r4, r4, LSR #8	@ r4 = __555544
+	ORR	r5, r5, r5, LSR #8	@ r5 = __777766
+	PKHBT   r2, r2, r3, LSL #16     @ r2 = 33221100
+	PKHBT   r3, r4, r5, LSL #16     @ r3 = 77665544
+	STRD	r2, r3, [r0], r1
+	BGT	ofrintra_v6_lp
+	LDMFD	r13!,{r4-r6,PC}
+	@ .size oc_frag_recon_intra_v6, .-oc_frag_recon_intra_v6	@ ENDP
+
+	@ .type oc_frag_recon_inter_v6, %function; oc_frag_recon_inter_v6: @ PROC
+_oc_frag_recon_inter_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r7,r14}
+	MOV	r14,#8
+ofrinter_v6_lp:
+	LDRD	r6, r7, [r3], #8		@ r6 = 11110000 r7 = 33332222
+	SUBS	r14,r14,#1
+  .if OC_ARM_CAN_UNALIGN_LDRD
+	LDRD	r4, r5, [r1], r2	@ Unaligned ; r4 = 33221100 r5 = 77665544
+  .else
+	LDR	r5, [r1, #4]
+	LDR	r4, [r1], r2
+  .endif
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 22220000
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r6,r4			@ r6 = __22__00
+	UXTB16	r4,r4, ROR #8		@ r4 = __33__11
+	QADD16	r12,r12,r6		@ r12= xx22xx00
+	QADD16	r4, r7, r4		@ r4 = xx33xx11
+	LDRD	r6, r7, [r3], #8		@ r6 = 55554444 r7 = 77776666
+	USAT16	r4, #8, r4		@ r4 = __33__11
+	USAT16	r12,#8,r12		@ r12= __22__00
+	ORR	r4, r12,r4, LSL #8	@ r4 = 33221100
+	PKHBT	r12,r6, r7, LSL #16	@ r12= 66664444
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 77775555
+	UXTB16	r6,r5			@ r6 = __66__44
+	UXTB16	r5,r5, ROR #8		@ r5 = __77__55
+	QADD16	r12,r12,r6		@ r12= xx66xx44
+	QADD16	r5, r7, r5		@ r5 = xx77xx55
+	USAT16	r12,#8, r12		@ r12= __66__44
+	USAT16	r5, #8, r5		@ r5 = __77__55
+	ORR	r5, r12,r5, LSL #8	@ r5 = 77665544
+	STRD	r4, r5, [r0], r2
+	BGT	ofrinter_v6_lp
+	LDMFD	r13!,{r4-r7,PC}
+	@ .size oc_frag_recon_inter_v6, .-oc_frag_recon_inter_v6	@ ENDP
+
+	@ .type oc_frag_recon_inter2_v6, %function; oc_frag_recon_inter2_v6: @ PROC
+_oc_frag_recon_inter2_v6:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	STMFD	r13!,{r4-r9,r14}
+	MOV	r14,#8
+ofrinter2_v6_lp:
+	LDRD	r6, r7, [r12,#8]	@ r6 = 55554444 r7 = 77776666
+	SUBS	r14,r14,#1
+	LDR	r4, [r1, #4]	@ Unaligned	; r4 = src1[1] = 77665544
+	LDR	r5, [r2, #4]	@ Unaligned	; r5 = src2[1] = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 66664444
+	PKHTB	r9, r7, r6, ASR #16	@ r9 = 77775555
+	UHADD8	r4, r4, r5	@ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+	UXTB16	r5, r4			@ r5 = __66__44
+	UXTB16	r4, r4, ROR #8		@ r4 = __77__55
+	QADD16	r8, r8, r5		@ r8 = xx66xx44
+	QADD16	r9, r9, r4		@ r9 = xx77xx55
+	LDRD	r6, r7, [r12],#16	@ r6 = 33332222 r7 = 11110000
+	USAT16	r8, #8, r8		@ r8 = __66__44
+	LDR	r4, [r1], r3	@ Unaligned	; r4 = src1[0] = 33221100
+	USAT16	r9, #8, r9		@ r9 = __77__55
+	LDR	r5, [r2], r3	@ Unaligned	; r5 = src2[0] = 33221100
+	ORR	r9, r8, r9, LSL #8	@ r9 = 77665544
+	PKHBT	r8, r6, r7, LSL #16	@ r8 = 22220000
+	UHADD8	r4, r4, r5	@ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+	PKHTB	r7, r7, r6, ASR #16	@ r7 = 33331111
+	UXTB16	r5, r4			@ r5 = __22__00
+	UXTB16	r4, r4, ROR #8		@ r4 = __33__11
+	QADD16	r8, r8, r5		@ r8 = xx22xx00
+	QADD16	r7, r7, r4		@ r7 = xx33xx11
+	USAT16	r8, #8, r8		@ r8 = __22__00
+	USAT16	r7, #8, r7		@ r7 = __33__11
+	ORR	r8, r8, r7, LSL #8	@ r8 = 33221100
+	STRD	r8, r9, [r0], r3
+	BGT	ofrinter2_v6_lp
+	LDMFD	r13!,{r4-r9,PC}
+	@ .size oc_frag_recon_inter2_v6, .-oc_frag_recon_inter2_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_frag_copy_list_neon
+	.global	_oc_frag_recon_intra_neon
+	.global	_oc_frag_recon_inter_neon
+	.global	_oc_frag_recon_inter2_neon
+
+	@ .type oc_frag_copy_list_neon, %function; oc_frag_copy_list_neon: @ PROC
+_oc_frag_copy_list_neon:
+	@ r0 = _dst_frame
+	@ r1 = _src_frame
+	@ r2 = _ystride
+	@ r3 = _fragis
+	@ <> = _nfragis
+	@ <> = _frag_buf_offs
+	LDR	r12,[r13]		@ r12 = _nfragis
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		@ r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		@ r14 = _frag_buf_offs
+	BLT	ofcl_neon_end
+	@ Stall (2 on Xscale)
+	LDR	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	@ Stall (on XScale)
+	MOV	r7, r6			@ Guarantee PLD points somewhere valid.
+ofcl_neon_lp:
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4,:64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D1}, [r4,:64], r2
+	SUBS	r12, r12, #1
+	VLD1.64	{D2}, [r4,:64], r2
+	LDRGT	r6, [r3,#4]!		@ r6 = _fragis[fragii]
+	VLD1.64	{D3}, [r4,:64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	@ r6 = _frag_buf_offs[_fragis[fragii]]
+	VLD1.64	{D4}, [r4,:64], r2
+	ADDGT	r7, r1, r6
+	VLD1.64	{D5}, [r4,:64], r2
+	PLD	[r7]
+	VLD1.64	{D6}, [r4,:64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D7}, [r4,:64]
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D0}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D1}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D2}, [r5,:64], r2
+	PLD	[r7]
+	VST1.64	{D3}, [r5,:64], r2
+	PLD	[r7, r2]
+	VST1.64	{D4}, [r5,:64], r2
+	PLD	[r7, r2, LSL #1]
+	VST1.64	{D5}, [r5,:64], r2
+	ADDGT	r7, r7, r2, LSL #2
+	VST1.64	{D6}, [r5,:64], r2
+	PLD	[r7, -r2]
+	VST1.64	{D7}, [r5,:64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end:
+	LDMFD	r13!,{r4-r7,PC}
+	@ .size oc_frag_copy_list_neon, .-oc_frag_copy_list_neon	@ ENDP
+
+	@ .type oc_frag_recon_intra_neon, %function; oc_frag_recon_intra_neon: @ PROC
+_oc_frag_recon_intra_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 =       int            _ystride
+	@ r2 = const ogg_int16_t    _residue[64]
+	VMOV.I16	Q0, #128
+	VLDMIA	r2,  {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q0
+	VQADD.S16	Q10,Q10,Q0
+	VQADD.S16	Q11,Q11,Q0
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q0
+	VQADD.S16	Q14,Q14,Q0
+	VQADD.S16	Q15,Q15,Q0
+	VQMOVUN.S16	D16,Q8	@ D16= 7766554433221100		; 1 cycle
+	VQMOVUN.S16	D17,Q9	@ D17= FFEEDDCCBBAA9988		; 1 cycle
+	VQMOVUN.S16	D18,Q10	@ D18= NNMMLLKKJJIIHHGG		; 1 cycle
+	VST1.64	{D16},[r0,:64], r1
+	VQMOVUN.S16	D19,Q11	@ D19= VVUUTTSSRRQQPPOO		; 1 cycle
+	VST1.64	{D17},[r0,:64], r1
+	VQMOVUN.S16	D20,Q12	@ D20= ddccbbaaZZYYXXWW		; 1 cycle
+	VST1.64	{D18},[r0,:64], r1
+	VQMOVUN.S16	D21,Q13	@ D21= llkkjjiihhggffee		; 1 cycle
+	VST1.64	{D19},[r0,:64], r1
+	VQMOVUN.S16	D22,Q14	@ D22= ttssrrqqppoonnmm		; 1 cycle
+	VST1.64	{D20},[r0,:64], r1
+	VQMOVUN.S16	D23,Q15	@ D23= !!,:@zzyyxxwwvvuu		; 1 cycle
+	VST1.64	{D21},[r0,:64], r1
+	VST1.64	{D22},[r0,:64], r1
+	VST1.64	{D23},[r0,:64], r1
+	MOV	PC,R14
+	@ .size oc_frag_recon_intra_neon, .-oc_frag_recon_intra_neon	@ ENDP
+
+	@ .type oc_frag_recon_inter_neon, %function; oc_frag_recon_inter_neon: @ PROC
+_oc_frag_recon_inter_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src
+	@ r2 =       int            _ystride
+	@ r3 = const ogg_int16_t    _residue[64]
+	VLDMIA	r3, {D16-D31}	@ D16= 3333222211110000 etc	; 9(8) cycles
+	VLD1.64	{D0}, [r1], r2
+	VLD1.64	{D2}, [r1], r2
+	VMOVL.U8	Q0, D0	@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D4}, [r1], r2
+	VMOVL.U8	Q1, D2	@ etc
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q8, Q8, Q0
+	VLD1.64	{D0}, [r1], r2
+	VQADD.S16	Q9, Q9, Q1
+	VLD1.64	{D2}, [r1], r2
+	VQADD.S16	Q10,Q10,Q2
+	VLD1.64	{D4}, [r1], r2
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D6}, [r1], r2
+	VMOVL.U8	Q0, D0
+	VMOVL.U8	Q1, D2
+	VMOVL.U8	Q2, D4
+	VMOVL.U8	Q3, D6
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q1
+	VQADD.S16	Q14,Q14,Q2
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r2
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r2
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r2
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r2
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r2
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r2
+	VST1.64	{D22},[r0,:64], r2
+	VST1.64	{D23},[r0,:64], r2
+	MOV	PC,R14
+	@ .size oc_frag_recon_inter_neon, .-oc_frag_recon_inter_neon	@ ENDP
+
+	@ .type oc_frag_recon_inter2_neon, %function; oc_frag_recon_inter2_neon: @ PROC
+_oc_frag_recon_inter2_neon:
+	@ r0 =       unsigned char *_dst
+	@ r1 = const unsigned char *_src1
+	@ r2 = const unsigned char *_src2
+	@ r3 =       int            _ystride
+	LDR	r12,[r13]
+	@ r12= const ogg_int16_t    _residue[64]
+	VLDMIA	r12,{D16-D31}
+	VLD1.64	{D0}, [r1], r3
+	VLD1.64	{D4}, [r2], r3
+	VLD1.64	{D1}, [r1], r3
+	VLD1.64	{D5}, [r2], r3
+	VHADD.U8	Q2, Q0, Q2	@ Q2 = FFEEDDCCBBAA99887766554433221100
+	VLD1.64	{D2}, [r1], r3
+	VLD1.64	{D6}, [r2], r3
+	VMOVL.U8	Q0, D4		@ Q0 = __77__66__55__44__33__22__11__00
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q2, D5		@ etc
+	VLD1.64	{D7}, [r2], r3
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q8, Q8, Q0
+	VQADD.S16	Q9, Q9, Q2
+	VLD1.64	{D0}, [r1], r3
+	VMOVL.U8	Q1, D6
+	VLD1.64	{D4}, [r2], r3
+	VMOVL.U8	Q3, D7
+	VLD1.64	{D1}, [r1], r3
+	VQADD.S16	Q10,Q10,Q1
+	VLD1.64	{D5}, [r2], r3
+	VQADD.S16	Q11,Q11,Q3
+	VLD1.64	{D2}, [r1], r3
+	VHADD.U8	Q2, Q0, Q2
+	VLD1.64	{D6}, [r2], r3
+	VLD1.64	{D3}, [r1], r3
+	VMOVL.U8	Q0, D4
+	VLD1.64	{D7}, [r2], r3
+	VMOVL.U8	Q2, D5
+	VHADD.U8	Q3, Q1, Q3
+	VQADD.S16	Q12,Q12,Q0
+	VQADD.S16	Q13,Q13,Q2
+	VMOVL.U8	Q1, D6
+	VMOVL.U8	Q3, D7
+	VQADD.S16	Q14,Q14,Q1
+	VQADD.S16	Q15,Q15,Q3
+	VQMOVUN.S16	D16,Q8
+	VQMOVUN.S16	D17,Q9
+	VQMOVUN.S16	D18,Q10
+	VST1.64	{D16},[r0,:64], r3
+	VQMOVUN.S16	D19,Q11
+	VST1.64	{D17},[r0,:64], r3
+	VQMOVUN.S16	D20,Q12
+	VST1.64	{D18},[r0,:64], r3
+	VQMOVUN.S16	D21,Q13
+	VST1.64	{D19},[r0,:64], r3
+	VQMOVUN.S16	D22,Q14
+	VST1.64	{D20},[r0,:64], r3
+	VQMOVUN.S16	D23,Q15
+	VST1.64	{D21},[r0,:64], r3
+	VST1.64	{D22},[r0,:64], r3
+	VST1.64	{D23},[r0,:64], r3
+	MOV	PC,R14
+	@ .size oc_frag_recon_inter2_neon, .-oc_frag_recon_inter2_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
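
Every reconstruction routine in this file applies the same per-pixel rule to an 8x8 fragment: take the 16-bit residual, add either a 128 bias (intra), a predictor byte (inter), or the truncating average of two predictor bytes (inter2), and saturate the result to an unsigned byte. That is what the ADDS/CMPGT/EORLT sequences and the NEON VQADD/VQMOVUN pairs compute. A rough C restatement of the intra case, a sketch only, with short standing in for ogg_int16_t:

    /*Sketch of oc_frag_recon_intra_*: residual plus a 128 bias, clamped to
       [0,255], written out one row at a time with the given stride.*/
    static void frag_recon_intra_sketch(unsigned char *_dst,int _ystride,
     const short _residue[64]){
      int i;
      int j;
      for(i=0;i<8;i++){
        for(j=0;j<8;j++){
          int v;
          v=_residue[i*8+j]+128;
          _dst[j]=(unsigned char)(v<0?0:v>255?255:v);
        }
        _dst+=_ystride;
      }
    }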

+ 1886 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armidct.asm

@@ -0,0 +1,1886 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global	_oc_idct8x8_1_arm
+	.global	_oc_idct8x8_arm
+
+	@ .type oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
+_oc_idct8x8_1_arm:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r1, r1, r1, LSL #16
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm	@ ENDP
+
+	@ .type oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
+_oc_idct8x8_arm:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_arm
+	CMP	r2, #6
+	BLE	oc_idct8x8_6_arm
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_arm
+oc_idct8x8_slow_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	BL	idct8core_arm
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	SUB	r2, r1, #8*16
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	MOV	r6, #0
+	MOV	r7, #0
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	STMIA	r2!,{r4,r5,r6,r7}
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	BL	idct8core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_arm, .-oc_idct8x8_arm	@ ENDP
+
+	@ .type oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
+oc_idct8x8_10_arm:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct4core_arm
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+oc_idct8x8_10_arm_cols:
+@ Column transforms
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	BL	idct4core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm	@ ENDP
+
+	@ .type oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
+oc_idct8x8_6_arm:
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct3core_arm
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	BL	idct3core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	@ .size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm	@ ENDP
+
+	@ .type oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
+oc_idct8x8_3_arm:
+	STMFD	r13!,{r4-r7,r9-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r2, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2core_arm
+	BL	idct1core_arm
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r2		@ Write to the final destination
+@ Column transforms
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	BL	idct2core_down_arm
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	@ .size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm	@ ENDP
+
+	@ .type idct1core_arm, %function; idct1core_arm: @ PROC
+idct1core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #14]
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #46]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #78]
+	STRH	r3, [r0, #94]
+	STRH	r3, [r0, #110]
+	MOV	PC,R14
+	@ .size idct1core_arm, .-idct1core_arm	@ ENDP
+
+	@ .type idct2core_arm, %function; idct2core_arm: @ PROC
+idct2core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r12,[r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r3, [r0, #46]		@ y[3] = t[0]+t[4]
+	RSB	r3, r3, r9, LSL #1	@ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+	RSB	r10,r10,r9, LSL #1	@ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+	RSB	r12,r12,r9, LSL #1	@ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+	RSB	r11,r11,r9, LSL #1	@ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2core_arm, .-idct2core_arm	@ ENDP
+
+	@ .type idct2core_down_arm, %function; idct2core_down_arm: @ PROC
+idct2core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	LDR	r3, OC_C7S1
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r10,OC_C1S7
+	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MOV	r3, r3, ASR #16		@ r3 = t[4]
+	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	MOV	r10,r10,ASR #16		@ r10= t[5]
+	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]+8
+	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+	ADD	r3, r3, r9		@ r3 = t[0]+t[4]+8
+	ADD	r11,r11,r9		@ r11= t[0]+t[7]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r4, r11,ASR #4
+	MOV	r5, r12,ASR #4
+	MOV	r6, r10,ASR #4
+	MOV	r7, r3, ASR #4
+	RSB	r3, r3, r9, LSL #1	@r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+	RSB	r10,r10,r9, LSL #1	@r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+	RSB	r12,r12,r9, LSL #1	@r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+	RSB	r11,r11,r9, LSL #1	@r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+	MOV	r3, r3, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r11,r11,ASR #4
+	STRH	r4, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[0]+t[6]
+	STRH	r6, [r0, #30]		@ y[2] = t[0]+t[5]
+	STRH	r7, [r0, #46]		@ y[3] = t[0]+t[4]
+	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
+	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
+	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
+	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2core_down_arm, .-idct2core_down_arm	@ ENDP
+
+	@ .type idct3core_arm, %function; idct3core_arm: @ PROC
+idct3core_arm:
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1] = t[0]+t[2]
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2] = t[0]-t[2]
+					@ r3 = t2[0] = t[0]+t[3]
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3] = t[0]-t[3]
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7]
+	ADD	r5, r10,r5		@ r5 = t[1]+t2[6]
+	ADD	r12,r6, r12		@ r12= t[2]+t2[5]
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r3, LSL #1	@ r11= t2[0] - t[7]
+	RSB	r5, r5, r10,LSL #1	@ r5 = t[1]  - t2[6]
+	RSB	r12,r12,r6, LSL #1	@ r6 = t[2]  - t2[5]
+	RSB	r4, r4, r9, LSL #1	@ r4 = t2[3] - t[4]
+	STRH	r4, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r12,[r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r5, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	@ .size idct3core_arm, .-idct3core_arm	@ ENDP
+
+	@ .type idct3core_down_arm, %function; idct3core_down_arm: @ PROC
+idct3core_down_arm:
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r12,OC_C4S4		@ r12= OC_C4S4
+	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
+	LDR	r10,OC_C6S2		@ r10= OC_C6S2
+	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
+	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r11,[r1, #-14]		@ r11= x[1]
+	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
+	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
+	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]+8
+	MOV	r4, r4, ASR #16		@ r4 = t[4]
+	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
+	MOV	r11,r11,ASR #16		@ r11= t[7]
+	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
+	ADD	r10,r9, r10,ASR #16	@ r10= t[1]+8 = t[0]+t[2]+8
+	RSB	r6, r10,r9, LSL #1	@ r6 = t[2]+8 = t[0]-t[2]+8
+					@ r3 = t2[0]+8 = t[0]+t[3]+8
+	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3]+8 = t[0]-t[3]+8
+	MOV	r12,r12,ASR #16		@ r12= t[6]
+	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
+	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
+	ADD	r11,r3, r11		@ r11= t2[0]+t[7] +8
+	ADD	r5, r10,r5		@ r5 = t[1] +t2[6]+8
+	ADD	r12,r6, r12		@ r12= t[2] +t2[5]+8
+	ADD	r4, r9, r4		@ r4 = t2[3]+t[4] +8
+	RSB	r3, r11,r3, LSL #1	@ r11= t2[0] - t[7]  + 8
+	RSB	r10,r5, r10,LSL #1	@ r5 = t[1]  - t2[6] + 8
+	RSB	r6, r12,r6, LSL #1	@ r6 = t[2]  - t2[5] + 8
+	RSB	r9, r4, r9, LSL #1	@ r4 = t2[3] - t[4]  + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r5, r5, ASR #4
+	MOV	r12,r12,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r3, r3, ASR #4
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r6, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r3, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,R14
+	@ .size idct3core_down_arm, .-idct3core_down_arm	@ ENDP
+
+	@ .type idct4core_arm, %function; idct4core_arm: @ PROC
+idct4core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2]
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2]
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3]
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3]
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r11,r5, r11		@ r11= t[0]+t2[7]
+	ADD	r6, r4, r6		@ r6 = t[1]+t3[6]
+	ADD	r3, r10,r3		@ r3 = t[2]+t3[5]
+	ADD	r7, r9, r7		@ r7 = t[3]+t2[4]
+	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r7, [r0, #46]		@ y[3] = t2[3]+t[4]
+	RSB	r11,r11,r5, LSL #1	@ r11= t[0]-t2[7]
+	RSB	r6, r6, r4, LSL #1	@ r6 = t[1]-t3[6]
+	RSB	r3, r3, r10,LSL #1	@ r3 = t[2]-t3[5]
+	RSB	r7, r7, r9, LSL #1	@ r7 = t[3]-t2[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11, [r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4core_arm, .-idct4core_arm	@ ENDP
+
+	@ .type idct4core_down_arm, %function; idct4core_down_arm: @ PROC
+idct4core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r9, [r1], #16		@ r9 = x[0]
+	LDR	r10,OC_C4S4		@ r10= OC_C4S4
+	LDRSH	r12,[r1, #-12]		@ r12= x[2]
+	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
+	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
+	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
+	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
+	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
+	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
+	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
+	LDR	r12,OC_C1S7		@ r12= OC_C1S7
+	LDRSH	r11,[r1, #-10]		@ r11= x[3]
+	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
+	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
+	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
+	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
+	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
+	MOV	r9, r9, ASR #16		@ r9 = t[0]
+	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
+	MOV	r6, r6, ASR #16		@ r6 = t[4]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
+	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+	MOV	r3, r3, ASR #16		@ r3 = t[7]
+	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
+	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
+	ADD	r9, r9, #8		@ r9 = t[0]+8
+	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2] + 8
+	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2] + 8
+	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3] + 8
+	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3] + 8
+	MOV	r3, r3, ASR #16		@ r3 = t2[6]
+	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
+	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
+	ADD	r5, r5, r11		@ r5 = t[0]+t2[7]+8
+	ADD	r4, r4, r6		@ r4 = t[1]+t3[6]+8
+	ADD	r10,r10,r3		@ r10= t[2]+t3[5]+8
+	ADD	r9, r9, r7		@ r9 = t[3]+t2[4]+8
+	SUB	r11,r5, r11,LSL #1	@ r11= t[0]-t2[7]+8
+	SUB	r6, r4, r6, LSL #1	@ r6 = t[1]-t3[6]+8
+	SUB	r3, r10,r3, LSL #1	@ r3 = t[2]-t3[5]+8
+	SUB	r7, r9, r7, LSL #1	@ r7 = t[3]-t2[4]+8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r11,r11,ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r7, r7, ASR #4
+	MOV	r9, r9, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r5, r5, ASR #4
+	STRH	r5,[r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r4, [r0, #14]		@ y[1] = t[1]+t2[6]
+	STRH	r10,[r0, #30]		@ y[2] = t[2]+t2[5]
+	STRH	r9, [r0, #46]		@ y[3] = t2[3]+t[4]
+	STRH	r7, [r0, #62]		@ y[4] = t2[3]-t[4]
+	STRH	r3, [r0, #78]		@ y[5] = t[2]-t2[5]
+	STRH	r6, [r0, #94]		@ y[6] = t[1]-t2[6]
+	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4core_down_arm, .-idct4core_down_arm	@ ENDP
+
+	@ .type idct8core_arm, %function; idct8core_arm: @ PROC
+idct8core_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ Stage 2
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3]
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3]
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2]
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2]
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7]
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6]
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5]
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4]
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7]
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6]
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5]
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4]
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	@ .size idct8core_arm, .-idct8core_arm	@ ENDP
+
+	@ .type idct8core_down_arm, %function; idct8core_down_arm: @ PROC
+idct8core_down_arm:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r2, [r1],#16		@ r2 = x[0]
+	STMFD	r13!,{r1,r14}
+	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
+	LDR	r12,OC_C4S4		@ r12= C4S4
+	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
+	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
+	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
+	@ For spec compliance, these sums must be truncated to 16-bit precision
+	@ _before_ the multiply (not after).
+	@ Sadly, ARMv4 provides no simple way to do that.
+	MOV	r2, r2, LSL #16
+	MOV	r6, r6, LSL #16
+	MOV	r2, r2, ASR #16
+	MOV	r6, r6, ASR #16
+	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
+	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
+	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+	LDR	r14,OC_C2S6		@ r14= OC_C2S6
+	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
+	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
+	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
+	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
+	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
+	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
+	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
+	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
+	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+	LDRSH	r14,[r1, #-14]		@ r14= x[1]
+	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
+	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
+	LDRSH	r10,[r1, #-6]		@ r10= x[5]
+	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
+	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
+	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
+	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
+	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
+	LDRSH	r1, [r1, #-10]		@ r1 = x[3]
+	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
+	LDR	r11,OC_C5S3		@ r11= OC_C5S3
+	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
+	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
+	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
+	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
+	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
+	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
+	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+	@ r10=t[6] r12=C4S4 r14=t[5]
+	@ Stage 2
+@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+@ before multiplying, not after (this is not equivalent)
+	@ 4-5 butterfly
+	ADD	r9, r9, r14		@ r9 = t2[4]     =       t[4]+t[5]
+	SUB	r14,r9, r14, LSL #1	@ r14=                   t[4]-t[5]
+	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+	@ 7-6 butterfly
+	ADD	r8, r8, r10		@ r8 = t2[7]     =       t[7]+t[6]
+	SUB	r10,r8, r10, LSL #1	@ r10=                   t[7]-t[6]
+	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+	@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+	@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+	@ Stage 3
+	ADD	r2, r2, #8<<16		@ r2 = t[0]+8<<16
+	ADD	r6, r6, #8<<16		@ r6 = t[1]+8<<16
+	@ 0-3 butterfly
+	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3] + 8
+	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3] + 8
+	@ 1-2 butterfly
+	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2] + 8
+	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2] + 8
+	@ 6-5 butterfly
+	MOV	r14,r14,ASR #16		@ r14= t2[5]
+	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
+	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
+	@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+	@ r10=t3[6] r14=t3[5]
+	@ Stage 4
+	ADD	r2, r2, r8		@ r2 = t[0] + t[7] + 8
+	ADD	r6, r6, r10		@ r6 = t[1] + t[6] + 8
+	ADD	r3, r3, r14		@ r3 = t[2] + t[5] + 8
+	ADD	r4, r4, r9		@ r4 = t[3] + t[4] + 8
+	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7] + 8
+	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6] + 8
+	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5] + 8
+	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4] + 8
+	@ TODO: This is wrong.
+	@ The C code truncates to 16 bits by storing to RAM and doing the
+	@  shifts later; we've got an extra 4 bits here.
+	MOV	r2, r2, ASR #4
+	MOV	r6, r6, ASR #4
+	MOV	r3, r3, ASR #4
+	MOV	r4, r4, ASR #4
+	MOV	r8, r8, ASR #4
+	MOV	r10,r10,ASR #4
+	MOV	r14,r14,ASR #4
+	MOV	r9, r9, ASR #4
+	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
+	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
+	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
+	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
+	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
+	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
+	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
+	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
+	LDMFD	r13!,{r1,PC}
+	@ .size idct8core_down_arm, .-idct8core_down_arm	@ ENDP
+
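
The truncation that the TODOs and the "spec compliance" comments above keep referring to amounts, in C terms, to casting a butterfly sum back to 16 bits before it is scaled by one of the OC_C* constants, which is exactly what the LSL #16/ASR #16 pair does for x[0]+/-x[4]. A tiny standalone illustration of why the order matters; OC_C4S4 = 0xB505 = 46341 is taken from idct1core_arm above, the inputs are made up so the difference overflows 16 bits, and >>16 on a negative value assumes an arithmetic shift, as ASR performs:

    #include <stdio.h>

    #define OC_C4S4 (46341)

    int main(void){
      int t7=20000;
      int t6=-20000;                          /*t7-t6 = 40000, > 16 bits*/
      int after =OC_C4S4*(t7-t6)>>16;         /*32-bit intermediate (the TODO)*/
      int before=OC_C4S4*(short)(t7-t6)>>16;  /*truncated first, per the spec*/
      printf("truncate after: %d, truncate before: %d\n",after,before);
      return 0;
    }
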
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_idct8x8_1_v6
+	.global	_oc_idct8x8_v6
+
+	@ .type oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
+_oc_idct8x8_1_v6:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	ORR	r2, r1, r1, LSL #16
+	ORR	r3, r1, r1, LSL #16
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	STRD	r2, r3, [r0], #8
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6	@ ENDP
+
+	@ .type oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
+_oc_idct8x8_v6:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #3
+	BLE	oc_idct8x8_3_v6
+	@CMP	r2, #6
+	@BLE	oc_idct8x8_6_v6
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_v6
+oc_idct8x8_slow_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	STR	r0, [r13,#-4]!
+	ADD	r0, r13, #4	@ Write to temp storage.
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	BL	idct8_8core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-8*16]!
+	STRD	r4, r5, [r1,#8]
+	STRD	r4, r5, [r1,#16]
+	STRD	r4, r5, [r1,#24]
+	STRD	r4, r5, [r1,#32]
+	STRD	r4, r5, [r1,#40]
+	STRD	r4, r5, [r1,#48]
+	STRD	r4, r5, [r1,#56]
+	STRD	r4, r5, [r1,#64]
+	STRD	r4, r5, [r1,#72]
+	STRD	r4, r5, [r1,#80]
+	STRD	r4, r5, [r1,#88]
+	STRD	r4, r5, [r1,#96]
+	STRD	r4, r5, [r1,#104]
+	STRD	r4, r5, [r1,#112]
+	STRD	r4, r5, [r1,#120]
+	MOV	r1, r13		@ And read from temp storage.
+@ Column transforms
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	BL	idct8_8core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_v6, .-oc_idct8x8_v6	@ ENDP
+
+	@ .type oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
+oc_idct8x8_10_v6:
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4	@ Align the stack.
+	ADD	r0, r0, r2	@ Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4	@ Write to the final destination.
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-4*16]!
+	STRD	r4, r5, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	@ Align the stack.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+	@ .size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6	@ ENDP
+
+	@ .type oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
+oc_idct8x8_3_v6:
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+@ Row transforms
+	MOV	r8, r0
+	MOV	r0, r13		@ Write to temp storage.
+	BL	idct2_1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
+	MOV	r1, r13		@ Read from temp storage.
+	MOV	r0, r8		@ Write to the final destination.
+@ Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+	@ .size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6	@ ENDP
+
+	@ .type idct2_1core_v6, %function; idct2_1core_v6: @ PROC
+idct2_1core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		@ r6 = x[1,0]
+	SMULWB	r12,r3, r2		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, r5, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		@ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		@ r7 = <0|t[0,7]>
+@ Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	@ r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		@ r4 = <0|t[0,4]>
+	SADDSUBX	r5, r5, r5		@ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+@ Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	@ r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		@ r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		@ r3 = t[0]+t[7]
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		@ r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		@ r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		@ r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct2_1core_v6, .-idct2_1core_v6	@ ENDP
+  .endif
+
+	.balign 8
+OC_C7S1:
+	.word	12785 @ 31F1
+OC_C1S7:
+	.word	64277 @ FB15
+OC_C6S2:
+	.word	25080 @ 61F8
+OC_C2S6:
+	.word	60547 @ EC83
+OC_C5S3:
+	.word	36410 @ 8E3A
+OC_C3S5:
+	.word	54491 @ D4DB
+OC_C4S4:
+	.word	46341 @ B505
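+@ (These appear to be the usual Theora IDCT cosine constants, i.e. roughly
+@  round(65536*cos(k*pi/16)) for k=1..7, with OC_C4S4 = round(65536/sqrt(2)).)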
+
+  .if OC_ARM_ASM_MEDIA
+	@ .type idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
+idct2_2core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDR	r2, [r1], #16		@ r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			@ r7  = 8
+	LDR	r6, [r1], #16		@ r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, r5, OC_C7S1		@ r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		@ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT  r5, r5, r2		@ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	@ r7 = <t[0,7]|t[0,7]>
+@ Stage 2:
+	SMULWB	r6, r3, r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	@ r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		@ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		@ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	@ r2 = <t[1,5]|t[0,5]>
+@ Stage 3:
+	SSUB16	r5, r6, r2		@ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		@ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+@ Stage 4:
+	SADD16	r2, r12,r7		@ r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		@ r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		@ y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		@ r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		@ y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		@ r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		@ y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		@ r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	@ r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		@ y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		@ r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	@ r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		@ y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		@ r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	@ r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		@ y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		@ r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	@ r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	@ .size idct2_2core_down_v6, .-idct2_2core_down_v6	@ ENDP
+
+@ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+@  pay for increased branch mis-prediction to get here, but in practice it
+@  doesn't seem to slow anything down to take it out, and it's less code this
+@  way.
+  .if 0
+	@ .type oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
+_oc_idct8x8_6_v6:
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+@ Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4	@ Align the stack.
+	ADD	r0, r0, r13	@ Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	@ Clear input data for next block.
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, r5, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	@ Align the stack.
+	MOV	r0, r8		@ Write to the final destination.
+	ADD	r1, r1, r13	@ And read from temp storage.
+@ Column transforms
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	@ .size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6	@ ENDP
+
+	@ .type idct1core_v6, %function; idct1core_v6: @ PROC
+_idct1core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500
+	MUL	r3, r12, r3
+	@ Stall ?
+	MOV	r3, r3, ASR #16
+	@ Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+	@ .size idct1core_v6, .-idct1core_v6	@ ENDP
+
+	@ .type idct3_2core_v6, %function; idct3_2core_v6: @ PROC
+_idct3_2core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r4, r5, [r1], #16		@ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,r11,OC_C6S2_3_v6	@ r10= OC_C6S2; r11= OC_C2S6
+	@ Stall
+	SMULWB	r3, r11,r5		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		@ r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		@ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r10,r7, r5		@ r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		@ r2 = <0|t[0,2]>
+	SMULWT  r7, r7, r4		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		@ r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		@ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_3core_stage3_v6
+	@ .size idct3_2core_v6, .-idct3_2core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_3_v6:
+	.word	12785 @ 31F1
+OC_C1S7_3_v6:
+	.word	64277 @ FB15
+OC_C6S2_3_v6:
+	.word	25080 @ 61F8
+OC_C2S6_3_v6:
+	.word	60547 @ EC83
+
+	@ .type idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
+_idct3_3core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16		@ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, r7, OC_C6S2_3_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		@ r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+@ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	@ r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	@ r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SMULWB	r6, r11,r7		@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
+@ Stage 3:
+	B	idct4_4core_down_stage3_v6
+	@ .size idct3_3core_down_v6, .-idct3_3core_down_v6	@ ENDP
+  .endif
+
+	@ .type idct4_3core_v6, %function; idct4_3core_v6: @ PROC
+idct4_3core_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, r5, [r1], #16		@ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		@ r9 = <0|t[0,6]>
+	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		@ r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		@ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_3core_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]=t[0]+t[3]
+	SSUB16	r3, r12,r3		@ r3 = t[3]=t[0]-t[3]
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]
+	STR	r12,[r0], #4		@ y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]
+	STR	r12,[r0, #12]		@ y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]
+	STR	r12,[r0, #28]		@ y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]
+	STR	r12,[r0, #44]		@ y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		@ y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		@ y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		@ y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		@ y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+	@ .size idct4_3core_v6, .-idct4_3core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_4_v6:
+	.word	12785 @ 31F1
+OC_C1S7_4_v6:
+	.word	64277 @ FB15
+OC_C6S2_4_v6:
+	.word	25080 @ 61F8
+OC_C2S6_4_v6:
+	.word	60547 @ EC83
+OC_C5S3_4_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_4_v6:
+	.word	54491 @ D4DB
+
+	@ .type idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
+idct4_4core_down_v6:
+	@ r0 =       ogg_int16_t *_y (destination)
+	@ r1 = const ogg_int16_t *_x (source)
+@ Stage 1:
+	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, r5, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+@ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	@ r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	@ r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8
+	SMULWB	r5, r6, r5		@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT  r5, r7, r4		@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT  r7, r7, r10		@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+@ Stage 2:
+	SSUB16	r6, r7, r9		@ r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		@ r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		@ r5 = t[4]-t[5]
+	SMULWB	r6, r11,r6		@ r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		@ r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		@ r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		@ r5 = t[0,5]=OC_C4S4*r5B>>16
+@ Stage 3:
+idct4_4core_down_stage3_v6:
+	SADD16	r11,r12,r2		@ r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		@ r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+	@ .size idct4_4core_down_v6, .-idct4_4core_down_v6	@ ENDP
+
+	@ .type idct8_8core_v6, %function; idct8_8core_v6: @ PROC
+idct8_8core_v6:
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,r11,OC_C5S3_4_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,r11,OC_C6S2_4_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		@ r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		@ r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		@ r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		@ r8 = t[1,1]=OC_C4S4*r4T>>16
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+	@ .size idct8_8core_v6, .-idct8_8core_v6	@ ENDP
+
+@ Another copy so the LDRD offsets are less than +/- 255.
+	.balign 8
+OC_C7S1_8_v6:
+	.word	12785 @ 31F1
+OC_C1S7_8_v6:
+	.word	64277 @ FB15
+OC_C6S2_8_v6:
+	.word	25080 @ 61F8
+OC_C2S6_8_v6:
+	.word	60547 @ EC83
+OC_C5S3_8_v6:
+	.word	36410 @ 8E3A
+OC_C3S5_8_v6:
+	.word	54491 @ D4DB
+
+	@ .type idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
+idct8_8core_down_v6:
+	STMFD	r13!,{r0,r14}
+@ Stage 1:
+	@5-6 rotation by 3pi/16
+	LDRD	r10,r11,OC_C5S3_8_v6	@ r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		@ r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		@ r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		@ r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		@ r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		@ r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		@ r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		@ r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		@ r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		@ r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		@ r8 = OC_C5S3*x[1,3]>>16
+	@2-3 rotation by 6pi/16
+	LDRD	r10,r11,OC_C6S2_8_v6	@ r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
+	LDR	r8, [r1,#12]		@ r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		@ r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		@ r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		@ r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		@ r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		@ r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		@ r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
+	SMLAWB	r3, r11,r0, r3		@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		@ r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		@ r12= OC_C2S6*x[1,6]>>16
+	@4-7 rotation by 7pi/16
+	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
+	LDR	r0, [r1],#16		@ r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		@ r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		@ r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		@ r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		@ r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		@ r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		@ r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		@ r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		@ r12= OC_C1S7*x[1,7]>>16
+	@0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8
+	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
+	SADD16	r7, r0, r4		@ r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		@ r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		@ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		@ r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		@ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		@ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		@ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+@ Stage 2:
+	SADD16	r4, r10,r5		@ r4 = t[4]=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		@ r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		@ r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		@ r7 = t[7]=t[7]+t[6]
+	SMULWT	r5, r11,r5		@ r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		@ r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		@ r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		@ r6 = t[1,6]=OC_C4S4*r6T>>16
+@ Stage 3:
+	SADD16	r11,r8, r2		@ r11= t[1]+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		@ r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6:
+	SSUB16	r5, r6, r10		@ r5 = t[5]=t[6]-t[5]
+	SADD16	r6, r6, r10		@ r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		@ r10= t[0]+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		@ r3 = t[3]+8=t[0]-t[3]+8
+@ Stage 4:
+	SADD16	r12,r10,r7		@ r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		@ r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		@ y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		@ r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		@ r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		@ y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		@ r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		@ r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		@ y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		@ r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		@ r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	@ r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		@ y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	@ r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		@ y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	@ r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		@ y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	@ r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		@ y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	@ r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		@ y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+	@ .size idct8_8core_down_v6, .-idct8_8core_down_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_idct8x8_1_neon
+	.global	_oc_idct8x8_neon
+
+	.balign 16
+OC_IDCT_CONSTS_NEON:
+	.short	    8
+	.short	64277 @ FB15 (C1S7)
+	.short	60547 @ EC83 (C2S6)
+	.short	54491 @ D4DB (C3S5)
+	.short	46341 @ B505 (C4S4)
+	.short	36410 @ 8E3A (C5S3)
+	.short	25080 @ 61F8 (C6S2)
+	.short	12785 @ 31F1 (C7S1)
+
+	@ .type oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
+_oc_idct8x8_1_neon:
+	@ r0 = ogg_int16_t  *_y
+	@ r1 = ogg_uint16_t  _dc
+	VDUP.S16	Q0, r1
+	VMOV		Q1, Q0
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]!
+	VST1.64		{D0, D1, D2, D3}, [r0,:128]
+	MOV	PC, r14
+	@ .size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon	@ ENDP
+
+	@ .type oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
+_oc_idct8x8_neon:
+	@ r0 = ogg_int16_t *_y
+	@ r1 = ogg_int16_t *_x
+	@ r2 = int          _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon:
+	VPUSH		{D8-D15}
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	@ Row transforms (input is pre-transposed)
+	VLD1.64		{D16,D17,D18,D19}, [r2,:128]!
+	VLD1.64		{D20,D21,D22,D23}, [r2,:128]!
+	VLD1.64		{D24,D25,D26,D27}, [r2,:128]!
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VLD1.64		{D28,D29,D30,D31}, [r2,:128]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VLD1.64		{D0,D1},           [r3,:128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VTRN.16		Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	@ 8x8 Transpose
+	VTRN.16		Q8, Q9
+	VTRN.16		Q10,Q11
+	VTRN.16		Q12,Q13
+	VTRN.32		Q8, Q10
+	VTRN.32		Q9, Q11
+	VTRN.32		Q12,Q14
+	VTRN.32		Q13,Q15
+	VSWP		D17,D24
+	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
+	VSWP		D19,D26
+	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
+	VSWP		D21,D28
+	VSWP		D23,D30
+	@ Column transforms
+	BL	oc_idct8x8_stage123_neon
+	@ We have to put the return address back in the LR, or the branch
+	@  predictor will not recognize the function return and mis-predict the
+	@  entire call stack.
+	MOV	r14, r12
+@ Stage 4
+	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
+	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
+	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
+	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
+	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
+	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
+	VMOV.I8		Q2,#0
+	VPOP		{D8-D15}
+	VMOV.I8		Q3,#0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]!
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D4, D5, D6, D7}, [r1,:128]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	@ .size oc_idct8x8_neon, .-oc_idct8x8_neon	@ ENDP
+
+	@ .type oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
+oc_idct8x8_stage123_neon:
+@ Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	@ Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	@ Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	@ Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	@ Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	@ Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	@ Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	@ Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	@ Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	@ Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	@ Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	@ Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	@ Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	@ Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	@ Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	@ Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	@ Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	@ Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	@ Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	@ Q4 = t[4]=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	@ Q7 = t[7]=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	@ Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	@ Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	@ Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	@ Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	@ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	@ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
+@ Stage 3
+	VSUB.S16	Q11,Q8, Q3	@ Q11 = t[3]=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	@ Q8  = t[0]=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	@ Q9  = t[1]=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	@ Q3  = t[6]=t[6]+t[5]
+	VSUB.S16	Q10,Q1, Q2	@ Q10 = t[2]=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	@ Q5  = t[5]=t[6]-t[5]
+	MOV	PC, r14
+	@ .size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon	@ ENDP
+
+	@ .type oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
+oc_idct8x8_10_neon:
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64		{D0,D1},          [r3,:128]
+	MOV	r2, r1
+	@ Row transforms (input is pre-transposed)
+@ Stage 1
+	VLD1.64		{D16,D17,D18,D19},[r2,:128]!
+	MOV	r12, #16
+	VMULL.S16	Q15,D16,D1[0]	@ Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64		{D17},            [r2,:64], r12
+	VMULL.S16	Q2, D18,D0[1]	@ Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64		{D19},            [r2,:64]
+	VMULL.S16	Q14,D17,D0[2]	@ Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3, D19,D0[3]	@ Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	@ Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	@ Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D17,D1[2]	@ Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	@ D30= t[0]-x[0]
+	VSHRN.S32	D4, Q2, #16	@ D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	@ D31= t[3]-x[2]
+	VSHRN.S32	D6, Q3, #16	@ D6 = t[6]-x[3]
+	VSHRN.S32	D7, Q13,#16	@ D7 = -t[5]-x[3]
+	VSHRN.S32	D5, Q12,#16	@ D5 = t[4]
+	VSHRN.S32	D2, Q1, #16	@ D2 = t[2]
+	VADD.S16	D4, D4, D18	@ D4 = t[7]
+	VADD.S16	D6, D6, D19	@ D6 = t[6]
+	VADD.S16	D7, D7, D19	@ D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	@ D30= t[0]
+					@ D31= t[3]
+@ Stages 2 & 3
+	VSUB.S16	Q12,Q2, Q3	@ D24= t[7]-t[6]
+					@ D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2, Q3	@ D26= t[7]=t[7]+t[6]
+					@ D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	@ Q11= OC_C4S4*(t[7]-t[6])
+					@       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	@ Q14= OC_C4S4*(t[4]-t[5])
+					@       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	@ D16= t[0]=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	@ D17= t[2]=t[0]-t[2]
+	VADD.S16	D18,D30,D2	@ D18= t[1]=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	@ D22= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	@ D23= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	@ D19= t[3]=t[0]-t[3]
+	VADD.S16	D22,D22,D24	@ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	@ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	@ D27= t[5]=t[6]-t[5]
+	VADD.S16	D24,D22,D23	@ D24= t[6]=t[6]+t[5]
+@ Stage 4
+	VSUB.S16	Q11,Q8, Q13	@ D22= y[7]=t[0]-t[7]
+					@ D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9, Q12	@ D20= y[6]=t[1]-t[6]
+					@ D21= y[4]=t[3]'-t[4]''
+	VADD.S16	Q8, Q8, Q13	@ D16= y[0]=t[0]+t[7]
+					@ D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9, Q9, Q12	@ D18= y[1]=t[1]+t[6]
+					@ D19= y[3]=t[3]'+t[4]''
+	@ 8x4 transpose
+	VTRN.16		Q10,Q11		@ Q10= c5c4a5a4 c7c6a7a6
+					@ Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16		Q8, Q9		@ Q8 = c3c2a3a2 c1c0a1a0
+					@ Q9 = d3d2b3b2 d1d0b1b0
+	VSWP		D20,D21		@ Q10= c7c6a7a6 c5c4a5a4
+	VSWP		D22,D23		@ Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32		Q9, Q11		@ Q9 = b7b6b5b4 b3b2b1b0
+					@ Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32		Q8, Q10		@ Q8 = a7a6a5a4 a3a2a1a0
+					@ Q10= c7c6c5c4 c3c2c1c0
+	@ Column transforms
+@ Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	@ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	@ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3, D22,D0[3]
+	VMULL.S16	Q2, D23,D0[3]	@  Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	@ Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	@ Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3, #16
+	VSHRN.S32	D29,Q2, #16	@ Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	@ Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	@ Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	@ Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2, D19,D1[3]	@  Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1, D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3, D20,D0[2]
+	VMULL.S16	Q9, D21,D0[2]	@  Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2, #16	@ Q12= t[4]
+	VMULL.S16	Q2, D20,D1[2]
+	VSHRN.S32	D2, Q1, #16
+	VSHRN.S32	D3, Q11,#16	@ Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	@  Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q9, #16	@ Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q15,Q14	@ Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	@ Q15= t[7]=t[7]+t[6]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q11,#16	@ Q2 = t[2]
+	VADD.S16	Q1, Q1, Q8	@ Q1 = t[0]
+	VADD.S16	Q8, Q12,Q13	@ Q8 = t[4]-t[5]
+	VADD.S16	Q3, Q3, Q10	@ Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	@ Q11:Q10= OC_C4S4*(t[4]-t[5])
+					@           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	@ Q12= t[4]=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	@ Q13:Q14= OC_C4S4*(t[7]-t[6])
+					@           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	@ Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					@       -(t[4]-t[5])
+	VADD.S16	Q11,Q1, Q3	@ Q11= t[0]=t[0]+t[3]
+	VSUB.S16	Q3, Q1, Q3	@ Q3 = t[3]=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	@ Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					@       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	@ Q10=t[5]
+	VADD.S16	Q14,Q14,Q9	@ Q14=t[6]
+	VSUB.S16	Q13,Q14,Q10	@ Q13=t[5]=t[6]-t[5]
+	VADD.S16	Q14,Q14,Q10	@ Q14=t[6]=t[6]+t[5]
+	VADD.S16	Q10,Q1, Q2	@ Q10= t[1]=t[0]+t[2]
+	VSUB.S16	Q2, Q1, Q2	@ Q2 = t[2]=t[0]-t[2]
+@ Stage 4
+	VADD.S16	Q8, Q11,Q15	@ Q8  = y[0]=t[0]+t[7]
+	VADD.S16	Q9, Q10,Q14	@ Q9  = y[1]=t[1]+t[6]
+	VSUB.S16	Q15,Q11,Q15	@ Q15 = y[7]=t[0]-t[7]
+	VSUB.S16	Q14,Q10,Q14	@ Q14 = y[6]=t[1]-t[6]
+	VADD.S16	Q10,Q2, Q13	@ Q10 = y[2]=t[2]+t[5]
+	VADD.S16	Q11,Q3, Q12	@ Q11 = y[3]=t[3]+t[4]
+	VSUB.S16	Q12,Q3, Q12	@ Q12 = y[4]=t[3]-t[4]
+	VSUB.S16	Q13,Q2, Q13	@ Q13 = y[5]=t[2]-t[5]
+	VMOV.I8		D2, #0
+	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
+	VST1.64		{D2}, [r1,:64], r12
+	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
+	VST1.64		{D2}, [r1,:64]
+	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
+	VSTMIA		r0, {D16-D31}
+	MOV	PC, r14
+	@ .size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif

+ 126 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armint.h

@@ -0,0 +1,126 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+#  if defined(__ARMEB__)
+#   error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+#  endif
+
+#  define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+#  define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+  ((oc_loop_filter_frag_rows_arm_func) \
+   (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+   (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+   (_bv), \
+   (_state)->frags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset \
+   +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+   (_state)->fplanes[(_pli)].froffset, \
+   (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+   (_state)->frag_buf_offs, \
+   (_state)->fplanes[(_pli)].nhfrags)
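+/*E.g. (illustrative only): a call site written as
+   oc_state_loop_filter_frag_rows(_state,bv,refi,pli,fragy0,fragy_end);
+  therefore jumps straight into the asm routine, with the row range already
+  converted to fragment indices by the macro above.*/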
+/*For everything else the default vtable macros are fine.*/
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#  if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+#   if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+#    if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

+ 691 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armloop.asm

@@ -0,0 +1,691 @@
+#ifdef OC_ARM_ASM
+@********************************************************************
+@*                                                                  *
+@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+@*                                                                  *
+@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+@*                                                                  *
+@********************************************************************
+@ Original implementation:
+@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+@ last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
+@********************************************************************
+
+    .text;   .p2align 2
+
+	.global	_oc_loop_filter_frag_rows_arm
+
+@ Which bit this is depends on the order of packing within a bitfield.
+@ Hopefully that doesn't change among any of the relevant compilers.
+ .set OC_FRAG_CODED_FLAG,	1
+
+	@ Vanilla ARM v4 version
+	@ .type loop_filter_h_arm, %function; loop_filter_h_arm: @ PROC
+loop_filter_h_arm:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfh_arm_lp:
+	LDRB	r3, [r0, #-2]		@ r3 = _pix[0]
+	LDRB	r12,[r0, #1]		@ r12= _pix[3]
+	LDRB	r4, [r0, #-1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3
+	LDMFD	r13!,{r3-r6,PC}
+	@ @ .size loop_filter_h_arm, .-loop_filter_h_arm	@ ENDP
+
+	@ .type loop_filter_v_arm, %function; loop_filter_v_arm: @ PROC
+loop_filter_v_arm:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8
+	MOV	r6, #255
+lfv_arm_lp:
+	LDRB	r3, [r0, -r1, LSL #1]	@ r3 = _pix[0]
+	LDRB	r12,[r0, r1]		@ r12= _pix[3]
+	LDRB	r4, [r0, -r1]		@ r4 = _pix[1]
+	LDRB	r5, [r0]		@ r5 = _pix[2]
+	SUB	r3, r3, r12		@ r3 = _pix[0]-_pix[3]+4
+	ADD	r3, r3, #4
+	SUB	r12,r5, r4		@ r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	@ r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3	@ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3
+	LDRSB	r12,[r2, r12]
+	@ Stall (2 on Xscale)
+	ADDS	r4, r4, r12
+	CMPGT	r6, r4
+	EORLT	r4, r6, r4, ASR #32
+	SUBS	r5, r5, r12
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8
+	LDMFD	r13!,{r3-r6,PC}
+	@ @ .size loop_filter_v_arm, .-loop_filter_v_arm	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_arm, %function; oc_loop_filter_frag_rows_arm: @ PROC
+_oc_loop_filter_frag_rows_arm:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	@ _bv += 127
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_arm_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_arm, .-oc_loop_filter_frag_rows_arm	@ ENDP
+
+  .if OC_ARM_ASM_MEDIA
+	.global	_oc_loop_filter_init_v6
+	.global	_oc_loop_filter_frag_rows_v6
+
+	@ .type oc_loop_filter_init_v6, %function; oc_loop_filter_init_v6: @ PROC
+_oc_loop_filter_init_v6:
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		@ r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		@ r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	@ r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	@ r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+	@ @ .size oc_loop_filter_init_v6, .-oc_loop_filter_init_v6	@ ENDP
+
+@ We could use the same strategy as the v filter below, but that would require
+@  40 instructions to load the data and transpose it into columns and another
+@  32 to write out the results at the end, plus the 52 instructions to do the
+@  filtering itself.
+@ This is slightly less, and less code, even assuming we could have shared the
+@  52 instructions in the middle with the other function.
+@ It executes slightly fewer instructions than the ARMv6 approach David Conrad
+@  proposed for FFmpeg, but not by much:
+@  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+@ His is a lot less code, though, because it only does two rows at once instead
+@  of four.
+	@ .type loop_filter_h_v6, %function; loop_filter_h_v6: @ PROC
+loop_filter_h_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	MOV	r12, 0x0003
+	MOVT	r12, 0x1
+	BL loop_filter_h_core_v6
+	ADD	r0, r0, r1, LSL #2
+	BL loop_filter_h_core_v6
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+	@ @ .size loop_filter_h_v6, .-loop_filter_h_v6	@ ENDP
+
+	@ .type loop_filter_h_core_v6, %function; loop_filter_h_core_v6: @ PROC
+loop_filter_h_core_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ r12= 0x10003
+	@ Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		@ r4 = <p3|p2|p1|p0>
+	@ Single issue
+	LDR	r5,[r0, r1]!		@ r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		@ r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		@ r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	@ r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	@ r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		@ r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		@ r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		@ r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		@ r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	@ r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		@ r6 = <r0|r2>
+	UXTB16	r11,r11			@ r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		@ r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		@ r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	@ r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		@ r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		@ r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		@ r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		@ r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		@ r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	@ r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		@ r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	@ r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	@ r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	@ r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			@ r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	@ r6 = <-R_s|-R_q|-R_r|-R_p>
+	@ Single issue
+	@ There's no min, max or abs instruction.
+	@ SSUB8 and SEL will work for abs, and we can do all the rest with
+	@  unsigned saturated adds, which means the GE flags are still all
+	@  set when we're done computing lflim(abs(R_i),L).
+	@ This allows us to both add and subtract, and split the results by
+	@  the original sign of R_i.
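+@ (For reference, the limiter computed here reduces to
+@   lflim(R,L) = sign(R)*min(abs(R),max(2*L-abs(R),0)),
+@  i.e. R itself for abs(R)<=L, tapering as 2*L-abs(R), and 0 once abs(R)>=2*L.)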
+	SSUB8	r7, r10,r6
+	@ Single issue
+	SEL	r7, r7, r6		@ r7 = abs(R_i)
+	@ Single issue
+	UQADD8	r4, r7, r2		@ r4 = 255-max(2*L-abs(R_i),0)
+	@ Single issue
+	UQADD8	r7, r7, r4
+	@ Single issue
+	UQSUB8	r7, r7, r4		@ r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	@ Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		@ r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		@ r9 = p2-lflim(R_i,L)
+	MOV	r5, r9, LSR #24		@ r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		@ r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		@ r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		@ r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		@ r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		@ r4 = q1
+	STRB	r4, [r0,#-1]
+	@ Single issue
+	STRB	r9, [r0,-r1]!
+	@ Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+	@ @ .size loop_filter_h_core_v6, .-loop_filter_h_core_v6	@ ENDP
+
+@ This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+@  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+@ This works just as well, with the following procedure for computing the
+@  filter value, f:
+@   u = ~UHADD8(p1,~p2);
+@   v = UHADD8(~p1,p2);
+@   m = v-u;
+@   a = m^UHADD8(m^p0,m^~p3);
+@   f = UHADD8(UHADD8(a,u),v);
+@  where f = 127+R, with R in [-127,128] defined as in the spec.
+@ This is exactly the same amount of arithmetic as the version that uses PAVGB
+@  as the basic operator.
+@ It executes about 2/3 the number of instructions of David Conrad's approach,
+@  but requires more code, because it does all eight columns at once, instead
+@  of four at a time.
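The recipe above can be spot-checked on a single byte lane in plain C. The sketch below is illustrative only: havg() stands in for one lane of UHADD8 (a truncating average), cpl() for a byte-wide complement, and the reference value 127+R uses the formula quoted above; function names and the random sampling are assumptions of this sketch, not part of the library.

#include <stdio.h>
#include <stdlib.h>

/* One byte lane of UHADD8: truncating average (a+b)>>1. */
static unsigned havg(unsigned a,unsigned b){return (a+b)>>1;}
/* One byte lane of a bitwise complement. */
static unsigned cpl(unsigned a){return a^0xFF;}

/* The branch-free recipe from the comment above, on one lane. */
static unsigned filter_avg(unsigned p0,unsigned p1,unsigned p2,unsigned p3){
  unsigned u,v,m,a;
  u=cpl(havg(p1,cpl(p2)));
  v=havg(cpl(p1),p2);
  m=(v-u)&0xFF;
  a=m^havg(m^p0,m^cpl(p3));
  return havg(havg(a,u),v);
}

/* 127+R, with R as defined in the spec (arithmetic right shift assumed). */
static int filter_ref(int p0,int p1,int p2,int p3){
  return 127+((p0-p3+3*(p2-p1)+4)>>3);
}

int main(void){
  int i;
  srand(42);
  for(i=0;i<(1<<24);i++){
    unsigned p0=rand()&0xFF,p1=rand()&0xFF,p2=rand()&0xFF,p3=rand()&0xFF;
    if((int)filter_avg(p0,p1,p2,p3)!=filter_ref(p0,p1,p2,p3)){
      printf("mismatch: %u %u %u %u\n",p0,p1,p2,p3);
      return 1;
    }
  }
  printf("recipe matched 127+R on all sampled pixel quadruples\n");
  return 0;
}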
+	@ .type loop_filter_v_v6, %function; loop_filter_v_v6: @ PROC
+loop_filter_v_v6:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int            _ll
+	@ preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, r7, [r0, -r1]!		@ r7, r6 = <p5|p1>
+	LDRD	r4, r5, [r0, -r1]		@ r5, r4 = <p4|p0>
+	LDRD	r8, r9, [r0, r1]!		@ r9, r8 = <p6|p2>
+	MVN	r14,r6			@ r14= ~p1
+	LDRD	r10,r11,[r0, r1]		@ r11,r10= <p7|p3>
+	@ Filter the first four columns.
+	MVN	r12,r8			@ r12= ~p2
+	UHADD8	r14,r14,r8		@ r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		@ r12= p1+~p2>>1
+	MVN	r10, r10		@ r10=~p3
+	MVN	r12,r12			@ r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		@ r14= m1=v1-u1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = m1^p0
+	EOR	r10,r10,r14		@ r10= m1^~p3
+	UHADD8	r4, r4, r10		@ r4 = (m1^p0)+(m1^~p3)>>1
+	@ Single issue
+	EOR	r4, r4, r14		@ r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		@ r14= v1=m1+u1
+	UHADD8	r4, r4, r12		@ r4 = a1+u1>>1
+	MVN	r12,r9			@ r12= ~p6
+	UHADD8	r4, r4, r14		@ r4 = f1=(a1+u1>>1)+v1>>1
+	@ Filter the second four columns.
+	MVN	r14,r7			@ r14= ~p5
+	UHADD8	r12,r12,r7		@ r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		@ r14= v2=~p5+p6>>1
+	MVN	r12,r12			@ r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			@ r11=~p7
+	SSUB8	r10,r14,r12		@ r10= m2=v2-u2
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = m2^p4
+	EOR	r11,r11,r10		@ r11= m2^~p7
+	UHADD8	r5, r5, r11		@ r5 = (m2^p4)+(m2^~p7)>>1
+	@ Single issue
+	EOR	r5, r5, r10		@ r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	@ Single issue
+	UHADD8	r5, r5, r12		@ r5 = a2+u2>>1
+	MOV	r12, #0x7F7F		@ r12 = {127}x4
+	MOVT	r12, #0x7F7F		@ r12 = {127}x4
+	UHADD8	r5, r5, r14		@ r5 = f2=(a2+u2>>1)+v2>>1
+	@ Now split f[i] by sign.
+	@ There's no min or max instruction.
+	@ We could use SSUB8 and SEL, but this is just as many instructions and
+	@  dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		@ r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		@ r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		@ r11= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r4, r2		@ r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		@ r10= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r4, r4, r14		@ r4 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r11,r5, r12		@ r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		@ r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		@ r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		@ r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		@ r10= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r14,r5, r2		@ r14= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		@ r11= min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r5, r5, r14		@ r5 = min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		@ r7 = p5+lflim(R_i,L)
+	STRD	r6, r7, [r0, -r1]		@ [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		@ r9 = p6-lflim(R_i,L)
+	STRD	r8, r9, [r0]		@ [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+	@ @ .size loop_filter_v_v6, .-loop_filter_v_v6	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_v6, %function; oc_loop_filter_frag_rows_v6: @ PROC
+_oc_loop_filter_frag_rows_v6:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	@ ll = *(int *)_bv
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	@			  bail
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_v6_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6
+	CMP	r4, r6		@ if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		@ r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_v6, .-oc_loop_filter_frag_rows_v6	@ ENDP
+  .endif
+
+  .if OC_ARM_ASM_NEON
+	.global	_oc_loop_filter_init_neon
+	.global	_oc_loop_filter_frag_rows_neon
+
+	@ .type oc_loop_filter_init_neon, %function; oc_loop_filter_init_neon: @ PROC
+_oc_loop_filter_init_neon:
+	@ r0 = _bv
+	@ r1 = _flimit (=L from the spec)
+	MOV		r1, r1, LSL #1  @ r1 = 2*L
+	VDUP.S16	Q15, r1		@ Q15= 2L in U16s
+	VST1.64		{D30,D31}, [r0,:128]
+	MOV	PC,r14
+	@ @ .size oc_loop_filter_init_neon, .-oc_loop_filter_init_neon	@ ENDP
+
+	@ .type loop_filter_h_neon, %function; loop_filter_h_neon: @ PROC
+loop_filter_h_neon:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	@ Doing a 2-element structure load saves doing two VTRN's below, at the
+	@  cost of using two more slower single-lane loads vs. the faster
+	@  all-lane loads.
+	@ It's less code this way, though, and benches a hair faster, but it
+	@  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1		@ D0 = ____________1100     2,1
+						@ D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1		@ D4 = ____________5544     2,1
+						@ D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1		@ D0 = ________99881100     3,1
+						@ D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1		@ D4 = ________DDCC5544     3,1
+						@ D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1		@ D0 = ____GGHH99881100     3,1
+						@ D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1		@ D4 = ____KKLLDDCC5544     3,1
+						@ D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1		@ D0 = PPOOGGHH99881100     3,1
+						@ D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1		@ D4 = TTSSKKLLDDCC5544     3,1
+						@ D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	@ D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511       1,1
+	VTRN.8	D2, D6	@ D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733       1,1
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	VSUBL.U8	Q8, D2, D4	@ Q8 = 22 - 11 in S16s              1,3
+	ADD	r12,r0, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	PLD	[r12,r1, LSL #1]
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	ADD	r12,r12,r1, LSL #2
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	PLD	[r12,-r1]
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	PLD	[r12]
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	PLD	[r12,r1]
+	VMOVL.U8	Q1, D2	   @ Q1 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	PLD	[r12,r1,LSL #1]
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	ADD	r12,r12,r1, LSL #2
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we negate the negative elements
+	@ and then add them, using -x = ~(x-1): add the sign mask in Q0
+	@ (0 or -1), then XOR with it.
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	PLD	[r12,-r1]
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4 @ Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q1, Q1, Q9 @ Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D4, Q2		@ D4 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D2, Q1		@ D2 = UUQQMMIIEEAA6622		    1,1
+	SUB	r12,r0, #1
+	VTRN.8	D4, D2		@ D4 = QQPPIIHHAA992211	D2 = MMLLEEDD6655   1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+	@ @ .size loop_filter_h_neon, .-loop_filter_h_neon	@ ENDP
+
+	@ .type loop_filter_v_neon, %function; loop_filter_v_neon: @ PROC
+loop_filter_v_neon:
+	@ r0 = unsigned char *_pix
+	@ r1 = int            _ystride
+	@ r2 = int           *_bv
+	@ preserves r0-r3
+	@ We assume Q15= 2*L in U16s
+	@                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12,:64], r1		@ D0 = SSOOKKGGCC884400     2,1
+	VLD1.64	{D2}, [r12,:64], r1		@ D2 = TTPPLLHHDD995511     2,1
+	VLD1.64	{D4}, [r12,:64], r1		@ D4 = UUQQMMIIEEAA6622     2,1
+	VLD1.64	{D6}, [r12,:64]			@ D6 = VVRRNNJJFFBB7733     2,1
+	VSUBL.U8	Q8, D4, D2	@ Q8 = 22 - 11 in S16s              1,3
+	VSUBL.U8	Q0, D0, D6	@ Q0 = 00 - 33 in S16s              1,3
+	ADD	r12, #8
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12]
+	VADD.S16	Q0, Q0, Q8	@                                   1,3
+	PLD	[r12,r1]
+	VADD.S16	Q0, Q0, Q8	@ Q0 = [0-3]+3*[2-1]                1,3
+	SUB	r12, r0, r1
+	VRSHR.S16	Q0, Q0, #3	@ Q0 = f = ([0-3]+3*[2-1]+4)>>3     1,4
+	@  We want to do
+	@ f =             CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	@   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX(  f , MIN(-2L- f ,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	@   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	@ So we've reduced the left and right hand terms to be the same, except
+	@ for a negation.
+	@ Stall x3
+	VABS.S16	Q9, Q0		@ Q9 = |f| in U16s                  1,4
+	VSHR.S16	Q0, Q0, #15	@ Q0 = -1 or 0 according to sign    1,3
+	@ Stall x2
+	VQSUB.U16	Q10,Q15,Q9	@ Q10= MAX(2L-|f|,0) in U16s        1,4
+	VMOVL.U8	Q2, D4	   @ Q2 = __UU__QQ__MM__II__EE__AA__66__22  2,3
+	@ Stall x2
+	VMIN.U16	Q9, Q10,Q9	@ Q9 = MIN(|f|,MAX(2L-|f|))         1,4
+	@ Now we need to correct for the sign of f.
+	@ For negative elements of Q0, we want to subtract the appropriate
+	@ element of Q9. For positive elements we want to add them. No NEON
+	@ instruction exists to do this, so we negate the negative elements
+	@ and then add them, using -x = ~(x-1): add the sign mask in Q0
+	@ (0 or -1), then XOR with it.
+	@ Stall x3
+	VADD.S16	Q9, Q9, Q0	@				    1,3
+	@ Stall x2
+	VEOR.S16	Q9, Q9, Q0	@ Q9 = real value of f              1,3
+	@ Bah. No VRSBW.U8
+	@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2 @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11  1,3
+	VSUB.S16	Q2, Q2, Q9 @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22  1,3
+	VQMOVUN.S16	D2, Q1		@ D2 = TTPPLLHHDD995511		    1,1
+	VQMOVUN.S16	D4, Q2		@ D4 = UUQQMMIIEEAA6622		    1,1
+	VST1.64	{D2}, [r12,:64], r1
+	VST1.64	{D4}, [r12,:64], r1
+	MOV	PC,r14
+	@ @ .size loop_filter_v_neon, .-loop_filter_v_neon	@ ENDP
+
+	@ .type oc_loop_filter_frag_rows_neon, %function; oc_loop_filter_frag_rows_neon: @ PROC
+_oc_loop_filter_frag_rows_neon:
+	@ r0 = _ref_frame_data
+	@ r1 = _ystride
+	@ r2 = _bv
+	@ r3 = _frags
+	@ r4 = _fragi0
+	@ r5 = _fragi0_end
+	@ r6 = _fragi_top
+	@ r7 = _fragi_bot
+	@ r8 = _frag_buf_offs
+	@ r9 = _nhfrags
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		@ if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end	@   bail
+	SUBS	r9, r9, #1	@ r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	@		  bail
+	VLD1.64	{D30,D31}, [r2,:128]	@ Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	@ r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	@ r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	@ _fragi_bot -= _nhfrags;
+oslffri_neon_lp1:
+	MOV	r10,r4		@ r10= fragi = _fragi0
+	ADD	r11,r4, r9	@ r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2:
+	LDR	r14,[r3], #4	@ r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	@ r0 = _ref_frame_data
+	LDR	r12,[r8], #4	@ r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		@ if (fragi>_fragi0)
+	ADD	r0, r0, r12	@ r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon
+	CMP	r4, r6		@ if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon
+	CMP	r10,r11		@ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	@ r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	@ r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	@ && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon
+	CMP	r10,r7		@ if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	@ r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		@ while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end:
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded:
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		@ r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+	@ @ .size oc_loop_filter_frag_rows_neon, .-oc_loop_filter_frag_rows_neon	@ ENDP
+  .endif
+
+	@ END
+    @ .section	.note.GNU-stack,"",%progbits
+#endif
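Per the formulas spelled out in the comments of the routines above, the filter reduces to: compute R across the edge, clamp it with lflim(R,L), then adjust the two pixels nearest the edge. A plain scalar rendering of that sequence is sketched here; the names lflim, clamp255 and filter_span are illustrative, saturation to [0,255] stands in for the UQADD8/VQMOVUN steps, and an arithmetic right shift is assumed.

#include <stdio.h>
#include <stdlib.h>

/* lflim(R,L), as in the comments above: sign(R)*MIN(|R|,MAX(2*L-|R|,0)). */
static int lflim(int r,int two_l){
  int a=abs(r);
  int lim=two_l-a;
  if(lim<0)lim=0;          /* MAX(2*L-|R|,0) */
  if(a>lim)a=lim;          /* MIN(|R|,...) */
  return r<0?-a:a;         /* restore the sign of R */
}

static unsigned char clamp255(int x){return x<0?0:x>255?255:(unsigned char)x;}

/* Filter one 4-pixel span p[0..3]; the edge lies between p[1] and p[2]. */
static void filter_span(unsigned char *p,int two_l){
  int r=(p[0]-p[3]+3*(p[2]-p[1])+4)>>3;   /* arithmetic shift assumed */
  int f=lflim(r,two_l);
  p[1]=clamp255(p[1]+f);
  p[2]=clamp255(p[2]-f);
}

int main(void){
  unsigned char p[4]={40,60,120,130};     /* arbitrary sample values */
  filter_span(p,2*10);                    /* L=10, so 2*L=20 */
  printf("filtered: %u %u %u %u\n",p[0],p[1],p[2],p[3]);
  return 0;
}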

+ 219 - 0
modules/theoraplayer/native/theora/lib/arm_llvm/armstate.c

@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+  _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
+  /*Note: We _must_ set this function pointer, because the macro in armint.h
+     calls it with different arguments, so the C version will segfault.*/
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+    _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(_state->cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+    _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+#    endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+  }
+#   endif
+#  endif
+# endif
+}
+
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_arm(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_v6(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    oc_idct8x8_1_neon(_dct_coeffs+64,p);
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+#  endif
+# endif
+
+#endif

+ 114 - 0
modules/theoraplayer/native/theora/lib/bitpack.c

@@ -0,0 +1,114 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitpack.c 17410 2010-09-21 21:53:48Z tterribe $
+
+ ********************************************************************/
+#include <string.h>
+#include <stdlib.h>
+#include "bitpack.h"
+
+/*We're 'MSb' endian; if we write a word but read individual bits,
+   then we'll read the MSb first.*/
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes){
+  memset(_b,0,sizeof(*_b));
+  _b->ptr=_buf;
+  _b->stop=_buf+_bytes;
+}
+
+static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  unsigned             shift;
+  stop=_b->stop;
+  ptr=_b->ptr;
+  window=_b->window;
+  available=_b->bits;
+  shift=OC_PB_WINDOW_SIZE-available;
+  while(7<shift&&ptr<stop){
+    shift-=8;
+    window|=(oc_pb_window)*ptr++<<shift;
+  }
+  _b->ptr=ptr;
+  available=OC_PB_WINDOW_SIZE-shift;
+  if(_bits>available){
+    if(ptr>=stop){
+      _b->eof=1;
+      available=OC_LOTS_OF_BITS;
+    }
+    else window|=*ptr>>(available&7);
+  }
+  _b->bits=available;
+  return window;
+}
+
+int oc_pack_look1(oc_pack_buf *_b){
+  oc_pb_window window;
+  int          available;
+  window=_b->window;
+  available=_b->bits;
+  if(available<1)_b->window=window=oc_pack_refill(_b,1);
+  return window>>OC_PB_WINDOW_SIZE-1;
+}
+
+void oc_pack_adv1(oc_pack_buf *_b){
+  _b->window<<=1;
+  _b->bits--;
+}
+
+/*Here we assume that 0<=_bits&&_bits<=32.*/
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
+  oc_pb_window window;
+  int          available;
+  long         result;
+  window=_b->window;
+  available=_b->bits;
+  if(_bits==0)return 0;
+  if(available<_bits){
+    window=oc_pack_refill(_b,_bits);
+    available=_b->bits;
+  }
+  result=window>>OC_PB_WINDOW_SIZE-_bits;
+  available-=_bits;
+  window<<=1;
+  window<<=_bits-1;
+  _b->window=window;
+  _b->bits=available;
+  return result;
+}
+
+int oc_pack_read1_c(oc_pack_buf *_b){
+  oc_pb_window window;
+  int          available;
+  int          result;
+  window=_b->window;
+  available=_b->bits;
+  if(available<1){
+    window=oc_pack_refill(_b,1);
+    available=_b->bits;
+  }
+  result=window>>OC_PB_WINDOW_SIZE-1;
+  available--;
+  window<<=1;
+  _b->window=window;
+  _b->bits=available;
+  return result;
+}
+
+long oc_pack_bytes_left(oc_pack_buf *_b){
+  if(_b->eof)return -1;
+  return _b->stop-_b->ptr+(_b->bits>>3);
+}
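The unpacker above reads MSb-first: whole bytes are shifted into the top of a machine-word window and each field is taken from the top of that window. A hedged usage sketch follows, assuming it is compiled together with this bitpack.c and the bitpack.h declared below; the buffer contents are arbitrary.

#include <stdio.h>
#include "bitpack.h"

int main(void){
  unsigned char buf[2]={0xA5,0x3C};     /* bits, MSb first: 10100101 00111100 */
  oc_pack_buf b;
  oc_pack_readinit(&b,buf,sizeof(buf));
  printf("%ld\n",oc_pack_read(&b,3));   /* 101 -> 5 */
  printf("%d\n",oc_pack_look1(&b));     /* peek the next bit: 0 */
  oc_pack_adv1(&b);                     /* ...then consume it */
  printf("%ld\n",oc_pack_read(&b,8));   /* 01010011 -> 83 */
  printf("%ld\n",oc_pack_bytes_left(&b));
  return 0;
}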

+ 76 - 0
modules/theoraplayer/native/theora/lib/bitpack.h

@@ -0,0 +1,76 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
+
+ ********************************************************************/
+#if !defined(_bitpack_H)
+# define _bitpack_H (1)
+# include <stddef.h>
+# include <limits.h>
+# include "internal.h"
+
+
+
+typedef size_t             oc_pb_window;
+typedef struct oc_pack_buf oc_pack_buf;
+
+
+
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+#  include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+#  define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+#  define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+#  define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
+# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define OC_LOTS_OF_BITS (0x40000000)
+
+
+
+struct oc_pack_buf{
+  const unsigned char *stop;
+  const unsigned char *ptr;
+  oc_pb_window         window;
+  int                  bits;
+  int                  eof;
+};
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
+int oc_pack_look1(oc_pack_buf *_b);
+void oc_pack_adv1(oc_pack_buf *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
+/* returns -1 for read beyond EOF, or the number of whole bytes available */
+long oc_pack_bytes_left(oc_pack_buf *_b);
+
+/*These two functions are implemented locally in huffdec.c*/
+/*Read in bits without advancing the bitptr.
+  Here we assume 0<=_bits&&_bits<=32.*/
+/*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/
+/*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/
+
+#endif

+ 153 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xdec.c

@@ -0,0 +1,153 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include "c64xdec.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_dec_accel_init_c64x(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=oc_dec_dc_unpredict_mcu_plane_c64x;
+# endif
+}
+
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+   rows).
+  As a side effect, the number of coded and uncoded fragments in this plane of
+   the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frags;
+  int                     *pred_last;
+  ptrdiff_t                ncoded_fragis;
+  ptrdiff_t                fragi;
+  int                      fragx;
+  int                      fragy;
+  int                      fragy0;
+  int                      fragy_end;
+  int                      nhfrags;
+  /*Compute the first and last fragment row of the current MCU for this
+     plane.*/
+  fplane=_dec->state.fplanes+_pli;
+  fragy0=_pipe->fragy0[_pli];
+  fragy_end=_pipe->fragy_end[_pli];
+  nhfrags=fplane->nhfrags;
+  pred_last=_pipe->pred_last[_pli];
+  frags=_dec->state.frags;
+  ncoded_fragis=0;
+  fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+  for(fragy=fragy0;fragy<fragy_end;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int coded;
+        int refi;
+        /*The TI compiler refuses to pipeline this if we put it in an if(coded)
+           block.
+          We can do the loads unconditionally, which helps move them earlier.
+          We do the store unconditionally too, because if we use a conditional
+           store, the compiler propagates the condition back to the operations
+           the store depended on, presumably to reduce cache pressure by
+           eliminating dead loads.
+          However, these loads are "free" in the cache sense, since reading the
+           coded flag brings in all four bytes anyway, and starting the loads
+           before we know the coded flag saves 6 cycles.*/
+        refi=frags[fragi].refi;
+        coded=frags[fragi].coded;
+        frags[fragi].dc=pred_last[refi]+=frags[fragi].dc&-coded;
+        ncoded_fragis+=coded;
+      }
+    }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].refi;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        int refi;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else ur_ref=u_frags[fragi+1].refi;
+        refi=frags[fragi].refi;
+        if(frags[fragi].coded){
+          static const int OC_PRED_SCALE[16][2]={
+            {0x00000000,0x00000000},
+            {0x00000000,0x00000080},
+            {0x00800000,0x00000000},
+            {0x00000000,0x00000080},
+            {0x00000080,0x00000000},
+            {0x00000040,0x00000040},
+            {0x00000080,0x00000000},
+            {0xFF980074,0x00000074},
+            {0x00000000,0x00800000},
+            {0x00000000,0x0035004B},
+            {0x00400000,0x00400000},
+            {0x00000000,0x0035004B},
+            {0x00000080,0x00000000},
+            {0x00000000,0x0035004B},
+            {0x00180050,0x00180000},
+            {0xFF980074,0x00000074},
+          };
+          ogg_int16_t p0;
+          ogg_int16_t p1;
+          ogg_int16_t p2;
+          ogg_int16_t p3;
+          int         pred;
+          int         pflags;
+          /*29 cycles.*/
+          /*HACK: This p0 reference could potentially be out of bounds, but
+             because we know what allocator Leonora is using, we know it can't
+             segfault.*/
+          p0=u_frags[fragi-1].dc;
+          p1=u_frags[fragi].dc;
+          p2=u_frags[fragi+1].dc;
+          p3=frags[fragi-1].dc;
+          pflags=_cmpeq4(_packl4(_pack2(ur_ref,u_ref),_pack2(ul_ref,l_ref)),
+           _packl4(_pack2(refi,refi),_pack2(refi,refi)));
+          if(pflags==0)pred=pred_last[refi];
+          else{
+            pred=(_dotp2(_pack2(p0,p1),OC_PRED_SCALE[pflags][0])
+             +_dotp2(_pack2(p2,p3),OC_PRED_SCALE[pflags][1]))/128;
+            if((pflags&7)==7){
+              if(abs(pred-p1)>128)pred=p1;
+              else if(abs(pred-p3)>128)pred=p3;
+              else if(abs(pred-p0)>128)pred=p0;
+            }
+          }
+          pred_last[refi]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=refi;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  /*Also save the number of uncoded fragments so we know how many to copy.*/
+  _pipe->nuncoded_fragis[_pli]=
+   (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+#endif
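The first-row loop above leans on a branch-free update: with coded in {0,1}, the expression dc&-coded passes dc through when the fragment is coded and contributes zero otherwise, so the running predictor only advances for coded fragments while the loads and the store stay unconditional. A hedged standalone illustration of that idiom, with made-up values:

#include <stdio.h>

int main(void){
  int pred_last=42;            /* running DC predictor for one reference frame */
  int dc[4]={7,-3,5,9};        /* decoded DC residuals */
  int coded[4]={1,0,1,1};      /* 1 = fragment is coded */
  int i;
  for(i=0;i<4;i++){
    /* coded==1: -coded is all ones, so the residual passes through;
       coded==0: the mask is zero and pred_last is left untouched. */
    pred_last+=dc[i]&-coded[i];
    dc[i]=pred_last;           /* unconditional store, as in the loop above */
    printf("frag %d: dc=%d pred_last=%d\n",i,dc[i],pred_last);
  }
  return 0;
}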

+ 33 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xdec.h

@@ -0,0 +1,33 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_c64x_c64xdec_H)
+# define _c64x_c64xdec_H (1)
+# include "c64xint.h"
+
+# if defined(OC_C64X_ASM)
+#  define oc_dec_accel_init oc_dec_accel_init_c64x
+#  define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c64x
+# endif
+
+# include "../decint.h"
+
+void oc_dec_accel_init_c64x(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c64x(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif

+ 447 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xfrag.c

@@ -0,0 +1,447 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+
+
+
+/*14 cycles.*/
+void oc_frag_copy_c64x(unsigned char *restrict _dst,
+ const unsigned char *restrict _src,int _ystride){
+  unsigned char *restrict       d2;
+  const unsigned char *restrict s2;
+  d2=_dst+_ystride;
+  s2=_src+_ystride;
+#define OC_ITER() \
+  do{ \
+    _amem8(_dst)=_mem8(_src); \
+    _dst+=2*_ystride; \
+    _src+=2*_ystride; \
+    _amem8(d2)=_mem8(s2); \
+    d2+=2*_ystride; \
+    s2+=2*_ystride; \
+  } \
+  while(0)
+  OC_ITER();
+  OC_ITER();
+  OC_ITER();
+  OC_ITER();
+#undef OC_ITER
+}
+
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  /*9 cycles per iteration.*/
+  for(fragii=0;fragii<_nfragis;fragii++){
+    const unsigned char *restrict src;
+    const unsigned char *restrict s2;
+    unsigned char       *restrict dst;
+    unsigned char       *restrict d2;
+    ptrdiff_t                     frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    dst=_dst_frame+frag_buf_off;
+    src=_src_frame+frag_buf_off;
+    d2=dst+_ystride;
+    s2=src+_ystride;
+#define OC_ITER() \
+  do{ \
+    _amem8(dst)=_amem8_const(src); \
+    dst+=2*_ystride; \
+    src+=2*_ystride; \
+    _amem8(d2)=_amem8_const(s2); \
+    d2+=2*_ystride; \
+    s2+=2*_ystride; \
+  } \
+  while(0)
+    OC_ITER();
+    OC_ITER();
+    OC_ITER();
+    OC_ITER();
+#undef OC_ITER
+  }
+}
+
+/*34 cycles.*/
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       x1;
+    int       y1;
+    int       x2;
+    int       y2;
+    ll=_amem8_const(_residue+i*8+0);
+    x1=_sadd2(_loll(ll),0x00800080);
+    y1=_sadd2(_hill(ll),0x00800080);
+    ll=_amem8_const(_residue+i*8+4);
+    x2=_sadd2(_loll(ll),0x00800080);
+    y2=_sadd2(_hill(ll),0x00800080);
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+  }
+}
+
+/*41 cycles.*/
+void oc_frag_recon_inter_c64x(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       x1;
+    int       y1;
+    int       z1;
+    int       x2;
+    int       y2;
+    int       z2;
+    ll=_mem8_const(_src);
+    z1=_loll(ll);
+    z2=_hill(ll);
+    ll=_amem8_const(_residue+i*8+0);
+    x1=_sadd2(_unpklu4(z1),_loll(ll));
+    y1=_sadd2(_unpkhu4(z1),_hill(ll));
+    ll=_amem8_const(_residue+i*8+4);
+    x2=_sadd2(_unpklu4(z2),_loll(ll));
+    y2=_sadd2(_unpkhu4(z2),_hill(ll));
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+    _src+=_ystride;
+  }
+}
+
+/*56 cycles.*/
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+  int i;
+  for(i=0;i<8;i++){
+    long long ll;
+    int       a;
+    int       b;
+    int       c;
+    int       d;
+    int       x1;
+    int       y1;
+    int       z1;
+    int       x2;
+    int       y2;
+    int       z2;
+    ll=_mem8_const(_src1);
+    a=_loll(ll);
+    b=_hill(ll);
+    ll=_mem8_const(_src2);
+    c=_loll(ll);
+    d=_hill(ll);
+    ll=_amem8_const(_residue+i*8+0);
+    z1=~_avgu4(~a,~c);
+    x1=_sadd2(_unpklu4(z1),_loll(ll));
+    y1=_sadd2(_unpkhu4(z1),_hill(ll));
+    ll=_amem8_const(_residue+i*8+4);
+    z2=~_avgu4(~b,~d);
+    x2=_sadd2(_unpklu4(z2),_loll(ll));
+    y2=_sadd2(_unpkhu4(z2),_hill(ll));
+    _amem8(_dst)=_itoll(_spacku4(y2,x2),_spacku4(y1,x1));
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}
+
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            refi;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    int         p;
+    long long   ll;
+    int         ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5;
+    ll=_itoll(_pack2(p,p),_pack2(p,p));
+    for(ci=0;ci<64;ci+=4)_amem8(_dct_coeffs+64+ci)=ll;
+  }
+  else{
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_c64x(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  refi=_state->frags[_fragi].refi;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs+64);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi])>1){
+      oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
+       ystride,_dct_coeffs+64);
+    }
+    else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+  }
+}
+
+/*46 cycles.*/
+static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
+  int p0;
+  int p1;
+  int p2;
+  int p3;
+  int p4;
+  int p5;
+  int p6;
+  int p7;
+  int y;
+  _pix-=2;
+  /*Do all the loads now to avoid the compiler's inability to prove they're not
+     dependent on the stores later.*/
+  p0=_mem4(_pix+_ystride*0);
+  p1=_mem4(_pix+_ystride*1);
+  p2=_mem4(_pix+_ystride*2);
+  p3=_mem4(_pix+_ystride*3);
+  p4=_mem4(_pix+_ystride*4);
+  p5=_mem4(_pix+_ystride*5);
+  p6=_mem4(_pix+_ystride*6);
+  p7=_mem4(_pix+_ystride*7);
+  for(y=0;y<8;y+=4){
+    int f;
+    int a;
+    int b;
+    int u;
+    int v;
+    /*We could pack things right after the dot product, but delaying it
+       actually saves three cycles due to better instruction scheduling.*/
+    a=_dotpsu4(0x01FD03FF,p0)+3>>3;
+    b=_dotpsu4(0x01FD03FF,p1)+3>>3;
+    u=_dotpsu4(0x01FD03FF,p2)+3>>3;
+    v=_dotpsu4(0x01FD03FF,p3)+3>>3;
+    f=_packl4(_pack2(v,u),_pack2(b,a));
+    /*We split the results by sign and work with abs(f) here, since the C64x
+       signed-unsigned addition with unsigned saturation is only available for
+       16-bit operands.
+      For 8-bit operands, we have to emulate it with a saturated addition and a
+       saturated subtraction using separate unsigned values.
+      There's no direct support for 8-bit saturated subtraction, either, so we
+       have to emulate that as well, using either x-_minu4(x,y) or
+       ~_saddu4(~x,y), depending on which one schedules better.*/
+    f=_add4(0x80808080,f);
+    b=_minu4(0x80808080,f);
+    a=0x80808080-b;
+    b=f-b;
+    /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+    u=_saddu4(a,_ll);
+    v=_saddu4(b,_ll);
+    a=_saddu4(a,u);
+    b=_saddu4(b,v);
+    a=a-_minu4(a,u);
+    b=b-_minu4(b,v);
+    /*Apply the changes to the original pixels.*/
+    u=_pack2(p1>>8,p0>>8);
+    v=_pack2(p3>>8,p2>>8);
+    p1=_packl4(v,u);
+    p2=_packh4(v,u);
+    p1=_saddu4(~_saddu4(~p1,b),a);
+    p2=_saddu4(p2-_minu4(p2,a),b);
+    /*For unaligned short stores, we have to store byte by byte.
+      It's faster to do it explicitly than to use _mem2().*/
+    _pix[_ystride*0+1]=(unsigned char)p1;
+    _pix[_ystride*0+2]=(unsigned char)p2;
+    _pix[_ystride*1+1]=(unsigned char)(p1>>8);
+    _pix[_ystride*1+2]=(unsigned char)(p2>>8);
+    _pix[_ystride*2+1]=(unsigned char)(p1>>16);
+    _pix[_ystride*2+2]=(unsigned char)(p2>>16);
+    _pix[_ystride*3+1]=(unsigned char)(p1>>24);
+    _pix[_ystride*3+2]=(unsigned char)(p2>>24);
+    p0=p4;
+    p1=p5;
+    p2=p6;
+    p3=p7;
+    _pix+=4*_ystride;
+  }
+}
+
+/*38 cycles.*/
+static void loop_filter_v(unsigned char * restrict _pix,int _ystride,int _ll){
+  long long ll;
+  int       p0;
+  int       p1;
+  int       p2;
+  int       p3;
+  int       p4;
+  int       p5;
+  int       p6;
+  int       p7;
+  int       a1;
+  int       b1;
+  int       f1;
+  int       m1;
+  int       u1;
+  int       v1;
+  int       a2;
+  int       b2;
+  int       f2;
+  int       m2;
+  int       u2;
+  int       v2;
+  /*Do all the loads now to avoid the compiler's inability to prove they're not
+     dependent on the stores later.*/
+  ll=_amem8(_pix-_ystride*2);
+  p0=_loll(ll);
+  p4=_hill(ll);
+  ll=_amem8(_pix-_ystride*1);
+  p1=_loll(ll);
+  p5=_hill(ll);
+  ll=_amem8(_pix+_ystride*0);
+  p2=_loll(ll);
+  p6=_hill(ll);
+  ll=_amem8(_pix+_ystride*1);
+  p3=_loll(ll);
+  p7=_hill(ll);
+  /*I can't find a way to put the rest in a loop that the compiler thinks is
+     unrollable, so instead it's unrolled manually.*/
+  /*This first part is based on the transformation
+    f = -(3*(p2-p1)+p0-p3+4>>3)
+      = -(3*(p2+255-p1)+(p0+255-p3)+4-1020>>3)
+      = -(3*(p2+~p1)+(p0+~p3)-1016>>3)
+      = 127-(3*(p2+~p1)+(p0+~p3)>>3)
+      = 128+~(3*(p2+~p1)+(p0+~p3)>>3) (mod 256).
+    Although _avgu4(a,b) = (a+b+1>>1) (biased up), we rely heavily on the
+     fact that ~_avgu4(~a,~b) = (a+b>>1) (biased down).*/
+  /*We need this first average both biased up and biased down.*/
+  u1=~_avgu4(~p1,p2);
+  v1=_avgu4(p1,~p2);
+  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+  m1=_sub4(u1,v1);
+  a1=m1^_avgu4(m1^~p0,m1^p3);
+  f1=_avgu4(_avgu4(a1,u1),v1);
+  /*Instead of removing the bias by 128, we use it to split f by sign, since
+     the C64x signed-unsigned addition with unsigned saturation is only
+     available for 16-bit operands.
+    For 8-bit operands, we have to emulate it with a saturated addition and a
+     saturated subtraction using separate unsigned values.
+    There's no direct support for 8-bit saturated subtraction, either, so we
+     have to emulate that as well, using either x-_minu4(x,y) or
+     ~_saddu4(~x,y), depending on which one schedules better.*/
+  b1=_minu4(0x80808080,f1);
+  a1=0x80808080-b1;
+  b1=f1-b1;
+  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+  u1=_saddu4(a1,_ll);
+  v1=_saddu4(b1,_ll);
+  a1=_saddu4(a1,u1);
+  b1=_saddu4(b1,v1);
+  a1=a1-_minu4(a1,u1);
+  b1=b1-_minu4(b1,v1);
+  /*Apply the changes to the original pixels.*/
+  p1=_saddu4(p1-_minu4(p1,b1),a1);
+  p2=_saddu4(p2-_minu4(p2,a1),b1);
+  /*We need this first average both biased up and biased down.*/
+  u2=~_avgu4(~p5,p6);
+  v2=_avgu4(p5,~p6);
+  /*The difference controls whether (p3+255-p0>>1) is biased up or down.*/
+  m2=_sub4(u2,v2);
+  a2=m2^_avgu4(m2^~p4,m2^p7);
+  f2=_avgu4(_avgu4(a2,u2),v2);
+  /*Instead of removing the bias by 128, we use it to split f by sign.*/
+  b2=_minu4(0x80808080,f2);
+  a2=0x80808080-b2;
+  b2=f2-b2;
+  /*Compute f=clamp(0,2*L-abs(f),abs(f)).*/
+  u2=_saddu4(a2,_ll);
+  v2=_saddu4(b2,_ll);
+  a2=_saddu4(a2,u2);
+  b2=_saddu4(b2,v2);
+  a2=a2-_minu4(a2,u2);
+  b2=b2-_minu4(b2,v2);
+  /*Apply the changes to the original pixels.*/
+  p5=_saddu4(p5-_minu4(p5,b2),a2);
+  p6=_saddu4(p6-_minu4(p6,a2),b2);
+  /*Write out the results.*/
+  _amem8(_pix-_ystride)=_itoll(p5,p1);
+  _amem8(_pix)=_itoll(p6,p2);
+}
+
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit){
+  int ll;
+  ll=_flimit<<1;
+  ll=_pack2(ll,ll);
+  ll=~_spacku4(ll,ll);
+  *((int *)_bv)=ll;
+}
+
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  int                      ll;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  ll=*((int *)_bv);
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0)loop_filter_h(ref,ystride,ll);
+        if(fragi0>fragi_top)loop_filter_v(ref,ystride,ll);
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          loop_filter_h(ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          loop_filter_v(ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
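The comments above rely on the identity that complementing both inputs of a round-up byte average and then complementing the result gives the round-down average: ~avg(~a,~b) == (a+b)>>1 when avg(a,b) == (a+b+1)>>1 per lane, as _avgu4, PAVGB and VRHADD do. A hedged standalone check of that identity over all byte pairs; avg_up is an illustrative scalar stand-in for one lane:

#include <stdio.h>

/* One byte lane of _avgu4: rounding-up average. */
static unsigned avg_up(unsigned a,unsigned b){return (a+b+1)>>1;}

int main(void){
  unsigned a,b;
  for(a=0;a<256;a++){
    for(b=0;b<256;b++){
      unsigned down=0xFF^avg_up(0xFF^a,0xFF^b);   /* ~avg(~a,~b) on one lane */
      if(down!=((a+b)>>1)){
        printf("mismatch at a=%u b=%u\n",a,b);
        return 1;
      }
    }
  }
  printf("~avg(~a,~b) == (a+b)>>1 holds for all byte pairs\n");
  return 0;
}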

+ 415 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xidct.c

@@ -0,0 +1,415 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+#include <string.h>
+#include "c64xint.h"
+#include "dct.h"
+
+#define OC_C1S7D ((OC_C1S7<<16)|(OC_C1S7&0xFFFF))
+#define OC_C2S6D ((OC_C2S6<<16)|(OC_C2S6&0xFFFF))
+#define OC_C3S5D ((OC_C3S5<<16)|(OC_C3S5&0xFFFF))
+#define OC_C4S4D ((OC_C4S4<<16)|(OC_C4S4&0xFFFF))
+#define OC_C5S3D ((OC_C5S3<<16)|(OC_C5S3&0xFFFF))
+#define OC_C6S2D ((OC_C6S2<<16)|(OC_C6S2&0xFFFF))
+#define OC_C7S1D ((OC_C7S1<<16)|(OC_C7S1&0xFFFF))
+
+/*Various building blocks for the iDCT implementations.
+  These are done in macros instead of functions so that we can use all local
+   variables, which avoids leaving the compiler to try to sort out memory
+   reference dependencies.*/
+
+/*Load two rows into x0...x7.*/
+#define OC_IDCT8x2_LOAD8(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+    x2=_loll(ll); \
+    x3=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+12),_amem4_const((_x)+4)); \
+    x4=_loll(ll); \
+    x5=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+14),_amem4_const((_x)+6)); \
+    x6=_loll(ll); \
+    x7=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two rows into x0...x3.
+  Uses ll as a temporary.*/
+#define OC_IDCT8x2_LOAD4(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+    ll=_dpack2(_amem4_const((_x)+10),_amem4_const((_x)+2)); \
+    x2=_loll(ll); \
+    x3=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two rows into x0...x1.*/
+#define OC_IDCT8x2_LOAD2(_x) \
+  do{ \
+    long long ll; \
+    ll=_dpack2(_amem4_const((_x)+8),_amem4_const((_x)+0)); \
+    x0=_loll(ll); \
+    x1=_hill(ll); \
+  } \
+  while(0)
+
+/*Load two columns into x0...x1.*/
+#define OC_IDCT8x2_LOAD2T(_x) \
+  do{ \
+    x0=_amem4_const((_x)+(0<<3)); \
+    x1=_amem4_const((_x)+(1<<3)); \
+  } \
+  while(0)
+
+/*Transform x0...x7 into t0...t7.*/
+#define OC_IDCT8x2() \
+  do{ \
+    long long ll; \
+    int       a; \
+    int       b; \
+    /*Stage 1:*/ \
+    ll=_addsub2(x0,x4); \
+    a=_hill(ll); \
+    b=_loll(ll); \
+    t0=_packh2(_mpyhus(OC_C4S4D,a),_mpyus(OC_C4S4D,a)); \
+    t1=_packh2(_mpyhus(OC_C4S4D,b),_mpyus(OC_C4S4D,b)); \
+    ll=_mpy2ll(OC_C6S2D,x2); \
+    a=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C2S6D,x6); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x6); \
+    t2=_sub2(a,b); \
+    ll=_mpy2ll(OC_C2S6D,x2); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+    ll=_mpy2ll(OC_C6S2D,x6); \
+    b=_packh2(_hill(ll),_loll(ll)); \
+    t3=_add2(a,b); \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    a=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C1S7D,x7); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x7); \
+    t4=_sub2(a,b); \
+    ll=_mpy2ll(OC_C3S5D,x5); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+    ll=_mpy2ll(OC_C5S3D,x3); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    t5=_sub2(a,b); \
+    ll=_mpy2ll(OC_C5S3D,x5); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x5); \
+    ll=_mpy2ll(OC_C3S5D,x3); \
+    b=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    t6=_add2(a,b); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    a=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    ll=_mpy2ll(OC_C7S1D,x7); \
+    b=_packh2(_hill(ll),_loll(ll)); \
+    t7=_add2(a,b); \
+    /*Stage 2:*/ \
+    ll=_addsub2(t4,t5); \
+    t4=_hill(ll); \
+    b=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,b); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+    ll=_addsub2(t7,t6); \
+    t7=_hill(ll); \
+    b=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,b); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),b); \
+    /*Stage 3:*/ \
+    ll=_addsub2(t0,t3); \
+    t0=_hill(ll); \
+    t3=_loll(ll); \
+    ll=_addsub2(t1,t2); \
+    t1=_hill(ll); \
+    t2=_loll(ll); \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Transform x0...x3 into t0...t7, assuming x4...x7 are zero.*/
+#define OC_IDCT8x2_4() \
+  do{ \
+    long long ll; \
+    int       a; \
+    /*Stage 1:*/ \
+    ll=_mpy2ll(OC_C4S4D,x0); \
+    t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+    t1=t0; \
+    ll=_mpy2ll(OC_C6S2D,x2); \
+    t2=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C2S6D,x2); \
+    t3=_add2(_packh2(_hill(ll),_loll(ll)),x2); \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    t4=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C5S3D,x3); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    ll=_mpy2ll(OC_C3S5D,x3); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),x3); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    /*Stage 2:*/ \
+    ll=_addsub2(t4,t5); \
+    t4=_loll(ll); \
+    a=_hill(ll); \
+    ll=_mpy2ll(OC_C4S4D,a); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+    ll=_addsub2(t7,t6); \
+    t7=_hill(ll); \
+    a=_loll(ll); \
+    ll=_mpy2ll(OC_C4S4D,a); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),a); \
+    /*Stage 3:*/ \
+    ll=_addsub2(t0,t3); \
+    t0=_hill(ll); \
+    t3=_loll(ll); \
+    ll=_addsub2(t1,t2); \
+    t1=_hill(ll); \
+    t2=_loll(ll); \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Transform x0...x1 into t0...t7, assuming x2...x7 are zero.*/
+#define OC_IDCT8x2_2() \
+  do{ \
+    long long ll; \
+    /*Stage 1:*/ \
+    ll=_mpy2ll(OC_C4S4D,x0); \
+    t0=_add2(_packh2(_hill(ll),_loll(ll)),x0); \
+    t1=t0; \
+    ll=_mpy2ll(OC_C7S1D,x1); \
+    t4=_packh2(_hill(ll),_loll(ll)); \
+    ll=_mpy2ll(OC_C1S7D,x1); \
+    t7=_add2(_packh2(_hill(ll),_loll(ll)),x1); \
+    /*Stage 2:*/ \
+    ll=_mpy2ll(OC_C4S4D,t4); \
+    t5=_add2(_packh2(_hill(ll),_loll(ll)),t4); \
+    ll=_mpy2ll(OC_C4S4D,t7); \
+    t6=_add2(_packh2(_hill(ll),_loll(ll)),t7); \
+    /*Stage 3:*/ \
+    t3=t0; \
+    t2=t1; \
+    ll=_addsub2(t6,t5); \
+    t6=_hill(ll); \
+    t5=_loll(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7 and store two rows.*/
+#define OC_IDCT8x2_STORE(_y) \
+  do{ \
+    long long ll; \
+    int       a; \
+    int       b; \
+    int       c; \
+    int       d; \
+    /*Stage 4:*/ \
+    ll=_addsub2(t0,t7); \
+    a=_hill(ll); \
+    c=_loll(ll); \
+    ll=_addsub2(t1,t6); \
+    b=_hill(ll); \
+    d=_loll(ll); \
+    ll=_dpack2(b,a); \
+    _amem4((_y)+0)=_loll(ll); \
+    _amem4((_y)+8)=_hill(ll); \
+    ll=_dpack2(c,d); \
+    _amem4((_y)+6)=_loll(ll); \
+    _amem4((_y)+14)=_hill(ll); \
+    ll=_addsub2(t2,t5); \
+    a=_hill(ll); \
+    c=_loll(ll); \
+    ll=_addsub2(t3,t4); \
+    b=_hill(ll); \
+    d=_loll(ll); \
+    ll=_dpack2(b,a); \
+    _amem4((_y)+2)=_loll(ll); \
+    _amem4((_y)+10)=_hill(ll); \
+    ll=_dpack2(c,d); \
+    _amem4((_y)+4)=_loll(ll); \
+    _amem4((_y)+12)=_hill(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7 and store two columns.*/
+#define OC_IDCT8x2_STORET(_y) \
+  do{ \
+    long long ll; \
+    /*Stage 4:*/ \
+    ll=_addsub2(t0,t7); \
+    _amem4((_y)+(0<<3))=_hill(ll); \
+    _amem4((_y)+(7<<3))=_loll(ll); \
+    ll=_addsub2(t1,t6); \
+    _amem4((_y)+(1<<3))=_hill(ll); \
+    _amem4((_y)+(6<<3))=_loll(ll); \
+    ll=_addsub2(t2,t5); \
+    _amem4((_y)+(2<<3))=_hill(ll); \
+    _amem4((_y)+(5<<3))=_loll(ll); \
+    ll=_addsub2(t3,t4); \
+    _amem4((_y)+(3<<3))=_hill(ll); \
+    _amem4((_y)+(4<<3))=_loll(ll); \
+  } \
+  while(0)
+
+/*Finish transforming t0...t7, round and scale, and store two columns.*/
+#define OC_IDCT8x2_ROUND_STORET(_y) \
+  do{ \
+    long long ll; \
+    /*Stage 4:*/ \
+    /*Adjust for the scale factor.*/ \
+    ll=_addsub2(t0,t7); \
+    _amem4((_y)+(0<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(7<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t1,t6); \
+    _amem4((_y)+(1<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(6<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t2,t5); \
+    _amem4((_y)+(2<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(5<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+    ll=_addsub2(t3,t4); \
+    _amem4((_y)+(3<<3))=_shr2(_add2(_hill(ll),0x00080008),4); \
+    _amem4((_y)+(4<<3))=_shr2(_add2(_loll(ll),0x00080008),4); \
+  } \
+  while(0)
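As a side note on the rounding above (a scalar sketch, not part of the patch): the packed constant 0x00080008 adds a bias of 8 to each 16-bit lane before the arithmetic shift by 4, i.e. a rounded divide-by-16 per lane that adjusts for the scale factor. Something like:

#include <stdint.h>
#include <stdio.h>

/*Illustrative scalar version of the packed rounding above: add a bias of 8,
   then shift right by 4 (a rounded divide by 16).
  Assumes arithmetic right shift of negative values, as the _shr2 intrinsic
   guarantees on the C64x.*/
static int16_t round_shift4(int16_t _v){
  return (int16_t)((_v+8)>>4);
}

int main(void){
  printf("%d %d\n",round_shift4(100),round_shift4(-100));
  return 0;
}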
+
+/*196 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         x0;
+  int         x1;
+  int         x2;
+  int         x3;
+  int         x4;
+  int         x5;
+  int         x6;
+  int         x7;
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         i;
+  /*Transform rows of x into columns of w.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD8(_x+i*8);
+    _amem8(_x+i*8)=0LL;
+    _amem8(_x+i*8+4)=0LL;
+    _amem8(_x+i*8+8)=0LL;
+    _amem8(_x+i*8+12)=0LL;
+    OC_IDCT8x2();
+    OC_IDCT8x2_STORET(w+i);
+  }
+  /*Transform rows of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD8(w+i*8);
+    OC_IDCT8x2();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+
+/*106 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         x0;
+  int         x1;
+  int         x2;
+  int         x3;
+  int         i;
+  /*Transform rows of x into columns of w.*/
+  OC_IDCT8x2_LOAD4(_x);
+  OC_IDCT8x2_4();
+  OC_IDCT8x2_STORET(w);
+  OC_IDCT8x2_LOAD2(_x+16);
+  _amem8(_x)=0LL;
+  _amem8(_x+8)=0LL;
+  _amem4(_x+16)=0;
+  _amem4(_x+24)=0;
+  OC_IDCT8x2_2();
+  OC_IDCT8x2_STORET(w+2);
+  /*Transform rows of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD4(w+i*8);
+    OC_IDCT8x2_4();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+
+#if 0
+/*This used to compile to something faster (88 cycles), but no longer, and I'm
+   not sure what changed to cause this.
+  In any case, it's barely an advantage over the 10-coefficient version, and is
+   now hardly worth the icache space.*/
+/*95 cycles.*/
+static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         x0;
+  int         x1;
+  int         i;
+  /*Transform rows of x into rows of w.*/
+  for(i=0;i<2;i+=2){
+    OC_IDCT8x2_LOAD2(_x+i*8);
+    OC_IDCT8x2_2();
+    OC_IDCT8x2_STORE(w+i*8);
+  }
+  _amem4(_x)=0;
+  _amem4(_x+8)=0;
+  /*Transform columns of w into columns of y.*/
+  for(i=0;i<8;i+=2){
+    OC_IDCT8x2_LOAD2T(w+i);
+    OC_IDCT8x2_2();
+    OC_IDCT8x2_ROUND_STORET(_y+i);
+  }
+}
+#endif
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to orthonormal
+   version of the transform.*/
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
+  else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
+  else oc_idct8x8_slow_c64x(_y,_x);
+}
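For readers unfamiliar with the convention, a reference-only floating-point sketch of the same operation (not part of the patch, and not the fixed-point arithmetic used above): a separable, orthonormal 8x8 inverse DCT-II whose final divide by 4 undoes the input scaling the comment mentions.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
# define M_PI (3.14159265358979323846)
#endif

/*Reference sketch only: orthonormal separable 8x8 inverse DCT-II in doubles.
  The input is assumed to be scaled by 4, matching the convention above, so
   the result is divided by 4 at the end.*/
static void idct8x8_ref(double _y[64],const double _x[64]){
  double w[64];
  int    i;
  int    j;
  int    k;
  /*Inverse transform the rows.*/
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    double s;
    s=0;
    for(k=0;k<8;k++){
      s+=(k==0?sqrt(0.125):0.5)*_x[i*8+k]*cos((2*j+1)*k*M_PI/16);
    }
    w[i*8+j]=s;
  }
  /*Inverse transform the columns and undo the scale factor of 4.*/
  for(j=0;j<8;j++)for(i=0;i<8;i++){
    double s;
    s=0;
    for(k=0;k<8;k++){
      s+=(k==0?sqrt(0.125):0.5)*w[k*8+j]*cos((2*i+1)*k*M_PI/16);
    }
    _y[i*8+j]=s*0.25;
  }
}

int main(void){
  double x[64]={0};
  double y[64];
  /*DC coefficient of 8 in the orthonormal transform (a flat block of ones),
     scaled by 4 per the convention above.*/
  x[0]=32;
  idct8x8_ref(y,x);
  printf("%f %f\n",y[0],y[63]);  /*Both should be close to 1.*/
  return 0;
}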

+ 67 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xint.h

@@ -0,0 +1,67 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_c64x_c64xint_H)
+# define _c64x_c64xint_H (1)
+# include "../internal.h"
+
+# if defined(OC_C64X_ASM)
+#  define oc_state_accel_init oc_state_accel_init_c64x
+#  define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_c64x(_dst,_src,_ystride)
+#  define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_c64x(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#  define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
+  oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
+#  define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
+#  define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
+#  define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_c64x(_y,_x,_last_zzi)
+#  define oc_state_frag_recon oc_state_frag_recon_c64x
+#  define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_c64x(_bv,_flimit)
+#  define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
+#  define oc_restore_fpu(_state) do{}while(0)
+# endif
+
+# include "../state.h"
+
+void oc_state_accel_init_c64x(oc_theora_state *_state);
+
+void oc_frag_copy_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_c64x(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit);
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif

+ 39 - 0
modules/theoraplayer/native/theora/lib/c64x/c64xstate.c

@@ -0,0 +1,39 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#include "c64xint.h"
+
+#if defined(OC_C64X_ASM)
+
+void oc_state_accel_init_c64x(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
+# if defined(OC_STATE_USE_VTABLE)
+  _state->opt_vtable.frag_copy=oc_frag_copy_c64x;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c64x;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
+  _state->opt_vtable.idct8x8=oc_idct8x8_c64x;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
+  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c64x;
+  _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c64x;
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   oc_state_loop_filter_frag_rows_c64x;
+  _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+# endif
+}
+
+#endif

+ 974 - 0
modules/theoraplayer/native/theora/lib/collect.c

@@ -0,0 +1,974 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include <string.h>
+#include "collect.h"
+
+#if defined(OC_COLLECT_METRICS)
+
+int              OC_HAS_MODE_METRICS;
+double           OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+double           OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+const char      *OC_MODE_METRICS_FILENAME="modedec.stats";
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d){
+  if(_metrics->w>0){
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double w;
+    double wa;
+    double rwa;
+    double rwa2;
+    double rwb;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_metrics->w;
+    ds=_s-_metrics->s/wa;
+    dq=_q-_metrics->q/wa;
+    dr=_r-_metrics->r/wa;
+    dd=_d-_metrics->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics->s2;
+    sq=_metrics->sq;
+    q2=_metrics->q2;
+    sr=_metrics->sr;
+    qr=_metrics->qr;
+    sd=_metrics->sd;
+    qd=_metrics->qd;
+    s2q=_metrics->s2q;
+    sq2=_metrics->sq2;
+    w=wa+_w;
+    rwa=wa/w;
+    rwb=_w/w;
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=_w*rwa2*rwa2+wa*rwb2*rwb2;
+    _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*rwb
+     +(ds2*q2+4*ds*dq*sq+dq2*s2)*rwb2+ds2*dq2*rw4;
+    _metrics->s2q+=(-2*ds*sq-dq*s2)*rwb+ds2*dq*rw3;
+    _metrics->sq2+=(-ds*q2-2*dq*sq)*rwb+ds*dq2*rw3;
+    _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*rwb+ds*dq*dr*rw3;
+    _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*rwb+ds*dq*dd*rw3;
+    _metrics->s2+=ds2*rw2;
+    _metrics->sq+=ds*dq*rw2;
+    _metrics->q2+=dq2*rw2;
+    _metrics->sr+=ds*dr*rw2;
+    _metrics->qr+=dq*dr*rw2;
+    _metrics->r2+=dr*dr*rw2;
+    _metrics->sd+=ds*dd*rw2;
+    _metrics->qd+=dq*dd*rw2;
+    _metrics->d2+=dd*dd*rw2;
+  }
+  _metrics->w+=_w;
+  _metrics->s+=_s*_w;
+  _metrics->q+=_q*_w;
+  _metrics->r+=_r*_w;
+  _metrics->d+=_d*_w;
+}
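For intuition, the incremental update above in its simplest form (illustrative only; toy_metrics and its fields are made-up names): a weighted mean and a central second moment maintained without storing samples, i.e. the weighted analogue of Welford's algorithm. oc_mode_metrics_add() extends the same idea to the third- and fourth-order (co-)moments of s, q, r and d.

#include <stdio.h>

/*Illustrative only: weighted incremental mean and central second moment.*/
typedef struct{
  double w;  /*total weight*/
  double s;  /*weighted sum of s*/
  double s2; /*central second moment of s*/
}toy_metrics;

static void toy_metrics_add(toy_metrics *_m,double _w,double _s){
  if(_m->w>0){
    double ds;
    ds=_s-_m->s/_m->w;
    _m->s2+=ds*ds*(_m->w*_w/(_m->w+_w));
  }
  _m->w+=_w;
  _m->s+=_s*_w;
}

int main(void){
  toy_metrics m={0,0,0};
  toy_metrics_add(&m,1,2);
  toy_metrics_add(&m,1,4);
  toy_metrics_add(&m,2,6);
  printf("mean=%g var=%g\n",m.s/m.w,m.s2/m.w);  /*mean=4.5 var=2.75*/
  return 0;
}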
+
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n){
+  int i;
+  /*Find a non-empty set of metrics.*/
+  for(i=0;i<_n&&_src[i].w==0;i++);
+  if(i>=_n){
+    memset(_dst,0,sizeof(*_dst));
+    return;
+  }
+  memcpy(_dst,_src+i,sizeof(*_dst));
+  /*And iterate over the remaining non-empty sets of metrics.*/
+  for(i++;i<_n;i++)if(_src[i].w!=0){
+    double ds;
+    double dq;
+    double dr;
+    double dd;
+    double ds2;
+    double dq2;
+    double s2a;
+    double s2b;
+    double sqa;
+    double sqb;
+    double q2a;
+    double q2b;
+    double sra;
+    double srb;
+    double qra;
+    double qrb;
+    double sda;
+    double sdb;
+    double qda;
+    double qdb;
+    double s2qa;
+    double s2qb;
+    double sq2a;
+    double sq2b;
+    double w;
+    double wa;
+    double wb;
+    double rwa;
+    double rwb;
+    double rwa2;
+    double rwb2;
+    double rw2;
+    double rw3;
+    double rw4;
+    wa=_dst->w;
+    wb=_src[i].w;
+    ds=_src[i].s/wb-_dst->s/wa;
+    dq=_src[i].q/wb-_dst->q/wa;
+    dr=_src[i].r/wb-_dst->r/wa;
+    dd=_src[i].d/wb-_dst->d/wa;
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2a=_dst->s2;
+    sqa=_dst->sq;
+    q2a=_dst->q2;
+    sra=_dst->sr;
+    qra=_dst->qr;
+    sda=_dst->sd;
+    qda=_dst->qd;
+    s2qa=_dst->s2q;
+    sq2a=_dst->sq2;
+    s2b=_src[i].s2;
+    sqb=_src[i].sq;
+    q2b=_src[i].q2;
+    srb=_src[i].sr;
+    qrb=_src[i].qr;
+    sdb=_src[i].sd;
+    qdb=_src[i].qd;
+    s2qb=_src[i].s2q;
+    sq2b=_src[i].sq2;
+    w=wa+wb;
+    if(w==0)rwa=rwb=0;
+    else{
+      rwa=wa/w;
+      rwb=wb/w;
+    }
+    rwa2=rwa*rwa;
+    rwb2=rwb*rwb;
+    rw2=wa*rwb;
+    rw3=rw2*(rwa2-rwb2);
+    rw4=wb*rwa2*rwa2+wa*rwb2*rwb2;
+    /*
+    (1,1,1) ->
+     (0,0,0)#
+     (1,0,0) C(1,1)*C(1,0)*C(1,0)->  d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1})
+     (0,1,0) C(1,0)*C(1,1)*C(1,0)->  d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1})
+     (0,0,1) C(1,0)*C(1,0)*C(1,1)->  d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0})
+     (1,1,0)*
+     (1,0,1)*
+     (0,1,1)*
+     (1,1,1) C(1,1)*C(1,1)*C(1,1)->  d^{1,1,1}*(rwa^3*wb-rwb^3*wa)
+    (2,1) ->
+     (0,0)#
+     (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1})
+     (0,1) C(2,0)*C(1,1)->  d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0})
+     (2,0)*
+     (1,1)*
+     (2,1) C(2,2)*C(1,1)->  d^{2,1}*(rwa^3*wb-rwb^3*wa)
+    (2,2) ->
+     (0,0)#
+     (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2})
+     (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1})
+     (2,0) C(2,2)*C(2,0)->  d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2})
+     (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1})
+     (0,2) C(2,0)*C(2,2)->  d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0})
+     (1,2)*
+     (2,1)*
+     (2,2) C(2,2)*C(2,2)->  d^{2,2}*(rwa^4*wb+rwb^4*wa)
+    */
+    _dst->s2q2+=_src[i].s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa))
+     +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa)
+     +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4;
+    _dst->s2q+=_src[i].s2q+2*ds*(rwa*sqb-rwb*sqa)
+     +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3;
+    _dst->sq2+=_src[i].sq2+ds*(rwa*q2b-rwb*q2a)
+     +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3;
+    _dst->sqr+=_src[i].sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra)
+     +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3;
+    _dst->sqd+=_src[i].sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda)
+     +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3;
+    _dst->s2+=_src[i].s2+ds2*rw2;
+    _dst->sq+=_src[i].sq+ds*dq*rw2;
+    _dst->q2+=_src[i].q2+dq2*rw2;
+    _dst->sr+=_src[i].sr+ds*dr*rw2;
+    _dst->qr+=_src[i].qr+dq*dr*rw2;
+    _dst->r2+=_src[i].r2+dr*dr*rw2;
+    _dst->sd+=_src[i].sd+ds*dd*rw2;
+    _dst->qd+=_src[i].qd+dq*dd*rw2;
+    _dst->d2+=_src[i].d2+dd*dd*rw2;
+    _dst->w+=_src[i].w;
+    _dst->s+=_src[i].s;
+    _dst->q+=_src[i].q;
+    _dst->r+=_src[i].r;
+    _dst->d+=_src[i].d;
+  }
+}
+
+/*Adjust a single corner of a set of metric bins to minimize the squared
+   prediction error of R and D.
+  Each bin is assumed to cover a quad like so:
+    (s0,q0)    (s1,q0)
+       A----------B
+       |          |
+       |          |
+       |          |
+       |          |
+       C----------Z
+    (s0,q1)    (s1,q1)
+  The values A, B, and C are fixed, and Z is the free parameter.
+  Then, for example, R_i is predicted via bilinear interpolation as
+    x_i=(s_i-s0)/(s1-s0)
+    y_i=(q_i-q0)/(q1-q0)
+    dRds1_i=A+(B-A)*x_i
+    dRds2_i=C+(Z-C)*x_i
+    R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i
+  To find the Z that minimizes the squared prediction error over i, this can
+   be rewritten as
+    R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z
+  Letting X={...,x_i*y_i,...}^T and
+   Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T,
+   the optimal Z is given by Z=(X^T.Y)/(X^T.X).
+  Now, we need to compute these dot products without actually storing data for
+   each sample.
+  Starting with X^T.X, we have
+   X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2).
+  Expanding the interior of the sum in a monomial basis of s_i and q_i gives
+    s0^2*q0^2  *(1)
+     -2*s0*q0^2*(s_i)
+     -2*s0^2*q0*(q_i)
+     +q0^2     *(s_i^2)
+     +4*s0*q0  *(s_i*q_i)
+     +s0^2     *(q_i^2)
+     -2*q0     *(s_i^2*q_i)
+     -2*s0     *(s_i*q_i^2)
+     +1        *(s_i^2*q_i^2).
+  However, computing things directly in this basis leads to gross numerical
+   errors, as most of the terms will have similar size and destructive
+   cancellation results.
+  A much better basis is the central (co-)moment basis:
+    {1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2,
+     (s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2},
+   where sbar and qbar are the average s and q values over the bin,
+   respectively.
+  In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is
+    ds^2*dq^2*(1)
+     +dq^2   *((s_i-sbar)^2)
+     +4*ds*dq*((s_i-sbar)*(q_i-qbar))
+     +ds^2   *((q_i-qbar)^2)
+     +2*dq   *((s_i-sbar)^2*(q_i-qbar))
+     +2*ds   *((s_i-sbar)*(q_i-qbar)^2)
+     +1      *((s_i-sbar)^2*(q_i-qbar)^2).
+  With these expressions in the central (co-)moment bases, all we need to do
+   is compute sums over the (co-)moment terms, which can be done
+   incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()),
+   with no need to store the individual samples.
+  Now, for X^T.Y, we have
+    X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0)
+     -((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/
+     ((s1-s0)*(q1-q0)),
+   or, rewriting the constants to simplify notation,
+    X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0)
+     +C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)).
+  Again, converting to the central (co-)moment basis, the interior of the
+   above sum is
+    ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq)  *(1)
+     +(C1*dq+C3*dq^2)                     *((s_i-sbar)^2)
+     +(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar))
+     +(C2*ds+C3*ds^2)                     *((q_i-qbar)^2)
+     +dq                                  *((s_i-sbar)*(r_i-rbar))
+     +ds                                  *((q_i-qbar)*(r_i-rbar))
+     +(C1+2*C3*dq)                        *((s_i-sbar)^2*(q_i-qbar))
+     +(C2+2*C3*ds)                        *((s_i-sbar)*(q_i-qbar)^2)
+     +1                                   *((s_i-sbar)*(q_i-qbar)*(r_i-rbar))
+     +C3                                  *((s_i-sbar)^2*(q_i-qbar)^2).
+  You might think it would be easier (if perhaps slightly less robust) to
+   accumulate terms directly around s0 and q0.
+  However, we update each corner of the bins in turn, so we would have to
+   change basis to move the sums from corner to corner anyway.*/
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n){
+  double xx;
+  double rxy;
+  double dxy;
+  double wt;
+  int i;
+  xx=rxy=dxy=wt=0;
+  for(i=0;i<_n;i++)if(_metrics[i].w>0){
+    double s10;
+    double q10;
+    double sq10;
+    double ds;
+    double dq;
+    double ds2;
+    double dq2;
+    double r;
+    double d;
+    double s2;
+    double sq;
+    double q2;
+    double sr;
+    double qr;
+    double sd;
+    double qd;
+    double s2q;
+    double sq2;
+    double sqr;
+    double sqd;
+    double s2q2;
+    double c0;
+    double c1;
+    double c2;
+    double c3;
+    double w;
+    w=_metrics[i].w;
+    wt+=w;
+    s10=_s1[i]-_s0[i];
+    q10=_q1[i]-_q0[i];
+    sq10=s10*q10;
+    ds=_metrics[i].s/w-_s0[i];
+    dq=_metrics[i].q/w-_q0[i];
+    ds2=ds*ds;
+    dq2=dq*dq;
+    s2=_metrics[i].s2;
+    sq=_metrics[i].sq;
+    q2=_metrics[i].q2;
+    s2q=_metrics[i].s2q;
+    sq2=_metrics[i].sq2;
+    s2q2=_metrics[i].s2q2;
+    xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10);
+    r=_metrics[i].r/w;
+    sr=_metrics[i].sr;
+    qr=_metrics[i].qr;
+    sqr=_metrics[i].sqr;
+    c0=-_ra[i];
+    c1=-(_rb[i]-_ra[i])/s10;
+    c2=-(_rc[i]-_ra[i])/q10;
+    c3=-(_ra[i]-_rb[i]-_rc[i])/sq10;
+    rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10;
+    d=_metrics[i].d/w;
+    sd=_metrics[i].sd;
+    qd=_metrics[i].qd;
+    sqd=_metrics[i].sqd;
+    c0=-_da[i];
+    c1=-(_db[i]-_da[i])/s10;
+    c2=-(_dc[i]-_da[i])/q10;
+    c3=-(_da[i]-_db[i]-_dc[i])/sq10;
+    dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
+     +(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd
+     +(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10;
+  }
+  if(xx>1E-3){
+    *_r=rxy/xx;
+    *_d=dxy/xx;
+  }
+  else{
+    *_r=0;
+    *_d=0;
+  }
+  return wt;
+}
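To make the least-squares objective above concrete, a brute-force version of the one-corner fit that stores its samples explicitly (illustrative only; fit_corner and its arguments are made up): it forms the ratio Z=(X^T.Y)/(X^T.X) directly, which is what oc_mode_metrics_solve() reconstructs from the accumulated (co-)moments.

#include <stddef.h>
#include <stdio.h>

/*Illustrative only: fit the free corner Z of one bin from explicit samples
   (x_i,y_i,R_i) with fixed corners A, B, C, minimizing the squared bilinear
   prediction error.*/
static double fit_corner(const double *_x,const double *_y,const double *_r,
 size_t _n,double _a,double _b,double _c){
  double xtx;
  double xty;
  size_t i;
  xtx=xty=0;
  for(i=0;i<_n;i++){
    double xy;
    double pred0;
    xy=_x[i]*_y[i];
    /*Bilinear prediction with Z's contribution (Z*x_i*y_i) left out.*/
    pred0=_a+(_b-_a)*_x[i]+(_c-_a)*_y[i]+(_a-_b-_c)*xy;
    xtx+=xy*xy;
    xty+=xy*(_r[i]-pred0);
  }
  return xtx>1E-3?xty/xtx:0;
}

int main(void){
  static const double x[3]={0.25,0.5,1.0};
  static const double y[3]={0.5,1.0,1.0};
  static const double r[3]={1.0,2.0,3.0};
  printf("Z ~ %g\n",fit_corner(x,y,r,3,0,1,1));
  return 0;
}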
+
+/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
+   useful for mode decision.*/
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
+  int niters;
+  int prevdr;
+  int prevdd;
+  int dr;
+  int dd;
+  int pli;
+  int qti;
+  int qi;
+  int si;
+  dd=dr=INT_MAX;
+  niters=0;
+  /*The encoder interpolates rate and RMSE terms bilinearly from an
+     OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
+    To find the sample values at the grid points that minimize the total
+     squared prediction error actually requires solving a relatively sparse
+     linear system with a number of variables equal to the number of grid
+     points.
+    Instead of writing a general sparse linear system solver, we just use
+     Gauss-Seidel iteration, i.e., we update one grid point at a time until
+     they stop changing.*/
+  do{
+    prevdr=dr;
+    prevdd=dd;
+    dd=dr=0;
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            oc_mode_metrics m[4];
+            int             s0[4];
+            int             s1[4];
+            int             q0[4];
+            int             q1[4];
+            double          ra[4];
+            double          rb[4];
+            double          rc[4];
+            double          da[4];
+            double          db[4];
+            double          dc[4];
+            double          r;
+            double          d;
+            int             rate;
+            int             rmse;
+            int             ds;
+            int             n;
+            n=0;
+            /*Collect the statistics for the (up to) four bins that grid
+               point (si,qi) touches.*/
+            if(qi>0&&si>0){
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
+            }
+            if(qi>0){
+              ds=si+1<OC_COMP_BINS?1:-1;
+              q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi-1][pli][qti]+si);
+            }
+            if(qi+1<OC_LOGQ_BINS&&si>0){
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si-1<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si-1);
+            }
+            if(qi+1<OC_LOGQ_BINS){
+              ds=si+1<OC_COMP_BINS?1:-1;
+              q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
+              q1[n]=OC_MODE_LOGQ[qi][pli][qti];
+              s0[n]=si+ds<<_shift;
+              s1[n]=si<<_shift;
+              ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              da[n]=
+               ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
+              db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
+              rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
+              dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
+              *(m+n++)=*(_metrics[qi][pli][qti]+si);
+            }
+            /*On the first pass, initialize with a simple weighted average of
+               the neighboring bins.*/
+            if(!OC_HAS_MODE_METRICS&&niters==0){
+              double w;
+              w=r=d=0;
+              while(n-->0){
+                w+=m[n].w;
+                r+=m[n].r;
+                d+=m[n].d;
+              }
+              r=w>1E-3?r/w:0;
+              d=w>1E-3?d/w:0;
+              _weight[qi][pli][qti][si]=w;
+            }
+            else{
+              /*Update the grid point and save the weight for later.*/
+              _weight[qi][pli][qti][si]=
+               oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
+            }
+            rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
+            rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
+            dr+=abs(rate-_table[qi][pli][qti][si].rate);
+            dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
+          }
+        }
+      }
+    }
+  }
+  /*After a fixed number of initial iterations, only iterate so long as the
+     total change is decreasing.
+    This ensures we don't oscillate forever, which is a danger, as all of our
+     results are rounded fairly coarsely.*/
+  while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd)));
+  if(_reweight){
+    /*Now, reduce the values of the optimal solution until we get enough
+       samples in each bin to overcome the constant OC_ZWEIGHT factor.
+      This encourages sampling under-populated bins and prevents a single large
+       sample early on from discouraging coding in that bin ever again.*/
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        for(qi=0;qi<OC_LOGQ_BINS;qi++){
+          for(si=0;si<OC_COMP_BINS;si++){
+            double wt;
+            wt=_weight[qi][pli][qti][si];
+            wt/=OC_ZWEIGHT+wt;
+            _table[qi][pli][qti][si].rate=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rate*wt+0.5);
+            _table[qi][pli][qti][si].rmse=(ogg_int16_t)
+             (_table[qi][pli][qti][si].rmse*wt+0.5);
+          }
+        }
+      }
+    }
+  }
+}
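A generic sketch of the Gauss-Seidel strategy described above (illustrative only; the 3x3 system is arbitrary and gauss_seidel is a made-up helper): unknowns are updated one at a time, always reusing the freshest values, until the change becomes negligible. The loop above applies the same idea to the grid-point fit, with the slightly more involved stopping rule of continuing only while the total change keeps decreasing.

#include <math.h>
#include <stdio.h>

/*Illustrative only: plain Gauss-Seidel on a tiny dense system A*x=b.*/
static void gauss_seidel(const double _a[3][3],const double _b[3],
 double _x[3],int _maxiter){
  int iter;
  for(iter=0;iter<_maxiter;iter++){
    double delta;
    int    i;
    delta=0;
    for(i=0;i<3;i++){
      double s;
      double old;
      int    j;
      s=_b[i];
      old=_x[i];
      for(j=0;j<3;j++)if(j!=i)s-=_a[i][j]*_x[j];
      _x[i]=s/_a[i][i];
      delta+=fabs(_x[i]-old);
    }
    if(delta<1E-9)break;
  }
}

int main(void){
  /*Diagonally dominant system, so Gauss-Seidel converges (to 1 1 1).*/
  double a[3][3]={{4,1,0},{1,4,1},{0,1,4}};
  double b[3]={5,6,5};
  double x[3]={0,0,0};
  gauss_seidel(a,b,x,100);
  printf("%g %g %g\n",x[0],x[1],x[2]);
  return 0;
}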
+
+/*Dump the in-memory mode metrics to a file.
+  Note this data format isn't portable between different platforms.*/
+void oc_mode_metrics_dump(void){
+  FILE *fmetrics;
+  fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb");
+  if(fmetrics!=NULL){
+    (void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
+    (void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
+    (void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
+    fclose(fmetrics);
+  }
+}
+
+void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
+#if !defined(OC_COLLECT_METRICS)
+ const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
+#else
+ oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
+#endif
+  int qii;
+  fprintf(_fout,
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){
+    int pli;
+    fprintf(_fout,"  {\n");
+    for(pli=0;pli<3;pli++){
+      int qti;
+      fprintf(_fout,"    {\n");
+      for(qti=0;qti<2;qti++){
+        int bin;
+        int qi;
+        static const char *pl_names[3]={"Y'","Cb","Cr"};
+        static const char *qti_names[2]={"INTRA","INTER"};
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        fprintf(_fout,"      /*%s  qi=%i  %s*/\n",
+         pl_names[pli],qi,qti_names[qti]);
+        fprintf(_fout,"      {\n");
+        fprintf(_fout,"        ");
+        for(bin=0;bin<OC_COMP_BINS;bin++){
+          if(bin&&!(bin&0x3))fprintf(_fout,"\n        ");
+          fprintf(_fout,"{%5i,%5i}",
+           _mode_rd_table[qii][pli][qti][bin].rate,
+           _mode_rd_table[qii][pli][qti][bin].rmse);
+          if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
+        }
+        fprintf(_fout,"\n      }");
+        if(qti<1)fprintf(_fout,",");
+        fprintf(_fout,"\n");
+      }
+      fprintf(_fout,"    }");
+      if(pli<2)fprintf(_fout,",");
+      fprintf(_fout,"\n");
+    }
+    fprintf(_fout,"  }");
+    if(qii+1<OC_LOGQ_BINS)fprintf(_fout,",");
+    fprintf(_fout,"\n");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+}
+
+void oc_mode_metrics_print(FILE *_fout){
+  int qii;
+  fprintf(_fout,
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "# include \"encint.h\"\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
+   "   (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
+   "  The actual statistics used by the encoder will be interpolated from\n"
+   "   that table based on log_plq for the actual quantization matrix used.*/\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
+  for(qii=0;qii<OC_LOGQ_BINS;qii++){
+    fprintf(_fout,"  { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
+     OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
+     OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
+     qii+1<OC_LOGQ_BINS?",":"");
+  }
+  fprintf(_fout,
+   "};\n"
+   "\n");
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
+  oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
+  fprintf(_fout,
+   "#endif\n");
+}
+
+
+# if !defined(OC_COLLECT_NO_ENC_FUNCS)
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
+  oc_restore_fpu(&_enc->state);
+  /*Load any existing mode metrics if we haven't already.*/
+  if(!OC_HAS_MODE_METRICS){
+    FILE *fmetrics;
+    memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
+    memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
+    fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
+    if(fmetrics!=NULL){
+      /*Read in the binary structures as written by oc_mode_metrics_dump().
+        Note this format isn't portable between different platforms.*/
+      (void)fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
+      (void)fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
+      fclose(fmetrics);
+    }
+    else{
+      int qii;
+      int qi;
+      int pli;
+      int qti;
+      for(qii=0;qii<OC_LOGQ_BINS;qii++){
+        qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
+        for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+          OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti];
+        }
+      }
+    }
+    oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
+     OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+    oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
+     OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+    OC_HAS_MODE_METRICS=1;
+  }
+}
+
+/*The following token skipping code used to also be used in the decoder (and
+   even at one point other places in the encoder).
+  However, it was obsoleted by other optimizations, and is now only used here.
+  It has been moved here to avoid generating the code when it's not needed.*/
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value gives the number of coefficients to be skipped in
+           the current block.
+          Otherwise, the negative of the return value gives the number of
+           blocks to be ended.*/
+typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
+
+/*Handles the simple end of block tokens.*/
+static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
+  int nblocks_adjust;
+  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
+  return -_extra_bits-nblocks_adjust;
+}
+
+/*The last EOB token has a special case, where an EOB run of size zero ends all
+   the remaining blocks in the frame.*/
+static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
+  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
+     yet available everywhere; this should be equivalent.*/
+  if(!_extra_bits)return -(~(size_t)0>>1);
+  return -_extra_bits;
+}
+
+/*Handles the pure zero run tokens.*/
+static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
+  return _extra_bits+1;
+}
+
+/*Handles a normal coefficient value token.*/
+static ptrdiff_t oc_token_skip_val(void){
+  return 1;
+}
+
+/*Handles a category 1A zero run/coefficient value combo token.*/
+static ptrdiff_t oc_token_skip_run_cat1a(int _token){
+  return _token-OC_DCT_RUN_CAT1A+2;
+}
+
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
+static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
+  int run_cati;
+  int ncoeffs_mask;
+  int ncoeffs_adjust;
+  run_cati=_token-OC_DCT_RUN_CAT1B;
+  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
+  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
+  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
+}
+
+/*A jump table for computing the number of coefficients or blocks to skip for
+   a given token value.
+  This reduces all the conditional branches, etc., needed to parse these token
+   values down to one indirect jump.*/
+static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob6,
+  oc_token_skip_zrl,
+  oc_token_skip_zrl,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run
+};
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value gives the number of coefficients to be skipped in
+           the current block.
+          Otherwise, the negative of the return value gives the number of
+           blocks to be ended.
+          0 will never be returned, so that at least one coefficient in one
+           block will always be decoded for every token.*/
+static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
+  return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
+}
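The dispatch pattern above in miniature (illustrative only; every name here is made up): a table of function pointers indexed by the token value replaces a chain of conditionals with a single indirect call.

#include <stdio.h>

/*Illustrative only: the same jump-table pattern reduced to two token kinds.*/
typedef int (*toy_skip_func)(int _extra);

static int toy_skip_one(int _extra){
  (void)_extra;
  return 1;
}

static int toy_skip_run(int _extra){
  return _extra+1;
}

static const toy_skip_func TOY_SKIP_TABLE[2]={
  toy_skip_one,
  toy_skip_run
};

int main(void){
  /*Prints "1 4": token 0 skips one coefficient, token 1 skips _extra+1.*/
  printf("%d %d\n",(*TOY_SKIP_TABLE[0])(0),(*TOY_SKIP_TABLE[1])(3));
  return 0;
}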
+
+
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
+  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+     0,16,16,16,16,16,32,32,
+    32,32,32,32,32,32,32,48,
+    48,48,48,48,48,48,48,48,
+    48,48,48,48,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64
+  };
+  const oc_fragment *frags;
+  const unsigned    *frag_sad;
+  const unsigned    *frag_satd;
+  const unsigned    *frag_ssd;
+  const ptrdiff_t   *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  double             fragw;
+  int                modelines[3][3][2];
+  int                qti;
+  int                qii;
+  int                qi;
+  int                pli;
+  int                zzi;
+  int                token;
+  int                eb;
+  oc_restore_fpu(&_enc->state);
+  /*Figure out which metric bins to use for this frame's quantizers.*/
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        int log_plq;
+        int modeline;
+        log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti];
+        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
+         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
+        modelines[qii][pli][qti]=modeline;
+      }
+    }
+  }
+  qti=_enc->state.frame_type;
+  frags=_enc->state.frags;
+  frag_sad=_enc->frag_sad;
+  frag_satd=_enc->frag_satd;
+  frag_ssd=_enc->frag_ssd;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=fragii=0;
+  /*Weight the fragments by the inverse frame size; this prevents HD content
+     from dominating the statistics.*/
+  fragw=1.0/_enc->state.nfrags;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ti[64];
+    int       eob_token[64];
+    int       eob_run[64];
+    /*Set up token indices and eob run counts.
+      We don't bother trying to figure out the real cost of the runs that span
+       coefficients; instead we use the costs that were available when R-D
+       token optimization was done.*/
+    for(zzi=0;zzi<64;zzi++){
+      ti[zzi]=_enc->dct_token_offs[pli][zzi];
+      if(ti[zzi]>0){
+        token=_enc->dct_tokens[pli][zzi][0];
+        eb=_enc->extra_bits[pli][zzi][0];
+        eob_token[zzi]=token;
+        eob_run[zzi]=-oc_dct_token_skip(token,eb);
+      }
+      else{
+        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        eob_run[zzi]=0;
+      }
+    }
+    /*Scan the list of coded fragments for this plane.*/
+    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    for(;fragii<ncoded_fragis;fragii++){
+      ptrdiff_t fragi;
+      int       frag_bits;
+      int       huffi;
+      int       skip;
+      int       mb_mode;
+      unsigned  sad;
+      unsigned  satd;
+      double    sqrt_ssd;
+      int       bin;
+      int       qtj;
+      fragi=coded_fragis[fragii];
+      frag_bits=0;
+      for(zzi=0;zzi<64;){
+        if(eob_run[zzi]>0){
+          /*We've reached the end of the block.*/
+          eob_run[zzi]--;
+          break;
+        }
+        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
+         +OC_ZZI_HUFF_OFFSET[zzi];
+        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
+          /*This token caused an EOB run to be flushed.
+            Therefore it gets the bits associated with it.*/
+          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
+          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        }
+        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
+        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
+        ti[zzi]++;
+        skip=oc_dct_token_skip(token,eb);
+        if(skip<0){
+          eob_token[zzi]=token;
+          eob_run[zzi]=-skip;
+        }
+        else{
+          /*A regular DCT value token; accumulate the bits for it.*/
+          frag_bits+=_enc->huff_codes[huffi][token].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[token];
+          zzi+=skip;
+        }
+      }
+      mb_mode=frags[fragi].mb_mode;
+      qii=frags[fragi].qii;
+      qi=_enc->state.qis[qii];
+      sad=frag_sad[fragi]<<(pli+1&2);
+      satd=frag_satd[fragi]<<(pli+1&2);
+      sqrt_ssd=sqrt(frag_ssd[fragi]);
+      qtj=mb_mode!=OC_MODE_INTRA;
+      /*Accumulate statistics.
+        The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
+         OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
+         yet still use old data.*/
+      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
+      oc_mode_metrics_add(
+       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
+       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
+    }
+  }
+  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
+  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
+   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
+  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
+   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
+}
+# endif
+
+#endif

+ 109 - 0
modules/theoraplayer/native/theora/lib/collect.h

@@ -0,0 +1,109 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_collect_H)
+# define _collect_H (1)
+# include "encint.h"
+# if defined(OC_COLLECT_METRICS)
+#  include <stdio.h>
+
+
+
+typedef struct oc_mode_metrics oc_mode_metrics;
+
+
+
+/**Sets the file name to load/store mode metrics from/to.
+ * The file name string is stored by reference, and so must be valid for the
+ *  lifetime of the encoder.
+ * Mode metric collection uses global tables; do not attempt to perform
+ *  multiple collections at once.
+ * \param[in] _buf <tt>char[]</tt> The file name.
+ * \retval TH_EIMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_METRICS_FILE (0x8000)
+
+
+
+/*Accumulates various weighted sums of the measurements.
+  w -> weight
+  s -> SATD
+  q -> log quantizer
+  r -> rate (in bits)
+  d -> RMSE
+  All of the single letters correspond to direct, weighted sums, e.g.,
+   w=sum(w_i), s=sum(s_i*w_i), etc.
+  The others correspond to central moments (or co-moments) of the given order,
+   e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i).
+  Because we need some moments up to fourth order, we use central moments to
+   minimize the dynamic range and prevent rounding error from dominating the
+   calculations.*/
+struct oc_mode_metrics{
+  double w;
+  double s;
+  double q;
+  double r;
+  double d;
+  double s2;
+  double sq;
+  double q2;
+  double sr;
+  double qr;
+  double r2;
+  double sd;
+  double qd;
+  double d2;
+  double s2q;
+  double sq2;
+  double sqr;
+  double sqd;
+  double s2q2;
+};
+
+
+# define OC_ZWEIGHT   (0.25)
+
+/*TODO: It may be helpful (for block-level quantizers especially) to separate
+   out the contributions from AC and DC into separate tables.*/
+
+extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
+extern oc_mode_rd  OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+extern oc_mode_rd  OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
+
+extern int              OC_HAS_MODE_METRICS;
+extern oc_mode_metrics  OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern oc_mode_metrics  OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
+extern const char      *OC_MODE_METRICS_FILENAME;
+
+void oc_mode_metrics_dump(void);
+void oc_mode_metrics_print(FILE *_fout);
+
+void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _s,int _q,int _r,double _d);
+void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n);
+double oc_mode_metrics_solve(double *_r,double *_d,
+ const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
+ const int *_q0,const int *_q1,
+ const double *_ra,const double *_rb,const double *_rc,
+ const double *_da,const double *_db,const double *_dc,int _n);
+void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
+ int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
+ int _shift,double (*_weight)[3][2][OC_COMP_BINS]);
+void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
+
+# endif
+#endif

+ 31 - 0
modules/theoraplayer/native/theora/lib/dct.h

@@ -0,0 +1,31 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*Definitions shared by the forward and inverse DCT transforms.*/
+#if !defined(_dct_H)
+# define _dct_H (1)
+
+/*cos(n*pi/16) (resp. sin(m*pi/16)) scaled by 65536.*/
+#define OC_C1S7 ((ogg_int32_t)64277)
+#define OC_C2S6 ((ogg_int32_t)60547)
+#define OC_C3S5 ((ogg_int32_t)54491)
+#define OC_C4S4 ((ogg_int32_t)46341)
+#define OC_C5S3 ((ogg_int32_t)36410)
+#define OC_C6S2 ((ogg_int32_t)25080)
+#define OC_C7S1 ((ogg_int32_t)12785)
+
+#endif
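A quick way to see where these values come from (an illustrative sketch assuming a C99 libm for lround): scale cos(n*pi/16) by 65536 and round to the nearest integer, which reproduces the seven constants above.

#include <math.h>
#include <stdio.h>

#ifndef M_PI
# define M_PI (3.14159265358979323846)
#endif

/*Prints 64277, 60547, 54491, 46341, 36410, 25080, 12785, matching OC_C1S7
   through OC_C7S1 above.*/
int main(void){
  int n;
  for(n=1;n<8;n++){
    printf("OC_C%iS%i = %ld\n",n,8-n,lround(cos(n*M_PI/16)*65536));
  }
  return 0;
}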

+ 193 - 0
modules/theoraplayer/native/theora/lib/decapiwrapper.c

@@ -0,0 +1,193 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decapiwrapper.c 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "decint.h"
+#include "theora/theoradec.h"
+
+static void th_dec_api_clear(th_api_wrapper *_api){
+  if(_api->setup)th_setup_free(_api->setup);
+  if(_api->decode)th_decode_free(_api->decode);
+  memset(_api,0,sizeof(*_api));
+}
+
+static void theora_decode_clear(theora_state *_td){
+  if(_td->i!=NULL)theora_info_clear(_td->i);
+  memset(_td,0,sizeof(*_td));
+}
+
+static int theora_decode_control(theora_state *_td,int _req,
+ void *_buf,size_t _buf_sz){
+  return th_decode_ctl(((th_api_wrapper *)_td->i->codec_setup)->decode,
+   _req,_buf,_buf_sz);
+}
+
+static ogg_int64_t theora_decode_granule_frame(theora_state *_td,
+ ogg_int64_t _gp){
+  return th_granule_frame(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+
+static double theora_decode_granule_time(theora_state *_td,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+
+static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_decode_clear,
+  (oc_state_control_func)theora_decode_control,
+  (oc_state_granule_frame_func)theora_decode_granule_frame,
+  (oc_state_granule_time_func)theora_decode_granule_time,
+};
+
+static void th_info2theora_info(theora_info *_ci,const th_info *_info){
+  _ci->version_major=_info->version_major;
+  _ci->version_minor=_info->version_minor;
+  _ci->version_subminor=_info->version_subminor;
+  _ci->width=_info->frame_width;
+  _ci->height=_info->frame_height;
+  _ci->frame_width=_info->pic_width;
+  _ci->frame_height=_info->pic_height;
+  _ci->offset_x=_info->pic_x;
+  _ci->offset_y=_info->pic_y;
+  _ci->fps_numerator=_info->fps_numerator;
+  _ci->fps_denominator=_info->fps_denominator;
+  _ci->aspect_numerator=_info->aspect_numerator;
+  _ci->aspect_denominator=_info->aspect_denominator;
+  switch(_info->colorspace){
+    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
+    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
+    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
+  }
+  switch(_info->pixel_fmt){
+    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
+    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
+    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
+    default:_ci->pixelformat=OC_PF_RSVD;
+  }
+  _ci->target_bitrate=_info->target_bitrate;
+  _ci->quality=_info->quality;
+  _ci->keyframe_frequency_force=1<<_info->keyframe_granule_shift;
+}
+
+int theora_decode_init(theora_state *_td,theora_info *_ci){
+  th_api_info    *apiinfo;
+  th_api_wrapper *api;
+  th_info         info;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    We put them both in one malloc'd block so that when the API wrapper is
+     freed, the info struct goes with it.
+    This avoids having to figure out whether or not we need to free the info
+     struct in either theora_info_clear() or theora_clear().*/
+  apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo));
+  if(apiinfo==NULL)return OC_FAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  *&apiinfo->info=*_ci;
+  /*Convert the info struct now instead of saving the one we decoded with
+     theora_decode_header(), since the user might have modified values (i.e.,
+     color space, aspect ratio, etc. can be specified from a higher level).
+    The user also might be doing something "clever" with the header packets if
+     they are not using an Ogg encapsulation.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*Don't bother to copy the setup info; th_decode_alloc() makes its own copy
+     of the stuff it needs.*/
+  apiinfo->api.decode=th_decode_alloc(&info,api->setup);
+  if(apiinfo->api.decode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_dec_api_clear;
+  _td->internal_encode=NULL;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _td->internal_decode=(void *)&OC_DEC_DISPATCH_VTBL;
+  _td->granulepos=0;
+  _td->i=&apiinfo->info;
+  _td->i->codec_setup=&apiinfo->api;
+  return 0;
+}
+
+int theora_decode_header(theora_info *_ci,theora_comment *_cc,ogg_packet *_op){
+  th_api_wrapper *api;
+  th_info         info;
+  int             ret;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  /*Allocate an API wrapper struct on demand, since it will not also include a
+     theora_info struct like the ones that are used in a theora_state struct.*/
+  if(api==NULL){
+    _ci->codec_setup=_ogg_calloc(1,sizeof(*api));
+    if(_ci->codec_setup==NULL)return OC_FAULT;
+    api=(th_api_wrapper *)_ci->codec_setup;
+    api->clear=(oc_setup_clear_func)th_dec_api_clear;
+  }
+  /*Convert from the theora_info struct instead of saving our own th_info
+     struct between calls.
+    The user might be doing something "clever" with the header packets if they
+     are not using an Ogg encapsulation, and we don't want to break this.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*We rely on the fact that theora_comment and th_comment structures are
+     actually identical.
+    Take care not to change this fact unless you change the code here as
+     well!*/
+  ret=th_decode_headerin(&info,(th_comment *)_cc,&api->setup,_op);
+  /*We also rely on the fact that the error return code values are the same,
+    and that the implementations of these two functions return the same set of
+    them.
+   Note that theora_decode_header() really can return OC_NOTFORMAT, even
+    though it is not currently documented to do so.*/
+  if(ret<0)return ret;
+  th_info2theora_info(_ci,&info);
+  return 0;
+}
+
+int theora_decode_packetin(theora_state *_td,ogg_packet *_op){
+  th_api_wrapper *api;
+  ogg_int64_t     gp;
+  int             ret;
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  ret=th_decode_packetin(api->decode,_op,&gp);
+  if(ret<0)return OC_BADPACKET;
+  _td->granulepos=gp;
+  return 0;
+}
+
+int theora_decode_YUVout(theora_state *_td,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_dec_ctx      *decode;
+  th_ycbcr_buffer  buf;
+  int              ret;
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  decode=(th_dec_ctx *)api->decode;
+  if(!decode)return OC_FAULT;
+  ret=th_decode_ycbcr_out(decode,buf);
+  if(ret>=0){
+    _yuv->y_width=buf[0].width;
+    _yuv->y_height=buf[0].height;
+    _yuv->y_stride=buf[0].stride;
+    _yuv->uv_width=buf[1].width;
+    _yuv->uv_height=buf[1].height;
+    _yuv->uv_stride=buf[1].stride;
+    _yuv->y=buf[0].data;
+    _yuv->u=buf[1].data;
+    _yuv->v=buf[2].data;
+  }
+  return ret;
+}
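
The wrapper above keeps the pre-1.0 theora_* decoder API working on top of the th_* calls. A minimal sketch of the legacy decode loop these entry points preserve; next_packet() is a hypothetical helper standing in for the caller's Ogg demuxing, and error handling is reduced to the bare minimum:

/* Sketch only: the legacy API path that decapiwrapper.c forwards to th_*. */
#include <theora/theora.h>

extern int next_packet(ogg_packet *op);  /* assumption: supplied by the demuxer */

int legacy_decode_stream(void){
  theora_info    ti;
  theora_comment tc;
  theora_state   td;
  ogg_packet     op;
  yuv_buffer     yuv;
  int            headers=0;
  theora_info_init(&ti);
  theora_comment_init(&tc);
  /*The three header packets all go through theora_decode_header().*/
  while(headers<3&&next_packet(&op)){
    if(theora_decode_header(&ti,&tc,&op)<0)return -1;
    headers++;
  }
  /*theora_decode_init() builds the real th_decode_ctx behind the scenes.*/
  if(theora_decode_init(&td,&ti)<0)return -1;
  /*Data packets: decode each one, then fetch the frame.*/
  while(next_packet(&op)){
    if(theora_decode_packetin(&td,&op)==0){
      theora_decode_YUVout(&td,&yuv);
      /* ...consume yuv.y/yuv.u/yuv.v here... */
    }
  }
  theora_clear(&td);
  theora_comment_clear(&tc);
  theora_info_clear(&ti);
  return 0;
}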

+ 250 - 0
modules/theoraplayer/native/theora/lib/decinfo.c

@@ -0,0 +1,250 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decinfo.c 17276 2010-06-05 05:57:05Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "decint.h"
+
+
+
+/*Unpacks a series of octets from a given byte array into the pack buffer.
+  No checking is done to ensure the buffer contains enough data.
+  _opb: The pack buffer to read the octets from.
+  _buf: The byte array to store the unpacked bytes in.
+  _len: The number of octets to unpack.*/
+static void oc_unpack_octets(oc_pack_buf *_opb,char *_buf,size_t _len){
+  while(_len-->0){
+    long val;
+    val=oc_pack_read(_opb,8);
+    *_buf++=(char)val;
+  }
+}
+
+/*Unpacks a 32-bit integer encoded by octets in little-endian form.*/
+static long oc_unpack_length(oc_pack_buf *_opb){
+  long ret[4];
+  int  i;
+  for(i=0;i<4;i++)ret[i]=oc_pack_read(_opb,8);
+  return ret[0]|ret[1]<<8|ret[2]<<16|ret[3]<<24;
+}
+
+static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
+  long val;
+  /*Check the codec bitstream version.*/
+  val=oc_pack_read(_opb,8);
+  _info->version_major=(unsigned char)val;
+  val=oc_pack_read(_opb,8);
+  _info->version_minor=(unsigned char)val;
+  val=oc_pack_read(_opb,8);
+  _info->version_subminor=(unsigned char)val;
+  /*Verify that we can parse this bitstream version.
+     We accept earlier minors and all subminors, per the spec.*/
+  if(_info->version_major>TH_VERSION_MAJOR||
+   _info->version_major==TH_VERSION_MAJOR&&
+   _info->version_minor>TH_VERSION_MINOR){
+    return TH_EVERSION;
+  }
+  /*Read the encoded frame description.*/
+  val=oc_pack_read(_opb,16);
+  _info->frame_width=(ogg_uint32_t)val<<4;
+  val=oc_pack_read(_opb,16);
+  _info->frame_height=(ogg_uint32_t)val<<4;
+  val=oc_pack_read(_opb,24);
+  _info->pic_width=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,24);
+  _info->pic_height=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->pic_x=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->pic_y=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,32);
+  _info->fps_numerator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,32);
+  _info->fps_denominator=(ogg_uint32_t)val;
+  if(_info->frame_width==0||_info->frame_height==0||
+   _info->pic_width+_info->pic_x>_info->frame_width||
+   _info->pic_height+_info->pic_y>_info->frame_height||
+   _info->fps_numerator==0||_info->fps_denominator==0){
+    return TH_EBADHEADER;
+  }
+  /*Note: The sense of pic_y is inverted in what we pass back to the
+     application compared to how it is stored in the bitstream.
+    This is because the bitstream uses a right-handed coordinate system, while
+     applications expect a left-handed one.*/
+  _info->pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
+  val=oc_pack_read(_opb,24);
+  _info->aspect_numerator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,24);
+  _info->aspect_denominator=(ogg_uint32_t)val;
+  val=oc_pack_read(_opb,8);
+  _info->colorspace=(th_colorspace)val;
+  val=oc_pack_read(_opb,24);
+  _info->target_bitrate=(int)val;
+  val=oc_pack_read(_opb,6);
+  _info->quality=(int)val;
+  val=oc_pack_read(_opb,5);
+  _info->keyframe_granule_shift=(int)val;
+  val=oc_pack_read(_opb,2);
+  _info->pixel_fmt=(th_pixel_fmt)val;
+  if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER;
+  val=oc_pack_read(_opb,3);
+  if(val!=0||oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+  return 0;
+}
+
+static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
+  long len;
+  int  i;
+  /*Read the vendor string.*/
+  len=oc_unpack_length(_opb);
+  if(len<0||len>oc_pack_bytes_left(_opb))return TH_EBADHEADER;
+  _tc->vendor=_ogg_malloc((size_t)len+1);
+  if(_tc->vendor==NULL)return TH_EFAULT;
+  oc_unpack_octets(_opb,_tc->vendor,len);
+  _tc->vendor[len]='\0';
+  /*Read the user comments.*/
+  _tc->comments=(int)oc_unpack_length(_opb);
+  len=_tc->comments;
+  if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){
+    _tc->comments=0;
+    return TH_EBADHEADER;
+  }
+  _tc->comment_lengths=(int *)_ogg_malloc(
+   _tc->comments*sizeof(_tc->comment_lengths[0]));
+  _tc->user_comments=(char **)_ogg_malloc(
+   _tc->comments*sizeof(_tc->user_comments[0]));
+  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
+    _tc->comments=0;
+    return TH_EFAULT;
+  }
+  for(i=0;i<_tc->comments;i++){
+    len=oc_unpack_length(_opb);
+    if(len<0||len>oc_pack_bytes_left(_opb)){
+      _tc->comments=i;
+      return TH_EBADHEADER;
+    }
+    _tc->comment_lengths[i]=len;
+    _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
+    if(_tc->user_comments[i]==NULL){
+      _tc->comments=i;
+      return TH_EFAULT;
+    }
+    oc_unpack_octets(_opb,_tc->user_comments[i],len);
+    _tc->user_comments[i][len]='\0';
+  }
+  return oc_pack_bytes_left(_opb)<0?TH_EBADHEADER:0;
+}
+
+static int oc_setup_unpack(oc_pack_buf *_opb,th_setup_info *_setup){
+  int ret;
+  /*Read the quantizer tables.*/
+  ret=oc_quant_params_unpack(_opb,&_setup->qinfo);
+  if(ret<0)return ret;
+  /*Read the Huffman trees.*/
+  return oc_huff_trees_unpack(_opb,_setup->huff_tables);
+}
+
+static void oc_setup_clear(th_setup_info *_setup){
+  oc_quant_params_clear(&_setup->qinfo);
+  oc_huff_trees_clear(_setup->huff_tables);
+}
+
+static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
+ th_comment *_tc,th_setup_info **_setup,ogg_packet *_op){
+  char buffer[6];
+  long val;
+  int  packtype;
+  int  ret;
+  val=oc_pack_read(_opb,8);
+  packtype=(int)val;
+  /*If we're at a data packet and we have received all three headers, we're
+     done.*/
+  if(!(packtype&0x80)&&_info->frame_width>0&&_tc->vendor!=NULL&&*_setup!=NULL){
+    return 0;
+  }
+  /*Check the codec string.*/
+  oc_unpack_octets(_opb,buffer,6);
+  if(memcmp(buffer,"theora",6)!=0)return TH_ENOTFORMAT;
+  switch(packtype){
+    /*Codec info header.*/
+    case 0x80:{
+      /*This should be the first packet, and we should not already be
+         initialized.*/
+      if(!_op->b_o_s||_info->frame_width>0)return TH_EBADHEADER;
+      ret=oc_info_unpack(_opb,_info);
+      if(ret<0)th_info_clear(_info);
+      else ret=3;
+    }break;
+    /*Comment header.*/
+    case 0x81:{
+      if(_tc==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header, and should not yet have
+         decoded the comment header.*/
+      if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER;
+      ret=oc_comment_unpack(_opb,_tc);
+      if(ret<0)th_comment_clear(_tc);
+      else ret=2;
+    }break;
+    /*Codec setup header.*/
+    case 0x82:{
+      oc_setup_info *setup;
+      if(_tc==NULL||_setup==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header and the comment header,
+         and should not yet have decoded the setup header.*/
+      if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){
+        return TH_EBADHEADER;
+      }
+      setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup));
+      if(setup==NULL)return TH_EFAULT;
+      ret=oc_setup_unpack(_opb,setup);
+      if(ret<0){
+        oc_setup_clear(setup);
+        _ogg_free(setup);
+      }
+      else{
+        *_setup=setup;
+        ret=1;
+      }
+    }break;
+    default:{
+      /*We don't know what this header is.*/
+      return TH_EBADHEADER;
+    }break;
+  }
+  return ret;
+}
+
+
+/*Decodes one header packet.
+  This should be called repeatedly with the packets at the beginning of the
+   stream until it returns 0.*/
+int th_decode_headerin(th_info *_info,th_comment *_tc,
+ th_setup_info **_setup,ogg_packet *_op){
+  oc_pack_buf opb;
+  if(_op==NULL)return TH_EBADHEADER;
+  if(_info==NULL)return TH_EFAULT;
+  oc_pack_readinit(&opb,_op->packet,_op->bytes);
+  return oc_dec_headerin(&opb,_info,_tc,_setup,_op);
+}
+
+void th_setup_free(th_setup_info *_setup){
+  if(_setup!=NULL){
+    oc_setup_clear(_setup);
+    _ogg_free(_setup);
+  }
+}

+ 186 - 0
modules/theoraplayer/native/theora/lib/decint.h

@@ -0,0 +1,186 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decint.h 17457 2010-09-24 02:05:49Z tterribe $
+
+ ********************************************************************/
+
+#include <limits.h>
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "theora/theoradec.h"
+# include "state.h"
+# include "bitpack.h"
+# include "huffdec.h"
+# include "dequant.h"
+
+typedef struct th_setup_info         oc_setup_info;
+typedef struct oc_dec_opt_vtable     oc_dec_opt_vtable;
+typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
+typedef struct th_dec_ctx            oc_dec_ctx;
+
+
+
+/*Decoder-specific accelerated functions.*/
+# if defined(OC_C64X_ASM)
+#  include "c64x/c64xdec.h"
+# endif
+
+# if !defined(oc_dec_accel_init)
+#  define oc_dec_accel_init oc_dec_accel_init_c
+# endif
+# if defined(OC_DEC_USE_VTABLE)
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
+ ((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
+#  endif
+# else
+#  if !defined(oc_dec_dc_unpredict_mcu_plane)
+#   define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
+#  endif
+# endif
+
+
+
+/*Constants for the packet-in state machine specific to the decoder.*/
+
+/*Next packet to read: Data packet.*/
+#define OC_PACKET_DATA (0)
+
+
+
+struct th_setup_info{
+  /*The Huffman codes.*/
+  ogg_int16_t   *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The quantization parameters.*/
+  th_quant_info  qinfo;
+};
+
+
+
+/*Decoder specific functions with accelerated variants.*/
+struct oc_dec_opt_vtable{
+  void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
+   oc_dec_pipeline_state *_pipe,int _pli);
+};
+
+
+
+struct oc_dec_pipeline_state{
+  /*Decoded DCT coefficients.
+    These are placed here instead of on the stack so that they can persist
+     between blocks, which makes clearing them back to zero much faster when
+     only a few non-zero coefficients were decoded.
+    It requires at least 65 elements because the zig-zag index array uses the
+     65th element as a dumping ground for out-of-range indices to protect us
+     from buffer overflow.
+    We make it fully twice as large so that the second half can serve as the
+     reconstruction buffer, which saves passing another parameter to all the
+     acceleration functions.
+    It also solves problems with 16-byte alignment for NEON on ARM.
+    gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+     alignment, and silently produces incorrect results if you ask for 16.
+    Finally, keeping it off the stack means there's less likely to be a data
+     hazard between the NEON co-processor and the regular ARM core, which avoids
+     unnecessary stalls.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *dequant[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][4];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
+};
+
+
+struct th_dec_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state        state;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and goes to 1
+     when a frame has been processed and a data packet is ready.*/
+  int                    packet_state;
+  /*Buffer in which to assemble packets.*/
+  oc_pack_buf            opb;
+  /*Huffman decode trees.*/
+  ogg_int16_t           *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The index of the first token in each plane for each coefficient.*/
+  ptrdiff_t              ti0[3][64];
+  /*The number of outstanding EOB runs at the start of each coefficient in each
+     plane.*/
+  ptrdiff_t              eob_runs[3][64];
+  /*The DCT token lists.*/
+  unsigned char         *dct_tokens;
+  /*The extra bits associated with DCT tokens.*/
+  unsigned char         *extra_bits;
+  /*The number of dct tokens unpacked so far.*/
+  int                    dct_tokens_count;
+  /*The out-of-loop post-processing level.*/
+  int                    pp_level;
+  /*The DC scale used for out-of-loop deblocking.*/
+  int                    pp_dc_scale[64];
+  /*The sharpen modifier used for out-of-loop deringing.*/
+  int                    pp_sharp_mod[64];
+  /*The DC quantization index of each block.*/
+  unsigned char         *dc_qis;
+  /*The variance of each block.*/
+  int                   *variances;
+  /*The storage for the post-processed frame buffer.*/
+  unsigned char         *pp_frame_data;
+  /*Whether or not the post-processed frame buffer has space for chroma.*/
+  int                    pp_frame_state;
+  /*The buffer used for the post-processed frame.
+    Note that this is _not_ guaranteed to have the same strides and offsets as
+     the reference frame buffers.*/
+  th_ycbcr_buffer        pp_frame_buf;
+  /*The striped decode callback function.*/
+  th_stripe_callback     stripe_cb;
+  oc_dec_pipeline_state  pipe;
+# if defined(OC_DEC_USE_VTABLE)
+  /*Table for decoder acceleration functions.*/
+  oc_dec_opt_vtable      opt_vtable;
+# endif
+# if defined(HAVE_CAIRO)
+  /*Output metrics for debugging.*/
+  int                    telemetry;
+  int                    telemetry_mbmode;
+  int                    telemetry_mv;
+  int                    telemetry_qi;
+  int                    telemetry_bits;
+  int                    telemetry_frame_bytes;
+  int                    telemetry_coding_bytes;
+  int                    telemetry_mode_bytes;
+  int                    telemetry_mv_bytes;
+  int                    telemetry_qi_bytes;
+  int                    telemetry_dc_bytes;
+  unsigned char         *telemetry_frame_data;
+# endif
+};
+
+/*Default pure-C implementations of decoder-specific accelerated functions.*/
+void oc_dec_accel_init_c(oc_dec_ctx *_dec);
+
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli);
+
+#endif

+ 2992 - 0
modules/theoraplayer/native/theora/lib/decode.c

@@ -0,0 +1,2992 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decode.c 18268 2012-05-08 02:51:57Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "decint.h"
+#if defined(OC_DUMP_IMAGES)
+# include <stdio.h>
+# include "png.h"
+#endif
+#if defined(HAVE_CAIRO)
+# include <cairo.h>
+#endif
+
+
+/*No post-processing.*/
+#define OC_PP_LEVEL_DISABLED  (0)
+/*Keep track of DC qi for each block only.*/
+#define OC_PP_LEVEL_TRACKDCQI (1)
+/*Deblock the luma plane.*/
+#define OC_PP_LEVEL_DEBLOCKY  (2)
+/*Dering the luma plane.*/
+#define OC_PP_LEVEL_DERINGY   (3)
+/*Stronger luma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGY  (4)
+/*Deblock the chroma planes.*/
+#define OC_PP_LEVEL_DEBLOCKC  (5)
+/*Dering the chroma planes.*/
+#define OC_PP_LEVEL_DERINGC   (6)
+/*Stronger chroma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGC  (7)
+/*Maximum valid post-processing level.*/
+#define OC_PP_LEVEL_MAX       (7)
+
+
+
+/*The mode alphabets for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.*/
+static const unsigned char OC_MODE_ALPHABETS[7][OC_NMODES]={
+  /*Last MV dominates */
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,
+    OC_MODE_GOLDEN_MV,OC_MODE_INTER_MV_FOUR
+  },
+  /*No MV dominates.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_GOLDEN_NOMV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  /*Default ordering.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  }
+};
+
+
+/*The original DCT tokens are extended and reordered during the construction of
+   the Huffman tables.
+  The extension means more bits can be read with fewer calls to the bitpacker
+   during the Huffman decoding process (at the cost of larger Huffman tables),
+   and fewer tokens require additional extra bits (reducing the average storage
+   per decoded token).
+  The revised ordering reveals essential information in the token value
+   itself; specifically, whether or not there are additional extra bits to read
+   and the parameter to which those extra bits are applied.
+  The token is used to fetch a code word from the OC_DCT_CODE_WORD table below.
+  The extra bits are added into code word at the bit position inferred from the
+   token value, giving the final code word from which all required parameters
+   are derived.
+  The number of EOBs and the leading zero run length can be extracted directly.
+  The coefficient magnitude is optionally negated before extraction, according
+   to a 'flip' bit.*/
+
+/*The number of additional extra bits that are decoded with each of the
+   internal DCT tokens.*/
+static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
+  12,4,3,3,4,4,5,5,8,8,8,8,3,3,6
+};
+
+/*Whether or not an internal token needs any additional extra bits.*/
+#define OC_DCT_TOKEN_NEEDS_MORE(token) \
+ (token<(int)(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+  sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
+
+/*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
+#define OC_DCT_TOKEN_FAT_EOB (0)
+
+/*The number of EOBs to use for an end-of-frame token.
+  Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99, which
+   is not yet available everywhere; this should be equivalent.*/
+#define OC_DCT_EOB_FINISH (~(size_t)0>>1)
+
+/*The location of the (6) run length bits in the code word.
+  These are placed at index 0 and given 8 bits (even though 6 would suffice)
+   because it may be faster to extract the lower byte on some platforms.*/
+#define OC_DCT_CW_RLEN_SHIFT (0)
+/*The location of the (12) EOB bits in the code word.*/
+#define OC_DCT_CW_EOB_SHIFT  (8)
+/*The location of the (1) flip bit in the code word.
+  This must be right under the magnitude bits.*/
+#define OC_DCT_CW_FLIP_BIT   (20)
+/*The location of the (11) token magnitude bits in the code word.
+  These must be last, and rely on a sign-extending right shift.*/
+#define OC_DCT_CW_MAG_SHIFT  (21)
+
+/*Pack the given fields into a code word.*/
+#define OC_DCT_CW_PACK(_eobs,_rlen,_mag,_flip) \
+ ((_eobs)<<OC_DCT_CW_EOB_SHIFT| \
+ (_rlen)<<OC_DCT_CW_RLEN_SHIFT| \
+ (_flip)<<OC_DCT_CW_FLIP_BIT| \
+ (_mag)-(_flip)<<OC_DCT_CW_MAG_SHIFT)
+
+/*A special code word value that signals the end of the frame (a long EOB run
+   of zero).*/
+#define OC_DCT_CW_FINISH (0)
+
+/*The position at which to insert the extra bits in the code word.
+  We use this formulation because Intel has no useful cmov.
+  A real architecture would probably do better with two of those.
+  This translates to 11 instructions(!), and is _still_ faster than either a
+   table lookup (just barely) or the naive double-ternary implementation (which
+   gcc translates to a jump and a cmov).
+  This assumes OC_DCT_CW_RLEN_SHIFT is zero, but could easily be reworked if
+   you want to make one of the other shifts zero.*/
+#define OC_DCT_TOKEN_EB_POS(_token) \
+ ((OC_DCT_CW_EOB_SHIFT-OC_DCT_CW_MAG_SHIFT&-((_token)<2)) \
+ +(OC_DCT_CW_MAG_SHIFT&-((_token)<12)))
+
+/*The code words for each internal token.
+  See the notes at OC_DCT_TOKEN_MAP for the reasons why things are out of
+   order.*/
+static const ogg_int32_t OC_DCT_CODE_WORD[92]={
+  /*These tokens require additional extra bits for the EOB count.*/
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+  OC_DCT_CW_FINISH,
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+  OC_DCT_CW_PACK(16, 0,  0,0),
+  /*These tokens require additional extra bits for the magnitude.*/
+  /*OC_DCT_VAL_CAT5 (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 13,0),
+  OC_DCT_CW_PACK( 0, 0, 13,1),
+  /*OC_DCT_VAL_CAT6 (5 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 21,0),
+  OC_DCT_CW_PACK( 0, 0, 21,1),
+  /*OC_DCT_VAL_CAT7 (6 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 37,0),
+  OC_DCT_CW_PACK( 0, 0, 37,1),
+  /*OC_DCT_VAL_CAT8 (10 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 69,0),
+  OC_DCT_CW_PACK( 0, 0,325,0),
+  OC_DCT_CW_PACK( 0, 0, 69,1),
+  OC_DCT_CW_PACK( 0, 0,325,1),
+  /*These tokens require additional extra bits for the run length.*/
+  /*OC_DCT_RUN_CAT1C (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0,10, +1,0),
+  OC_DCT_CW_PACK( 0,10, -1,0),
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)
+    Flip is set to distinguish this from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  /*The remaining tokens require no additional extra bits.*/
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 1, 0,  0,0),
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 2, 0,  0,0),
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 3, 0,  0,0),
+  /*OC_DCT_RUN_CAT1A (1 extra bit-1 already read)x5*/
+  OC_DCT_CW_PACK( 0, 1, +1,0),
+  OC_DCT_CW_PACK( 0, 1, -1,0),
+  OC_DCT_CW_PACK( 0, 2, +1,0),
+  OC_DCT_CW_PACK( 0, 2, -1,0),
+  OC_DCT_CW_PACK( 0, 3, +1,0),
+  OC_DCT_CW_PACK( 0, 3, -1,0),
+  OC_DCT_CW_PACK( 0, 4, +1,0),
+  OC_DCT_CW_PACK( 0, 4, -1,0),
+  OC_DCT_CW_PACK( 0, 5, +1,0),
+  OC_DCT_CW_PACK( 0, 5, -1,0),
+  /*OC_DCT_RUN_CAT2A (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 1, +2,0),
+  OC_DCT_CW_PACK( 0, 1, +3,0),
+  OC_DCT_CW_PACK( 0, 1, -2,0),
+  OC_DCT_CW_PACK( 0, 1, -3,0),
+  /*OC_DCT_RUN_CAT1B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 6, +1,0),
+  OC_DCT_CW_PACK( 0, 7, +1,0),
+  OC_DCT_CW_PACK( 0, 8, +1,0),
+  OC_DCT_CW_PACK( 0, 9, +1,0),
+  OC_DCT_CW_PACK( 0, 6, -1,0),
+  OC_DCT_CW_PACK( 0, 7, -1,0),
+  OC_DCT_CW_PACK( 0, 8, -1,0),
+  OC_DCT_CW_PACK( 0, 9, -1,0),
+  /*OC_DCT_RUN_CAT2B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 2, +2,0),
+  OC_DCT_CW_PACK( 0, 3, +2,0),
+  OC_DCT_CW_PACK( 0, 2, +3,0),
+  OC_DCT_CW_PACK( 0, 3, +3,0),
+  OC_DCT_CW_PACK( 0, 2, -2,0),
+  OC_DCT_CW_PACK( 0, 3, -2,0),
+  OC_DCT_CW_PACK( 0, 2, -3,0),
+  OC_DCT_CW_PACK( 0, 3, -3,0),
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits-3 already read)
+    Flip is set on the first one to distinguish it from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  OC_DCT_CW_PACK( 0, 1,  0,0),
+  OC_DCT_CW_PACK( 0, 2,  0,0),
+  OC_DCT_CW_PACK( 0, 3,  0,0),
+  OC_DCT_CW_PACK( 0, 4,  0,0),
+  OC_DCT_CW_PACK( 0, 5,  0,0),
+  OC_DCT_CW_PACK( 0, 6,  0,0),
+  OC_DCT_CW_PACK( 0, 7,  0,0),
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +1,0),
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -1,0),
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +2,0),
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -2,0),
+  /*OC_DCT_VAL_CAT2 (1 extra bit-1 already read)x4*/
+  OC_DCT_CW_PACK( 0, 0, +3,0),
+  OC_DCT_CW_PACK( 0, 0, -3,0),
+  OC_DCT_CW_PACK( 0, 0, +4,0),
+  OC_DCT_CW_PACK( 0, 0, -4,0),
+  OC_DCT_CW_PACK( 0, 0, +5,0),
+  OC_DCT_CW_PACK( 0, 0, -5,0),
+  OC_DCT_CW_PACK( 0, 0, +6,0),
+  OC_DCT_CW_PACK( 0, 0, -6,0),
+  /*OC_DCT_VAL_CAT3 (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +7,0),
+  OC_DCT_CW_PACK( 0, 0, +8,0),
+  OC_DCT_CW_PACK( 0, 0, -7,0),
+  OC_DCT_CW_PACK( 0, 0, -8,0),
+  /*OC_DCT_VAL_CAT4 (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +9,0),
+  OC_DCT_CW_PACK( 0, 0,+10,0),
+  OC_DCT_CW_PACK( 0, 0,+11,0),
+  OC_DCT_CW_PACK( 0, 0,+12,0),
+  OC_DCT_CW_PACK( 0, 0, -9,0),
+  OC_DCT_CW_PACK( 0, 0,-10,0),
+  OC_DCT_CW_PACK( 0, 0,-11,0),
+  OC_DCT_CW_PACK( 0, 0,-12,0),
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 8, 0,  0,0),
+  OC_DCT_CW_PACK( 9, 0,  0,0),
+  OC_DCT_CW_PACK(10, 0,  0,0),
+  OC_DCT_CW_PACK(11, 0,  0,0),
+  OC_DCT_CW_PACK(12, 0,  0,0),
+  OC_DCT_CW_PACK(13, 0,  0,0),
+  OC_DCT_CW_PACK(14, 0,  0,0),
+  OC_DCT_CW_PACK(15, 0,  0,0),
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 4, 0,  0,0),
+  OC_DCT_CW_PACK( 5, 0,  0,0),
+  OC_DCT_CW_PACK( 6, 0,  0,0),
+  OC_DCT_CW_PACK( 7, 0,  0,0),
+};
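
The fields packed by OC_DCT_CW_PACK come back out with the same shift-and-mask steps used later in the DC/AC token unpackers: the run length is the low byte, the EOB count sits at bit 8, and the magnitude is recovered by an arithmetic right shift after the flip bit conditionally negates it. A small self-contained check that mirrors those constants (it does not need the rest of the library, and like the decoder it assumes a sign-extending right shift):

/* Worked example of the code-word layout; constants mirror the ones above. */
#include <stdio.h>
#include <ogg/ogg.h>

#define CW_RLEN_SHIFT (0)
#define CW_EOB_SHIFT  (8)
#define CW_FLIP_BIT   (20)
#define CW_MAG_SHIFT  (21)
#define CW_PACK(_eobs,_rlen,_mag,_flip) \
 ((_eobs)<<CW_EOB_SHIFT|(_rlen)<<CW_RLEN_SHIFT| \
 (_flip)<<CW_FLIP_BIT|(_mag)-(_flip)<<CW_MAG_SHIFT)

int main(void){
  /*OC_DCT_VAL_CAT5, negative variant: magnitude 13, flip set.*/
  ogg_int32_t cw=CW_PACK(0,0,13,1);
  int eobs=cw>>CW_EOB_SHIFT&0xFFF;
  int rlen=(unsigned char)(cw>>CW_RLEN_SHIFT);
  cw^=-(cw&1<<CW_FLIP_BIT);      /*Conditionally negate the magnitude.*/
  printf("eobs=%d rlen=%d coeff=%d\n",eobs,rlen,(int)(cw>>CW_MAG_SHIFT));
  /*Prints: eobs=0 rlen=0 coeff=-13*/
  return 0;
}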
+
+
+
+static int oc_sb_run_unpack(oc_pack_buf *_opb){
+  /*Coding scheme:
+       Codeword            Run Length
+     0                       1
+     10x                     2-3
+     110x                    4-5
+     1110xx                  6-9
+     11110xxx                10-17
+     111110xxxx              18-33
+     111111xxxxxxxxxxxx      34-4129*/
+  static const ogg_int16_t OC_SB_RUN_TREE[22]={
+    4,
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(1<<8|1),-(1<<8|1),-(1<<8|1),-(1<<8|1),
+     -(3<<8|2),-(3<<8|2),-(3<<8|3),-(3<<8|3),
+     -(4<<8|4),-(4<<8|5),-(4<<8|2<<4|6-6),17,
+      2,
+       -(2<<8|2<<4|10-6),-(2<<8|2<<4|14-6),-(2<<8|4<<4|18-6),-(2<<8|12<<4|34-6)
+  };
+  int ret;
+  ret=oc_huff_token_decode(_opb,OC_SB_RUN_TREE);
+  if(ret>=0x10){
+    int offs;
+    offs=ret&0x1F;
+    ret=6+offs+(int)oc_pack_read(_opb,ret-offs>>4);
+  }
+  return ret;
+}
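
The tree above is just a collapsed form of the prefix code described in the comment; reading that code one bit at a time gives the same run lengths. A standalone sketch of the prefix scheme itself, using a toy bit reader rather than the library's oc_pack interface:

/* Decodes the super-block run-length prefix code from the comment above,
   using a toy bit reader over an in-memory bit string. */
#include <stdio.h>

typedef struct{const unsigned char *bits;int pos;}toy_bits;

static int toy_read1(toy_bits *_b){return _b->bits[_b->pos++];}

static int toy_read(toy_bits *_b,int _n){
  int v=0;
  while(_n-->0)v=v<<1|toy_read1(_b);
  return v;
}

/*Same scheme as oc_sb_run_unpack(): 0 -> 1, 10x -> 2-3, 110x -> 4-5,
   1110xx -> 6-9, 11110xxx -> 10-17, 111110xxxx -> 18-33,
   111111xxxxxxxxxxxx -> 34-4129.*/
static int toy_sb_run(toy_bits *_b){
  static const int base[7]  ={1,2,4,6,10,18,34};
  static const int nextra[7]={0,1,1,2,3,4,12};
  int ones=0;
  while(ones<6&&toy_read1(_b))ones++;
  return base[ones]+toy_read(_b,nextra[ones]);
}

int main(void){
  /*"1110 01" should decode to a run of 6+1=7.*/
  static const unsigned char code[]={1,1,1,0,0,1};
  toy_bits b={code,0};
  printf("run=%d\n",toy_sb_run(&b));
  return 0;
}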
+
+static int oc_block_run_unpack(oc_pack_buf *_opb){
+  /*Coding scheme:
+     Codeword             Run Length
+     0x                      1-2
+     10x                     3-4
+     110x                    5-6
+     1110xx                  7-10
+     11110xx                 11-14
+     11111xxxx               15-30*/
+  static const ogg_int16_t OC_BLOCK_RUN_TREE[61]={
+    5,
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(2<<8|2),-(2<<8|2),-(2<<8|2),-(2<<8|2),
+     -(3<<8|3),-(3<<8|3),-(3<<8|3),-(3<<8|3),
+     -(3<<8|4),-(3<<8|4),-(3<<8|4),-(3<<8|4),
+     -(4<<8|5),-(4<<8|5),-(4<<8|6),-(4<<8|6),
+     33,       36,       39,       44,
+      1,-(1<<8|7),-(1<<8|8),
+      1,-(1<<8|9),-(1<<8|10),
+      2,-(2<<8|11),-(2<<8|12),-(2<<8|13),-(2<<8|14),
+      4,
+       -(4<<8|15),-(4<<8|16),-(4<<8|17),-(4<<8|18),
+       -(4<<8|19),-(4<<8|20),-(4<<8|21),-(4<<8|22),
+       -(4<<8|23),-(4<<8|24),-(4<<8|25),-(4<<8|26),
+       -(4<<8|27),-(4<<8|28),-(4<<8|29),-(4<<8|30)
+  };
+  return oc_huff_token_decode(_opb,OC_BLOCK_RUN_TREE);
+}
+
+
+
+void oc_dec_accel_init_c(oc_dec_ctx *_dec){
+# if defined(OC_DEC_USE_VTABLE)
+  _dec->opt_vtable.dc_unpredict_mcu_plane=
+   oc_dec_dc_unpredict_mcu_plane_c;
+# endif
+}
+
+static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
+ const th_setup_info *_setup){
+  int qti;
+  int pli;
+  int qi;
+  int ret;
+  ret=oc_state_init(&_dec->state,_info,3);
+  if(ret<0)return ret;
+  ret=oc_huff_trees_copy(_dec->huff_tables,
+   (const ogg_int16_t *const *)_setup->huff_tables);
+  if(ret<0){
+    oc_state_clear(&_dec->state);
+    return ret;
+  }
+  /*For each fragment, allocate one byte for every DCT coefficient token, plus
+     one byte for extra-bits for each token, plus one more byte for the long
+     EOB run, just in case it's the very last token and has a run length of
+     one.*/
+  _dec->dct_tokens=(unsigned char *)_ogg_malloc((64+64+1)*
+   _dec->state.nfrags*sizeof(_dec->dct_tokens[0]));
+  if(_dec->dct_tokens==NULL){
+    oc_huff_trees_clear(_dec->huff_tables);
+    oc_state_clear(&_dec->state);
+    return TH_EFAULT;
+  }
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _dec->state.dequant_tables[qi][pli][qti]=
+     _dec->state.dequant_table_data[qi][pli][qti];
+  }
+  oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
+   &_setup->qinfo);
+  for(qi=0;qi<64;qi++){
+    int qsum;
+    qsum=0;
+    for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+      qsum+=_dec->state.dequant_tables[qi][pli][qti][12]+
+       _dec->state.dequant_tables[qi][pli][qti][17]+
+       _dec->state.dequant_tables[qi][pli][qti][18]+
+       _dec->state.dequant_tables[qi][pli][qti][24]<<(pli==0);
+    }
+    _dec->pp_sharp_mod[qi]=-(qsum>>11);
+  }
+  memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
+   sizeof(_dec->state.loop_filter_limits));
+  oc_dec_accel_init(_dec);
+  _dec->pp_level=OC_PP_LEVEL_DISABLED;
+  _dec->dc_qis=NULL;
+  _dec->variances=NULL;
+  _dec->pp_frame_data=NULL;
+  _dec->stripe_cb.ctx=NULL;
+  _dec->stripe_cb.stripe_decoded=NULL;
+#if defined(HAVE_CAIRO)
+  _dec->telemetry=0;
+  _dec->telemetry_bits=0;
+  _dec->telemetry_qi=0;
+  _dec->telemetry_mbmode=0;
+  _dec->telemetry_mv=0;
+  _dec->telemetry_frame_data=NULL;
+#endif
+  return 0;
+}
+
+static void oc_dec_clear(oc_dec_ctx *_dec){
+#if defined(HAVE_CAIRO)
+  _ogg_free(_dec->telemetry_frame_data);
+#endif
+  _ogg_free(_dec->pp_frame_data);
+  _ogg_free(_dec->variances);
+  _ogg_free(_dec->dc_qis);
+  _ogg_free(_dec->dct_tokens);
+  oc_huff_trees_clear(_dec->huff_tables);
+  oc_state_clear(&_dec->state);
+}
+
+
+static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
+  long val;
+  /*Check to make sure this is a data packet.*/
+  val=oc_pack_read1(&_dec->opb);
+  if(val!=0)return TH_EBADPACKET;
+  /*Read in the frame type (I or P).*/
+  val=oc_pack_read1(&_dec->opb);
+  _dec->state.frame_type=(int)val;
+  /*Read in the qi list.*/
+  val=oc_pack_read(&_dec->opb,6);
+  _dec->state.qis[0]=(unsigned char)val;
+  val=oc_pack_read1(&_dec->opb);
+  if(!val)_dec->state.nqis=1;
+  else{
+    val=oc_pack_read(&_dec->opb,6);
+    _dec->state.qis[1]=(unsigned char)val;
+    val=oc_pack_read1(&_dec->opb);
+    if(!val)_dec->state.nqis=2;
+    else{
+      val=oc_pack_read(&_dec->opb,6);
+      _dec->state.qis[2]=(unsigned char)val;
+      _dec->state.nqis=3;
+    }
+  }
+  if(_dec->state.frame_type==OC_INTRA_FRAME){
+    /*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
+      Most of the other unused bits in the VP3 headers were eliminated.
+      I don't know why these remain.*/
+    /*I wanted to eliminate wasted bits, but not all config wiggle room
+       --Monty.*/
+    val=oc_pack_read(&_dec->opb,3);
+    if(val!=0)return TH_EIMPL;
+  }
+  return 0;
+}
+
+/*Mark all fragments as coded and in OC_MODE_INTRA.
+  This also builds up the coded fragment list (in coded order), and clears the
+   uncoded fragment list.
+  It does not update the coded macro block list nor the super block flags, as
+   those are not used when decoding INTRA frames.*/
+static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  oc_fragment       *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          prev_ncoded_fragis;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                pli;
+  coded_fragis=_dec->state.coded_fragis;
+  prev_ncoded_fragis=ncoded_fragis=0;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  frags=_dec->state.frags;
+  sbi=nsbs=0;
+  for(pli=0;pli<3;pli++){
+    nsbs+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int bi;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          if(fragi>=0){
+            frags[fragi].coded=1;
+            frags[fragi].refi=OC_FRAME_SELF;
+            frags[fragi].mb_mode=OC_MODE_INTRA;
+            coded_fragis[ncoded_fragis++]=fragi;
+          }
+        }
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
+    prev_ncoded_fragis=ncoded_fragis;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded_fragis;
+}
+
+/*Decodes the bit flags indicating whether each super block is partially coded
+   or not.
+  Return: The number of partially coded super blocks.*/
+static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     npartial;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  sbi=npartial=0;
+  while(sbi<nsbs){
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    full_run=run_count>=4129;
+    do{
+      sb_flags[sbi].coded_partially=flag;
+      sb_flags[sbi].coded_fully=0;
+      npartial+=flag;
+      sbi++;
+    }
+    while(--run_count>0&&sbi<nsbs);
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+  return npartial;
+}
+
+/*Decodes the bit flags for whether or not each non-partially-coded super
+   block is fully coded or not.
+  This function should only be called if there is at least one
+   non-partially-coded super block.*/
+static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  /*Skip partially coded super blocks.*/
+  for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  do{
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    full_run=run_count>=4129;
+    for(;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(run_count--<=0)break;
+      sb_flags[sbi].coded_fully=flag;
+    }
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  while(sbi<nsbs);
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
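
Both flag unpackers above use the same toggling run-length scheme: one flag bit is read up front, each run assigns that value to the next run_count blocks, and afterwards the flag simply toggles, except after a maximal run (4129) where a fresh flag bit is read because the same value may continue. A small sketch of that scheme with canned inputs standing in for oc_pack_read1() and oc_sb_run_unpack():

/* Sketch of the toggling run-length scheme used for the super-block flags.
   read_flag_bit() and read_run() replay canned values instead of reading
   from a real pack buffer. */
#include <stdio.h>

static const int canned_flags[]={1};      /*Initial flag bit.*/
static const int canned_runs[] ={3,2,4};  /*Run lengths.*/
static int flag_pos,run_pos;
static int read_flag_bit(void){return canned_flags[flag_pos++];}
static int read_run(void){return canned_runs[run_pos++];}

int main(void){
  int flags[9];
  int nflags=9;
  int i=0;
  int flag=read_flag_bit();
  while(i<nflags){
    int run=read_run();
    int full_run=run>=4129;
    while(run-->0&&i<nflags)flags[i++]=flag;
    /*A maximal run means the value may continue, so re-read the flag;
       otherwise the next run is guaranteed to start with the opposite value.*/
    if(full_run&&i<nflags)flag=read_flag_bit();
    else flag=!flag;
  }
  for(i=0;i<nflags;i++)printf("%d",flags[i]);
  putchar('\n');                          /*Prints 111001111.*/
  return 0;
}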
+
+static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  signed char       *mb_modes;
+  oc_fragment       *frags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  long               val;
+  int                pli;
+  int                flag;
+  int                run_count;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t         *uncoded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          nuncoded_fragis;
+  ptrdiff_t          prev_ncoded_fragis;
+  npartial=oc_dec_partial_sb_flags_unpack(_dec);
+  if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
+  if(npartial>0){
+    val=oc_pack_read1(&_dec->opb);
+    flag=!(int)val;
+  }
+  else flag=0;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  mb_modes=_dec->state.mb_modes;
+  frags=_dec->state.frags;
+  sbi=nsbs=run_count=0;
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
+  prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
+  for(pli=0;pli<3;pli++){
+    nsbs+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int quad_coded;
+        int bi;
+        quad_coded=0;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          if(fragi>=0){
+            int coded;
+            if(sb_flags[sbi].coded_fully)coded=1;
+            else if(!sb_flags[sbi].coded_partially)coded=0;
+            else{
+              if(run_count<=0){
+                run_count=oc_block_run_unpack(&_dec->opb);
+                flag=!flag;
+              }
+              run_count--;
+              coded=flag;
+            }
+            if(coded)coded_fragis[ncoded_fragis++]=fragi;
+            else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            quad_coded|=coded;
+            frags[fragi].coded=coded;
+            frags[fragi].refi=OC_FRAME_NONE;
+          }
+        }
+        /*Remember if there's a coded luma block in this macro block.*/
+        if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
+    prev_ncoded_fragis=ncoded_fragis;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded_fragis;
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
+
+
+/*Coding scheme:
+   Codeword            Mode Index
+   0                       0
+   10                      1
+   110                     2
+   1110                    3
+   11110                   4
+   111110                  5
+   1111110                 6
+   1111111                 7*/
+static const ogg_int16_t OC_VLC_MODE_TREE[26]={
+  4,
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(1<<8|0),-(1<<8|0),-(1<<8|0),-(1<<8|0),
+   -(2<<8|1),-(2<<8|1),-(2<<8|1),-(2<<8|1),
+   -(3<<8|2),-(3<<8|2),-(4<<8|3),17,
+    3,
+     -(1<<8|4),-(1<<8|4),-(1<<8|4),-(1<<8|4),
+     -(2<<8|5),-(2<<8|5),-(3<<8|6),-(3<<8|7)
+};
+
+static const ogg_int16_t OC_CLC_MODE_TREE[9]={
+  3,
+   -(3<<8|0),-(3<<8|1),-(3<<8|2),-(3<<8|3),
+   -(3<<8|4),-(3<<8|5),-(3<<8|6),-(3<<8|7)
+};
+
+/*Unpacks the list of macro block modes for INTER frames.*/
+static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
+  signed char         *mb_modes;
+  const unsigned char *alphabet;
+  unsigned char        scheme0_alphabet[8];
+  const ogg_int16_t   *mode_tree;
+  size_t               nmbs;
+  size_t               mbi;
+  long                 val;
+  int                  mode_scheme;
+  val=oc_pack_read(&_dec->opb,3);
+  mode_scheme=(int)val;
+  if(mode_scheme==0){
+    int mi;
+    /*Just in case, initialize the modes to something.
+      If the bitstream doesn't contain each index exactly once, it's likely
+       corrupt and the rest of the packet is garbage anyway, but this way we
+       won't crash, and we'll decode SOMETHING.*/
+    /*LOOP VECTORIZES*/
+    for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
+    for(mi=0;mi<OC_NMODES;mi++){
+      val=oc_pack_read(&_dec->opb,3);
+      scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
+    }
+    alphabet=scheme0_alphabet;
+  }
+  else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+  mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
+  mb_modes=_dec->state.mb_modes;
+  nmbs=_dec->state.nmbs;
+  for(mbi=0;mbi<nmbs;mbi++){
+    if(mb_modes[mbi]>0){
+      /*We have a coded luma block; decode a mode.*/
+      mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
+    }
+    /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+       fact that OC_MODE_INTER_NOMV is already 0.*/
+  }
+}
+
+
+
+static const ogg_int16_t OC_VLC_MV_COMP_TREE[101]={
+  5,
+   -(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),-(3<<8|32+0),
+   -(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),-(3<<8|32+1),
+   -(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),-(3<<8|32-1),
+   -(4<<8|32+2),-(4<<8|32+2),-(4<<8|32-2),-(4<<8|32-2),
+   -(4<<8|32+3),-(4<<8|32+3),-(4<<8|32-3),-(4<<8|32-3),
+   33,          36,          39,          42,
+   45,          50,          55,          60,
+   65,          74,          83,          92,
+    1,-(1<<8|32+4),-(1<<8|32-4),
+    1,-(1<<8|32+5),-(1<<8|32-5),
+    1,-(1<<8|32+6),-(1<<8|32-6),
+    1,-(1<<8|32+7),-(1<<8|32-7),
+    2,-(2<<8|32+8),-(2<<8|32-8),-(2<<8|32+9),-(2<<8|32-9),
+    2,-(2<<8|32+10),-(2<<8|32-10),-(2<<8|32+11),-(2<<8|32-11),
+    2,-(2<<8|32+12),-(2<<8|32-12),-(2<<8|32+13),-(2<<8|32-13),
+    2,-(2<<8|32+14),-(2<<8|32-14),-(2<<8|32+15),-(2<<8|32-15),
+    3,
+     -(3<<8|32+16),-(3<<8|32-16),-(3<<8|32+17),-(3<<8|32-17),
+     -(3<<8|32+18),-(3<<8|32-18),-(3<<8|32+19),-(3<<8|32-19),
+    3,
+     -(3<<8|32+20),-(3<<8|32-20),-(3<<8|32+21),-(3<<8|32-21),
+     -(3<<8|32+22),-(3<<8|32-22),-(3<<8|32+23),-(3<<8|32-23),
+    3,
+     -(3<<8|32+24),-(3<<8|32-24),-(3<<8|32+25),-(3<<8|32-25),
+     -(3<<8|32+26),-(3<<8|32-26),-(3<<8|32+27),-(3<<8|32-27),
+    3,
+     -(3<<8|32+28),-(3<<8|32-28),-(3<<8|32+29),-(3<<8|32-29),
+     -(3<<8|32+30),-(3<<8|32-30),-(3<<8|32+31),-(3<<8|32-31)
+};
+
+static const ogg_int16_t OC_CLC_MV_COMP_TREE[65]={
+  6,
+   -(6<<8|32 +0),-(6<<8|32 -0),-(6<<8|32 +1),-(6<<8|32 -1),
+   -(6<<8|32 +2),-(6<<8|32 -2),-(6<<8|32 +3),-(6<<8|32 -3),
+   -(6<<8|32 +4),-(6<<8|32 -4),-(6<<8|32 +5),-(6<<8|32 -5),
+   -(6<<8|32 +6),-(6<<8|32 -6),-(6<<8|32 +7),-(6<<8|32 -7),
+   -(6<<8|32 +8),-(6<<8|32 -8),-(6<<8|32 +9),-(6<<8|32 -9),
+   -(6<<8|32+10),-(6<<8|32-10),-(6<<8|32+11),-(6<<8|32-11),
+   -(6<<8|32+12),-(6<<8|32-12),-(6<<8|32+13),-(6<<8|32-13),
+   -(6<<8|32+14),-(6<<8|32-14),-(6<<8|32+15),-(6<<8|32-15),
+   -(6<<8|32+16),-(6<<8|32-16),-(6<<8|32+17),-(6<<8|32-17),
+   -(6<<8|32+18),-(6<<8|32-18),-(6<<8|32+19),-(6<<8|32-19),
+   -(6<<8|32+20),-(6<<8|32-20),-(6<<8|32+21),-(6<<8|32-21),
+   -(6<<8|32+22),-(6<<8|32-22),-(6<<8|32+23),-(6<<8|32-23),
+   -(6<<8|32+24),-(6<<8|32-24),-(6<<8|32+25),-(6<<8|32-25),
+   -(6<<8|32+26),-(6<<8|32-26),-(6<<8|32+27),-(6<<8|32-27),
+   -(6<<8|32+28),-(6<<8|32-28),-(6<<8|32+29),-(6<<8|32-29),
+   -(6<<8|32+30),-(6<<8|32-30),-(6<<8|32+31),-(6<<8|32-31)
+};
+
+
+static oc_mv oc_mv_unpack(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+  int dx;
+  int dy;
+  dx=oc_huff_token_decode(_opb,_tree)-32;
+  dy=oc_huff_token_decode(_opb,_tree)-32;
+  return OC_MV(dx,dy);
+}
+
+/*Unpacks the list of motion vectors for INTER frames, and propagates the macro
+   block modes and motion vectors to the individual fragments.*/
+static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){
+  const oc_mb_map        *mb_maps;
+  const signed char      *mb_modes;
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  const ogg_int16_t      *mv_comp_tree;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  const unsigned char    *map_idxs;
+  int                     map_nidxs;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  oc_mv                   cbmvs[4];
+  size_t                  nmbs;
+  size_t                  mbi;
+  long                    val;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
+  val=oc_pack_read1(&_dec->opb);
+  mv_comp_tree=val?OC_CLC_MV_COMP_TREE:OC_VLC_MV_COMP_TREE;
+  map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
+  prior_mv=last_mv=0;
+  frags=_dec->state.frags;
+  frag_mvs=_dec->state.frag_mvs;
+  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
+  mb_modes=_dec->state.mb_modes;
+  nmbs=_dec->state.nmbs;
+  for(mbi=0;mbi<nmbs;mbi++){
+    int mb_mode;
+    mb_mode=mb_modes[mbi];
+    if(mb_mode!=OC_MODE_INVALID){
+      oc_mv     mbmv;
+      ptrdiff_t fragi;
+      int       mapi;
+      int       mapii;
+      int       refi;
+      if(mb_mode==OC_MODE_INTER_MV_FOUR){
+        oc_mv lbmvs[4];
+        int   bi;
+        prior_mv=last_mv;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+            frag_mvs[fragi]=lbmvs[bi];
+          }
+          else lbmvs[bi]=0;
+        }
+        (*set_chroma_mvs)(cbmvs,lbmvs);
+        for(mapii=4;mapii<map_nidxs;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][mapi>>2][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            frag_mvs[fragi]=cbmvs[bi];
+          }
+        }
+      }
+      else{
+        switch(mb_mode){
+          case OC_MODE_INTER_MV:{
+            prior_mv=last_mv;
+            last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+          case OC_MODE_INTER_MV_LAST2:{
+            mbmv=prior_mv;
+            prior_mv=last_mv;
+            last_mv=mbmv;
+          }break;
+          case OC_MODE_GOLDEN_MV:{
+            mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          default:mbmv=0;break;
+        }
+        /*Fill in the MVs for the fragments.*/
+        refi=OC_FRAME_FOR_MODE(mb_mode);
+        mapii=0;
+        do{
+          mapi=map_idxs[mapii];
+          fragi=mb_maps[mbi][mapi>>2][mapi&3];
+          if(frags[fragi].coded){
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mbmv;
+          }
+        }
+        while(++mapii<map_nidxs);
+      }
+    }
+  }
+}
+
+static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        fragi;
+  ncoded_fragis=_dec->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  if(_dec->state.nqis==1){
+    /*If this frame has only a single qi value, then just use it for all coded
+       fragments.*/
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      frags[coded_fragis[fragii]].qii=0;
+    }
+  }
+  else{
+    long val;
+    int  flag;
+    int  nqi1;
+    int  run_count;
+    /*Otherwise, we decode a qi index for each fragment, using two passes of
+      the same binary RLE scheme used for super-block coded bits.
+     The first pass marks each fragment as having a qii of 0 or greater than
+      0, and the second pass (if necessary) distinguishes between a qii of
+      1 and 2.
+     At first we just store the qii in the fragment.
+     After all the qii's are decoded, we make a final pass to replace them
+      with the corresponding qi's for this frame.*/
+    val=oc_pack_read1(&_dec->opb);
+    flag=(int)val;
+    nqi1=0;
+    fragii=0;
+    while(fragii<ncoded_fragis){
+      int full_run;
+      run_count=oc_sb_run_unpack(&_dec->opb);
+      full_run=run_count>=4129;
+      do{
+        frags[coded_fragis[fragii++]].qii=flag;
+        nqi1+=flag;
+      }
+      while(--run_count>0&&fragii<ncoded_fragis);
+      if(full_run&&fragii<ncoded_fragis){
+        val=oc_pack_read1(&_dec->opb);
+        flag=(int)val;
+      }
+      else flag=!flag;
+    }
+    /*TODO: run_count should be 0 here.
+      If it's not, we should issue a warning of some kind.*/
+    /*If we have 3 different qi's for this frame, and there was at least one
+       fragment with a non-zero qi, make the second pass.*/
+    if(_dec->state.nqis==3&&nqi1>0){
+      /*Skip qii==0 fragments.*/
+      for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++);
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+      do{
+        int full_run;
+        run_count=oc_sb_run_unpack(&_dec->opb);
+        full_run=run_count>=4129;
+        for(;fragii<ncoded_fragis;fragii++){
+          fragi=coded_fragis[fragii];
+          if(frags[fragi].qii==0)continue;
+          if(run_count--<=0)break;
+          frags[fragi].qii+=flag;
+        }
+        if(full_run&&fragii<ncoded_fragis){
+          val=oc_pack_read1(&_dec->opb);
+          flag=(int)val;
+        }
+        else flag=!flag;
+      }
+      while(fragii<ncoded_fragis);
+      /*TODO: run_count should be 0 here.
+        If it's not, we should issue a warning of some kind.*/
+    }
+  }
+}
+
+
+
+/*Unpacks the DC coefficient tokens.
+  Unlike when unpacking the AC coefficient tokens, we actually need to decode
+   the DC coefficient values now so that we can do DC prediction.
+  _huff_idx:   The index of the Huffman table to use for each color plane.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  Return: The length of any outstanding EOB run.*/
+static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64]){
+  unsigned char   *dct_tokens;
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        eobs;
+  ptrdiff_t        ti;
+  int              pli;
+  dct_tokens=_dec->dct_tokens;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  ncoded_fragis=fragii=eobs=ti=0;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    ptrdiff_t eobi;
+    int       rli;
+    ncoded_fragis+=_dec->state.ncoded_fragis[pli];
+    memset(run_counts,0,sizeof(run_counts));
+    _dec->eob_runs[pli][0]=eobs;
+    _dec->ti0[pli][0]=ti;
+    /*Continue any previous EOB run, if there was one.*/
+    eobi=eobs;
+    if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+    eob_count=eobi;
+    eobs-=eobi;
+    while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+    while(fragii<ncoded_fragis){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)eobs=OC_DCT_EOB_FINISH;
+      if(eobs){
+        eobi=OC_MINI(eobs,ncoded_fragis-fragii);
+        eob_count+=eobi;
+        eobs-=eobi;
+        while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+      }
+      else{
+        int coeff;
+        skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
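+        /*The flip bit marks a negative coefficient; the XOR applies the sign
+           to the magnitude field in one step.*/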
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        if(skip)coeff=0;
+        run_counts[skip]++;
+        frags[coded_fragis[fragii++]].dc=coeff;
+      }
+    }
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return eobs;
+}
+
+/*Unpacks the AC coefficient tokens.
+  This can completely discard coefficient values while unpacking, and so is
+   somewhat simpler than unpacking the DC coefficient tokens.
+  _huff_idxs:  The indices of the Huffman tables to use for each color plane.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  _eobs:       The length of any outstanding EOB run from previous
+                coefficients.
+  Return: The length of any outstanding EOB run.*/
+static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){
+  unsigned char *dct_tokens;
+  ptrdiff_t      ti;
+  int            pli;
+  dct_tokens=_dec->dct_tokens;
+  ti=_dec->dct_tokens_count;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    size_t    ntoks_left;
+    size_t    ntoks;
+    int       rli;
+    _dec->eob_runs[pli][_zzi]=_eobs;
+    _dec->ti0[pli][_zzi]=ti;
+    ntoks_left=_ntoks_left[pli][_zzi];
+    memset(run_counts,0,sizeof(run_counts));
+    eob_count=0;
+    ntoks=0;
+    while(ntoks+_eobs<ntoks_left){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      ntoks+=_eobs;
+      eob_count+=_eobs;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+      _eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)_eobs=OC_DCT_EOB_FINISH;
+      if(_eobs==0){
+        run_counts[skip]++;
+        ntoks++;
+      }
+    }
+    /*Add the portion of the last EOB run actually used by this coefficient.*/
+    eob_count+=ntoks_left-ntoks;
+    /*And remove it from the remaining EOB count.*/
+    _eobs-=ntoks_left-ntoks;
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return _eobs;
+}
+
+/*Tokens describing the DCT coefficients that belong to each fragment are
+   stored in the bitstream grouped by coefficient, not by fragment.
+
+  This means that we either decode all the tokens in order, building up a
+   separate coefficient list for each fragment as we go, and then go back and
+   do the iDCT on each fragment, or we have to create separate lists of tokens
+   for each coefficient, so that we can pull the next token required off the
+   head of the appropriate list when decoding a specific fragment.
+
+  The former was VP3's choice, and it meant 2*w*h extra storage for all the
+   decoded coefficient values.
+
+  We take the second option, which lets us store just one to three bytes per
+   token (generally far fewer than the number of coefficients, due to EOB
+   tokens and zero runs), and which requires us to only maintain a counter for
+   each of the 64 coefficients, instead of a counter for every fragment to
+   determine where the next token goes.
+
+  We actually use 3 counters per coefficient, one for each color plane, so we
+   can decode all color planes simultaneously.
+  This lets color conversion, etc., be done as soon as a full MCU (one or
+   two super block rows) is decoded, while the image data is still in cache.*/
+
+static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){
+  static const unsigned char OC_HUFF_LIST_MAX[5]={1,6,15,28,64};
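+  /*Coefficient group boundaries: group 0 is the DC coefficient; groups 1
+     through 4 cover zig-zag indices 1..5, 6..14, 15..27 and 28..63.*/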
+  ptrdiff_t  ntoks_left[3][64];
+  int        huff_idxs[2];
+  ptrdiff_t  eobs;
+  long       val;
+  int        pli;
+  int        zzi;
+  int        hgi;
+  for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
+    ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli];
+  }
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[0]=(int)val;
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[1]=(int)val;
+  _dec->eob_runs[0][0]=0;
+  eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
+#if defined(HAVE_CAIRO)
+  _dec->telemetry_dc_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[0]=(int)val;
+  val=oc_pack_read(&_dec->opb,4);
+  huff_idxs[1]=(int)val;
+  zzi=1;
+  for(hgi=1;hgi<5;hgi++){
+    huff_idxs[0]+=16;
+    huff_idxs[1]+=16;
+    for(;zzi<OC_HUFF_LIST_MAX[hgi];zzi++){
+      eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs);
+    }
+  }
+  /*TODO: eobs should be exactly zero, or 4096 or greater.
+    The second case occurs when an EOB run of size zero is encountered, which
+     gets treated as an infinite EOB run (where infinity is PTRDIFF_MAX).
+    If neither of these conditions holds, then a warning should be issued.*/
+}
+
+
+static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
+  /*pp_level 0: disabled; free any memory used and return*/
+  if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){
+    if(_dec->dc_qis!=NULL){
+      _ogg_free(_dec->dc_qis);
+      _dec->dc_qis=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->dc_qis==NULL){
+    /*If we haven't been tracking DC quantization indices, there's no point in
+       starting now.*/
+    if(_dec->state.frame_type!=OC_INTRA_FRAME)return 1;
+    _dec->dc_qis=(unsigned char *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->dc_qis[0]));
+    if(_dec->dc_qis==NULL)return 1;
+    memset(_dec->dc_qis,_dec->state.qis[0],_dec->state.nfrags);
+  }
+  else{
+    unsigned char   *dc_qis;
+    const ptrdiff_t *coded_fragis;
+    ptrdiff_t        ncoded_fragis;
+    ptrdiff_t        fragii;
+    unsigned char    qi0;
+    /*Update the DC quantization index of each coded block.*/
+    dc_qis=_dec->dc_qis;
+    coded_fragis=_dec->state.coded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[0]+
+     _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2];
+    qi0=(unsigned char)_dec->state.qis[0];
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      dc_qis[coded_fragis[fragii]]=qi0;
+    }
+  }
+  /*pp_level 1: Stop after updating DC quantization indices.*/
+  if(_dec->pp_level<=OC_PP_LEVEL_TRACKDCQI){
+    if(_dec->variances!=NULL){
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->variances==NULL){
+    size_t frame_sz;
+    size_t c_sz;
+    int    c_w;
+    int    c_h;
+    frame_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
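+    /*Bit 0 of pixel_fmt clear means chroma is decimated horizontally, and
+       bit 1 clear means it is decimated vertically; 4:2:0 has both clear.*/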
+    c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+    c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+    c_sz=c_w*(size_t)c_h;
+    /*Allocate space for the chroma planes, even if we're not going to use
+       them; this simplifies allocation state management, though it may waste
+       memory on the few systems that don't overcommit pages.*/
+    frame_sz+=c_sz<<1;
+    _dec->pp_frame_data=(unsigned char *)_ogg_malloc(
+     frame_sz*sizeof(_dec->pp_frame_data[0]));
+    _dec->variances=(int *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->variances[0]));
+    if(_dec->variances==NULL||_dec->pp_frame_data==NULL){
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      return 1;
+    }
+    /*Force an update of the PP buffer pointers.*/
+    _dec->pp_frame_state=0;
+  }
+  /*Update the PP buffer pointers if necessary.*/
+  if(_dec->pp_frame_state!=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC)){
+    if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+      /*If chroma processing is disabled, just use the PP luma plane.*/
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=-_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data+
+       (1-_dec->pp_frame_buf[0].height)*(ptrdiff_t)_dec->pp_frame_buf[0].stride;
+    }
+    else{
+      size_t y_sz;
+      size_t c_sz;
+      int    c_w;
+      int    c_h;
+      /*Otherwise, set up pointers to all three PP planes.*/
+      y_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
+      c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+      c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+      c_sz=c_w*(size_t)c_h;
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data;
+      _dec->pp_frame_buf[1].width=c_w;
+      _dec->pp_frame_buf[1].height=c_h;
+      _dec->pp_frame_buf[1].stride=_dec->pp_frame_buf[1].width;
+      _dec->pp_frame_buf[1].data=_dec->pp_frame_buf[0].data+y_sz;
+      _dec->pp_frame_buf[2].width=c_w;
+      _dec->pp_frame_buf[2].height=c_h;
+      _dec->pp_frame_buf[2].stride=_dec->pp_frame_buf[2].width;
+      _dec->pp_frame_buf[2].data=_dec->pp_frame_buf[1].data+c_sz;
+      oc_ycbcr_buffer_flip(_dec->pp_frame_buf,_dec->pp_frame_buf);
+    }
+    _dec->pp_frame_state=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC);
+  }
+  /*If we're not processing chroma, copy the reference frame's chroma planes.*/
+  if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+    memcpy(_dec->pp_frame_buf+1,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]]+1,
+     sizeof(_dec->pp_frame_buf[1])*2);
+  }
+  return 0;
+}
+
+
+/*Initialize the main decoding pipeline.*/
+static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe){
+  const ptrdiff_t *coded_fragis;
+  const ptrdiff_t *uncoded_fragis;
+  int              flimit;
+  int              pli;
+  int              qii;
+  int              qti;
+  int              zzi;
+  /*If chroma is sub-sampled in the vertical direction, we have to decode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
+  /*Initialize the token and extra bits indices for each plane and
+     coefficient.*/
+  memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti));
+  /*Also copy over the initial EOB run counts.*/
+  memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.*/
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
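+  /*The uncoded fragments are stored in the same list, filled in backwards
+     from the end, so each plane's pointer starts just past its entries.*/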
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ncoded_fragis;
+    _pipe->coded_fragis[pli]=coded_fragis;
+    _pipe->uncoded_fragis[pli]=uncoded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[pli];
+    coded_fragis+=ncoded_fragis;
+    uncoded_fragis+=ncoded_fragis-_dec->state.fplanes[pli].nfrags;
+  }
+  /*Set up condensed quantizer tables.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_dec->state.nqis;qii++){
+      for(qti=0;qti<2;qti++){
+        _pipe->dequant[pli][qii][qti]=
+         _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
+      }
+    }
+  }
+  /*Set the previous DC predictor to 0 for all color planes and frame types.*/
+  memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
+  /*Initialize the bounding value array for the loop filter.*/
+  flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+  _pipe->loop_filter=flimit!=0;
+  if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
+  /*Initialize any buffers needed for post-processing.
+    We also save the current post-processing level, to guard against the user
+     changing it from a callback.*/
+  if(!oc_dec_postprocess_init(_dec))_pipe->pp_level=_dec->pp_level;
+  /*If we don't have enough information to post-process, disable it, regardless
+     of the user-requested level.*/
+  else{
+    _pipe->pp_level=OC_PP_LEVEL_DISABLED;
+    memcpy(_dec->pp_frame_buf,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
+     sizeof(_dec->pp_frame_buf[0])*3);
+  }
+  /*Clear down the DCT coefficient buffer for the first block.*/
+  for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
+}
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+   rows).
+  As a side effect, the number of coded and uncoded fragments in this plane of
+   the MCU is also computed.*/
+void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frags;
+  int                     *pred_last;
+  ptrdiff_t                ncoded_fragis;
+  ptrdiff_t                fragi;
+  int                      fragx;
+  int                      fragy;
+  int                      fragy0;
+  int                      fragy_end;
+  int                      nhfrags;
+  /*Compute the first and last fragment row of the current MCU for this
+     plane.*/
+  fplane=_dec->state.fplanes+_pli;
+  fragy0=_pipe->fragy0[_pli];
+  fragy_end=_pipe->fragy_end[_pli];
+  nhfrags=fplane->nhfrags;
+  pred_last=_pipe->pred_last[_pli];
+  frags=_dec->state.frags;
+  ncoded_fragis=0;
+  fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+  for(fragy=fragy0;fragy<fragy_end;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int refi;
+          refi=frags[fragi].refi;
+          pred_last[refi]=frags[fragi].dc+=pred_last[refi];
+          ncoded_fragis++;
+        }
+      }
+    }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].refi;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else ur_ref=u_frags[fragi+1].refi;
+        if(frags[fragi].coded){
+          int pred;
+          int refi;
+          refi=frags[fragi].refi;
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==refi)|(ul_ref==refi)<<1|
+           (u_ref==refi)<<2|(ur_ref==refi)<<3){
+            default:pred=pred_last[refi];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              /*The TI compiler mis-compiles this line.*/
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          pred_last[refi]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=refi;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  /*Also save the number of uncoded fragments so we know how many to copy.*/
+  _pipe->nuncoded_fragis[_pli]=
+   (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+/*Reconstructs all coded fragments in a single MCU (one or two super block
+   rows).
+  This requires that each coded fragment have a proper macro block mode and
+   motion vector (if not in INTRA mode), and have its DC value decoded, with
+   the DC prediction process reversed, and the number of coded and uncoded
+   fragments in this plane of the MCU be counted.
+  The token lists for each color plane and coefficient should also be filled
+   in, along with initial token offsets, extra bits offsets, and EOB run
+   counts.*/
+static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  unsigned char       *dct_tokens;
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t         dc_quant[2];
+  const oc_fragment   *frags;
+  const ptrdiff_t     *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragii;
+  ptrdiff_t           *ti;
+  ptrdiff_t           *eob_runs;
+  int                  qti;
+  dct_tokens=_dec->dct_tokens;
+  dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
+  frags=_dec->state.frags;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  ti=_pipe->ti[_pli];
+  eob_runs=_pipe->eob_runs[_pli];
+  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
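+  /*The DC coefficient is always dequantized with the first qi; per-block
+     qii only selects the AC quantizer.*/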
+  for(fragii=0;fragii<ncoded_fragis;fragii++){
+    const ogg_uint16_t *ac_quant;
+    ptrdiff_t           fragi;
+    int                 last_zzi;
+    int                 zzi;
+    fragi=coded_fragis[fragii];
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+    ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
+    /*Decode the AC coefficients.*/
+    for(zzi=0;zzi<64;){
+      int token;
+      last_zzi=zzi;
+      if(eob_runs[zzi]){
+        eob_runs[zzi]--;
+        break;
+      }
+      else{
+        ptrdiff_t eob;
+        int       cw;
+        int       rlen;
+        int       coeff;
+        int       lti;
+        lti=ti[zzi];
+        token=dct_tokens[lti++];
+        cw=OC_DCT_CODE_WORD[token];
+        /*These parts could be done branchless, but the branches are fairly
+           predictable and the branchless C code translates into more than a
+           few instructions, so it's worth keeping the branches.*/
+        if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+          cw+=dct_tokens[lti++]<<OC_DCT_TOKEN_EB_POS(token);
+        }
+        eob=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+        if(token==OC_DCT_TOKEN_FAT_EOB){
+          eob+=dct_tokens[lti++]<<8;
+          if(eob==0)eob=OC_DCT_EOB_FINISH;
+        }
+        rlen=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        eob_runs[zzi]=eob;
+        ti[zzi]=lti;
+        zzi+=rlen;
+        _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+         (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        zzi+=!eob;
+      }
+    }
+    /*TODO: zzi should be exactly 64 here.
+      If it's not, we should report some kind of warning.*/
+    zzi=OC_MINI(zzi,64);
+    _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+    /*last_zzi is always initialized.
+      If your compiler thinks otherwise, it is dumb.*/
+    oc_state_frag_recon(&_dec->state,fragi,_pli,
+     _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
+  }
+  _pipe->coded_fragis[_pli]+=ncoded_fragis;
+  /*Right now the reconstructed MCU has only the coded blocks in it.*/
+  /*TODO: We make the decision here to always copy the uncoded blocks into it
+     from the reference frame.
+    We could also copy the coded blocks back over the reference frame, if we
+     wait for an additional MCU to be decoded, which might be faster if only a
+     small number of blocks are coded.
+    However, this introduces more latency, creating a larger cache footprint.
+    It's unknown which decision is better, but this one results in simpler
+     code, and the hard case (high bitrate, high resolution) is handled
+     correctly.*/
+  /*Copy the uncoded blocks from the previous reference frame.*/
+  if(_pipe->nuncoded_fragis[_pli]>0){
+    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+    oc_frag_copy_list(&_dec->state,
+     _dec->state.ref_frame_data[OC_FRAME_SELF],
+     _dec->state.ref_frame_data[OC_FRAME_PREV],
+     _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+     _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+  }
+}
+
+/*Filter a horizontal block edge.*/
+static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,int _qstep,int _flimit,
+ int *_variance0,int *_variance1){
+  unsigned char       *rdst;
+  const unsigned char *rsrc;
+  unsigned char       *cdst;
+  const unsigned char *csrc;
+  int                  r[10];
+  int                  sum0;
+  int                  sum1;
+  int                  bx;
+  int                  by;
+  rdst=_dst;
+  rsrc=_src;
+  for(bx=0;bx<8;bx++){
+    cdst=rdst;
+    csrc=rsrc;
+    for(by=0;by<10;by++){
+      r[by]=*csrc;
+      csrc+=_src_ystride;
+    }
+    sum0=sum1=0;
+    for(by=0;by<4;by++){
+      sum0+=abs(r[by+1]-r[by]);
+      sum1+=abs(r[by+5]-r[by+6]);
+    }
+    *_variance0+=OC_MINI(255,sum0);
+    *_variance1+=OC_MINI(255,sum1);
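+    /*Only filter when both sides of the edge are smooth and the step across
+       it stays below the quantizer-derived threshold.*/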
+    if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
+      *cdst=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+      cdst+=_dst_ystride;
+      *cdst=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+      cdst+=_dst_ystride;
+      for(by=0;by<4;by++){
+        *cdst=(unsigned char)(r[by]+r[by+1]+r[by+2]+r[by+3]*2+
+         r[by+4]+r[by+5]+r[by+6]+4>>3);
+        cdst+=_dst_ystride;
+      }
+      *cdst=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+      cdst+=_dst_ystride;
+      *cdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+    }
+    else{
+      for(by=1;by<=8;by++){
+        *cdst=(unsigned char)r[by];
+        cdst+=_dst_ystride;
+      }
+    }
+    rdst++;
+    rsrc++;
+  }
+}
+
+/*Filter a vertical block edge.*/
+static void oc_filter_vedge(unsigned char *_dst,int _dst_ystride,
+ int _qstep,int _flimit,int *_variances){
+  unsigned char       *rdst;
+  const unsigned char *rsrc;
+  unsigned char       *cdst;
+  int                  r[10];
+  int                  sum0;
+  int                  sum1;
+  int                  bx;
+  int                  by;
+  cdst=_dst;
+  for(by=0;by<8;by++){
+    rsrc=cdst-1;
+    rdst=cdst;
+    for(bx=0;bx<10;bx++)r[bx]=*rsrc++;
+    sum0=sum1=0;
+    for(bx=0;bx<4;bx++){
+      sum0+=abs(r[bx+1]-r[bx]);
+      sum1+=abs(r[bx+5]-r[bx+6]);
+    }
+    _variances[0]+=OC_MINI(255,sum0);
+    _variances[1]+=OC_MINI(255,sum1);
+    if(sum0<_flimit&&sum1<_flimit&&r[5]-r[4]<_qstep&&r[4]-r[5]<_qstep){
+      *rdst++=(unsigned char)(r[0]*3+r[1]*2+r[2]+r[3]+r[4]+4>>3);
+      *rdst++=(unsigned char)(r[0]*2+r[1]+r[2]*2+r[3]+r[4]+r[5]+4>>3);
+      for(bx=0;bx<4;bx++){
+        *rdst++=(unsigned char)(r[bx]+r[bx+1]+r[bx+2]+r[bx+3]*2+
+         r[bx+4]+r[bx+5]+r[bx+6]+4>>3);
+      }
+      *rdst++=(unsigned char)(r[4]+r[5]+r[6]+r[7]*2+r[8]+r[9]*2+4>>3);
+      *rdst=(unsigned char)(r[5]+r[6]+r[7]+r[8]*2+r[9]*3+4>>3);
+    }
+    cdst+=_dst_ystride;
+  }
+}
+
+static void oc_dec_deblock_frag_rows(oc_dec_ctx *_dec,
+ th_img_plane *_dst,th_img_plane *_src,int _pli,int _fragy0,
+ int _fragy_end){
+  oc_fragment_plane   *fplane;
+  int                 *variance;
+  unsigned char       *dc_qi;
+  unsigned char       *dst;
+  const unsigned char *src;
+  ptrdiff_t            froffset;
+  int                  dst_ystride;
+  int                  src_ystride;
+  int                  nhfrags;
+  int                  width;
+  int                  notstart;
+  int                  notdone;
+  int                  flimit;
+  int                  qstep;
+  int                  y_end;
+  int                  y;
+  int                  x;
+  _dst+=_pli;
+  _src+=_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  dc_qi=_dec->dc_qis+froffset;
+  notstart=_fragy0>0;
+  notdone=_fragy_end<fplane->nvfrags;
+  /*We want to clear an extra row of variances, except at the end.*/
+  memset(variance+(nhfrags&-notstart),0,
+   (_fragy_end+notdone-_fragy0-notstart)*(nhfrags*sizeof(variance[0])));
+  /*Except for the first time, we want to point to the middle of the row.*/
+  y=(_fragy0<<3)+(notstart<<2);
+  dst_ystride=_dst->stride;
+  src_ystride=_src->stride;
+  dst=_dst->data+y*(ptrdiff_t)dst_ystride;
+  src=_src->data+y*(ptrdiff_t)src_ystride;
+  width=_dst->width;
+  for(;y<4;y++){
+    memcpy(dst,src,width*sizeof(dst[0]));
+    dst+=dst_ystride;
+    src+=src_ystride;
+  }
+  /*We also want to skip the last row in the frame for this loop.*/
+  y_end=_fragy_end-!notdone<<3;
+  for(;y<y_end;y+=8){
+    qstep=_dec->pp_dc_scale[*dc_qi];
+    flimit=(qstep*3)>>2;
+    oc_filter_hedge(dst,dst_ystride,src-src_ystride,src_ystride,
+     qstep,flimit,variance,variance+nhfrags);
+    variance++;
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi];
+      flimit=(qstep*3)>>2;
+      oc_filter_hedge(dst+x,dst_ystride,src+x-src_ystride,src_ystride,
+       qstep,flimit,variance,variance+nhfrags);
+      oc_filter_vedge(dst+x-(dst_ystride<<2)-4,dst_ystride,
+       qstep,flimit,variance-1);
+      variance++;
+      dc_qi++;
+    }
+    dst+=dst_ystride<<3;
+    src+=src_ystride<<3;
+  }
+  /*And finally, handle the last row in the frame, if it's in the range.*/
+  if(!notdone){
+    int height;
+    height=_dst->height;
+    for(;y<height;y++){
+      memcpy(dst,src,width*sizeof(dst[0]));
+      dst+=dst_ystride;
+      src+=src_ystride;
+    }
+    /*Filter the last row of vertical block edges.*/
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi++];
+      flimit=(qstep*3)>>2;
+      oc_filter_vedge(dst+x-(dst_ystride<<3)-4,dst_ystride,
+       qstep,flimit,variance++);
+    }
+  }
+}
+
+static void oc_dering_block(unsigned char *_idata,int _ystride,int _b,
+ int _dc_scale,int _sharp_mod,int _strong){
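+  /*_b holds border flags: bit 0 set means this block lies on the left edge
+     of the image, bit 1 the right, bit 2 the top and bit 3 the bottom.
+    These keep the filter taps from reading outside the frame.*/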
+  static const unsigned char OC_MOD_MAX[2]={24,32};
+  static const unsigned char OC_MOD_SHIFT[2]={1,0};
+  const unsigned char *psrc;
+  const unsigned char *src;
+  const unsigned char *nsrc;
+  unsigned char       *dst;
+  int                  vmod[72];
+  int                  hmod[72];
+  int                  mod_hi;
+  int                  by;
+  int                  bx;
+  mod_hi=OC_MINI(3*_dc_scale,OC_MOD_MAX[_strong]);
+  dst=_idata;
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4));
+  for(by=0;by<9;by++){
+    for(bx=0;bx<8;bx++){
+      int mod;
+      mod=32+_dc_scale-(abs(src[bx]-psrc[bx])<<OC_MOD_SHIFT[_strong]);
+      vmod[(by<<3)+bx]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+    }
+    psrc=src;
+    src+=_ystride&-(!(_b&8)|by<7);
+  }
+  nsrc=dst;
+  psrc=dst-!(_b&1);
+  for(bx=0;bx<9;bx++){
+    src=nsrc;
+    for(by=0;by<8;by++){
+      int mod;
+      mod=32+_dc_scale-(abs(*src-*psrc)<<OC_MOD_SHIFT[_strong]);
+      hmod[(bx<<3)+by]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+      psrc+=_ystride;
+      src+=_ystride;
+    }
+    psrc=nsrc;
+    nsrc+=!(_b&2)|bx<7;
+  }
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4));
+  nsrc=src+_ystride;
+  for(by=0;by<8;by++){
+    int a;
+    int b;
+    int w;
+    a=128;
+    b=64;
+    w=hmod[by];
+    a-=w;
+    b+=w**(src-!(_b&1));
+    w=vmod[by<<3];
+    a-=w;
+    b+=w*psrc[0];
+    w=vmod[by+1<<3];
+    a-=w;
+    b+=w*nsrc[0];
+    w=hmod[(1<<3)+by];
+    a-=w;
+    b+=w*src[1];
+    dst[0]=OC_CLAMP255(a*src[0]+b>>7);
+    for(bx=1;bx<7;bx++){
+      a=128;
+      b=64;
+      w=hmod[(bx<<3)+by];
+      a-=w;
+      b+=w*src[bx-1];
+      w=vmod[(by<<3)+bx];
+      a-=w;
+      b+=w*psrc[bx];
+      w=vmod[(by+1<<3)+bx];
+      a-=w;
+      b+=w*nsrc[bx];
+      w=hmod[(bx+1<<3)+by];
+      a-=w;
+      b+=w*src[bx+1];
+      dst[bx]=OC_CLAMP255(a*src[bx]+b>>7);
+    }
+    a=128;
+    b=64;
+    w=hmod[(7<<3)+by];
+    a-=w;
+    b+=w*src[6];
+    w=vmod[(by<<3)+7];
+    a-=w;
+    b+=w*psrc[7];
+    w=vmod[(by+1<<3)+7];
+    a-=w;
+    b+=w*nsrc[7];
+    w=hmod[(8<<3)+by];
+    a-=w;
+    b+=w*src[7+!(_b&2)];
+    dst[7]=OC_CLAMP255(a*src[7]+b>>7);
+    dst+=_ystride;
+    psrc=src;
+    src=nsrc;
+    nsrc+=_ystride&-(!(_b&8)|by<6);
+  }
+}
+
+#define OC_DERING_THRESH1 (384)
+#define OC_DERING_THRESH2 (4*OC_DERING_THRESH1)
+#define OC_DERING_THRESH3 (5*OC_DERING_THRESH1)
+#define OC_DERING_THRESH4 (10*OC_DERING_THRESH1)
+
+static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img,
+ int _pli,int _fragy0,int _fragy_end){
+  th_img_plane      *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag;
+  int               *variance;
+  unsigned char     *idata;
+  ptrdiff_t          froffset;
+  int                ystride;
+  int                nhfrags;
+  int                sthresh;
+  int                strong;
+  int                y_end;
+  int                width;
+  int                height;
+  int                y;
+  int                x;
+  iplane=_img+_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  frag=_dec->state.frags+froffset;
+  strong=_dec->pp_level>=(_pli?OC_PP_LEVEL_SDERINGC:OC_PP_LEVEL_SDERINGY);
+  sthresh=_pli?OC_DERING_THRESH4:OC_DERING_THRESH3;
+  y=_fragy0<<3;
+  ystride=iplane->stride;
+  idata=iplane->data+y*(ptrdiff_t)ystride;
+  y_end=_fragy_end<<3;
+  width=iplane->width;
+  height=iplane->height;
+  for(;y<y_end;y+=8){
+    for(x=0;x<width;x+=8){
+      int b;
+      int qi;
+      int var;
+      qi=_dec->state.qis[frag->qii];
+      var=*variance;
+      b=(x<=0)|(x+8>=width)<<1|(y<=0)<<2|(y+8>=height)<<3;
+      if(strong&&var>sthresh){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        if(_pli||!(b&1)&&*(variance-1)>OC_DERING_THRESH4||
+         !(b&2)&&variance[1]>OC_DERING_THRESH4||
+         !(b&4)&&*(variance-nhfrags)>OC_DERING_THRESH4||
+         !(b&8)&&variance[nhfrags]>OC_DERING_THRESH4){
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        }
+      }
+      else if(var>OC_DERING_THRESH2){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+      }
+      else if(var>OC_DERING_THRESH1){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],0);
+      }
+      frag++;
+      variance++;
+    }
+    idata+=ystride<<3;
+  }
+}
+
+
+
+th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
+  oc_dec_ctx *dec;
+  if(_info==NULL||_setup==NULL)return NULL;
+  dec=oc_aligned_malloc(sizeof(*dec),16);
+  if(dec==NULL||oc_dec_init(dec,_info,_setup)<0){
+    oc_aligned_free(dec);
+    return NULL;
+  }
+  dec->state.curframe_num=0;
+  return dec;
+}
+
+void th_decode_free(th_dec_ctx *_dec){
+  if(_dec!=NULL){
+    oc_dec_clear(_dec);
+    oc_aligned_free(_dec);
+  }
+}
+
+int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
+ size_t _buf_sz){
+  switch(_req){
+  case TH_DECCTL_GET_PPLEVEL_MAX:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    (*(int *)_buf)=OC_PP_LEVEL_MAX;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_PPLEVEL:{
+    int pp_level;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    pp_level=*(int *)_buf;
+    if(pp_level<0||pp_level>OC_PP_LEVEL_MAX)return TH_EINVAL;
+    _dec->pp_level=pp_level;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_GRANPOS:{
+    ogg_int64_t granpos;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(ogg_int64_t))return TH_EINVAL;
+    granpos=*(ogg_int64_t *)_buf;
+    if(granpos<0)return TH_EINVAL;
+    _dec->state.granpos=granpos;
+    _dec->state.keyframe_num=(granpos>>_dec->state.info.keyframe_granule_shift)
+     -_dec->state.granpos_bias;
+    _dec->state.curframe_num=_dec->state.keyframe_num
+     +(granpos&(1<<_dec->state.info.keyframe_granule_shift)-1);
+    return 0;
+  }break;
+  case TH_DECCTL_SET_STRIPE_CB:{
+    th_stripe_callback *cb;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(th_stripe_callback))return TH_EINVAL;
+    cb=(th_stripe_callback *)_buf;
+    _dec->stripe_cb.ctx=cb->ctx;
+    _dec->stripe_cb.stripe_decoded=cb->stripe_decoded;
+    return 0;
+  }break;
+#ifdef HAVE_CAIRO
+  case TH_DECCTL_SET_TELEMETRY_MBMODE:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mbmode=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_MV:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mv=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_QI:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_qi=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_BITS:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_bits=*(int *)_buf;
+    return 0;
+  }break;
+#endif
+  default:return TH_EIMPL;
+  }
+}
+
+/*We're decoding an INTER frame, but have no initialized reference
+   buffers (i.e., decoding did not start on a key frame).
+  We initialize them to a solid gray here.*/
+static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
+  th_info   *info;
+  size_t     yplane_sz;
+  size_t     cplane_sz;
+  ptrdiff_t  yoffset;
+  int        yhstride;
+  int        yheight;
+  int        chstride;
+  int        cheight;
+  _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_SELF]=0;
+  _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+   _dec->state.ref_frame_data[OC_FRAME_PREV]=
+   _dec->state.ref_frame_data[OC_FRAME_SELF]=
+   _dec->state.ref_frame_bufs[0][0].data;
+  memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0],
+   sizeof(_dec->pp_frame_buf[0])*3);
+  info=&_dec->state.info;
+  yhstride=abs(_dec->state.ref_ystride[0]);
+  yheight=info->frame_height+2*OC_UMV_PADDING;
+  chstride=abs(_dec->state.ref_ystride[1]);
+  cheight=yheight>>!(info->pixel_fmt&2);
+  yplane_sz=yhstride*(size_t)yheight+16;
+  cplane_sz=chstride*(size_t)cheight;
+  yoffset=yhstride*(ptrdiff_t)(yheight-OC_UMV_PADDING-1)+OC_UMV_PADDING;
+  memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz);
+}
+
+#if defined(HAVE_CAIRO)
+static void oc_render_telemetry(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr,
+ int _telemetry){
+  /*Stuff the plane into cairo.*/
+  cairo_surface_t *cs;
+  unsigned char   *data;
+  unsigned char   *y_row;
+  unsigned char   *u_row;
+  unsigned char   *v_row;
+  unsigned char   *rgb_row;
+  int              cstride;
+  int              w;
+  int              h;
+  int              x;
+  int              y;
+  int              hdec;
+  int              vdec;
+  w=_ycbcr[0].width;
+  h=_ycbcr[0].height;
+  hdec=!(_dec->state.info.pixel_fmt&1);
+  vdec=!(_dec->state.info.pixel_fmt&2);
+  /*Lazy data buffer init.
+    We could try to re-use the post-processing buffer, which would save
+     memory, but complicate the allocation logic there.
+    I don't think anyone cares about memory usage when using telemetry; it is
+     not meant for embedded devices.*/
+  if(_dec->telemetry_frame_data==NULL){
+    _dec->telemetry_frame_data=_ogg_malloc(
+     (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data));
+    if(_dec->telemetry_frame_data==NULL)return;
+  }
+  cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h);
+  /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/
+  data=cairo_image_surface_get_data(cs);
+  if(data==NULL){
+    cairo_surface_destroy(cs);
+    return;
+  }
+  cstride=cairo_image_surface_get_stride(cs);
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  for(y=0;y<h;y++){
+    for(x=0;x<w;x++){
+      int r;
+      int g;
+      int b;
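+      /*Fixed-point Rec. 601 (video-range) Y'CbCr to RGB conversion for the
+         Cairo RGB24 surface.*/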
+      r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200;
+      g=(3827562*y_row[x]-1287801*u_row[x>>hdec]
+       -2672387*v_row[x>>hdec]+447306710)/3287200;
+      b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600;
+      rgb_row[4*x+0]=OC_CLAMP255(b);
+      rgb_row[4*x+1]=OC_CLAMP255(g);
+      rgb_row[4*x+2]=OC_CLAMP255(r);
+    }
+    y_row+=_ycbcr[0].stride;
+    u_row+=_ycbcr[1].stride&-((y&1)|!vdec);
+    v_row+=_ycbcr[2].stride&-((y&1)|!vdec);
+    rgb_row+=cstride;
+  }
+  /*Draw coded identifier for each macroblock (stored in Hilbert order).*/
+  {
+    cairo_t           *c;
+    const oc_fragment *frags;
+    oc_mv             *frag_mvs;
+    const signed char *mb_modes;
+    oc_mb_map         *mb_maps;
+    size_t             nmbs;
+    size_t             mbi;
+    int                row2;
+    int                col2;
+    int                qim[3]={0,0,0};
+    if(_dec->state.nqis==2){
+      int bqi;
+      bqi=_dec->state.qis[0];
+      if(_dec->state.qis[1]>bqi)qim[1]=1;
+      if(_dec->state.qis[1]<bqi)qim[1]=-1;
+    }
+    if(_dec->state.nqis==3){
+      int bqi;
+      int cqi;
+      int dqi;
+      bqi=_dec->state.qis[0];
+      cqi=_dec->state.qis[1];
+      dqi=_dec->state.qis[2];
+      if(cqi>bqi&&dqi>bqi){
+        if(dqi>cqi){
+          qim[1]=1;
+          qim[2]=2;
+        }
+        else{
+          qim[1]=2;
+          qim[2]=1;
+        }
+      }
+      else if(cqi<bqi&&dqi<bqi){
+        if(dqi<cqi){
+          qim[1]=-1;
+          qim[2]=-2;
+        }
+        else{
+          qim[1]=-2;
+          qim[2]=-1;
+        }
+      }
+      else{
+        if(cqi<bqi)qim[1]=-1;
+        else qim[1]=1;
+        if(dqi<bqi)qim[2]=-1;
+        else qim[2]=1;
+      }
+    }
+    c=cairo_create(cs);
+    frags=_dec->state.frags;
+    frag_mvs=_dec->state.frag_mvs;
+    mb_modes=_dec->state.mb_modes;
+    mb_maps=_dec->state.mb_maps;
+    nmbs=_dec->state.nmbs;
+    row2=0;
+    col2=0;
+    for(mbi=0;mbi<nmbs;mbi++){
+      float x;
+      float y;
+      int   bi;
+      y=h-(row2+((col2+1>>1)&1))*16-16;
+      x=(col2>>1)*16;
+      cairo_set_line_width(c,1.);
+      /*Keyframe (all intra) red box.*/
+      if(_dec->state.frame_type==OC_INTRA_FRAME){
+        if(_dec->telemetry_mbmode&0x02){
+          cairo_set_source_rgba(c,1.,0,0,.5);
+          cairo_rectangle(c,x+2.5,y+2.5,11,11);
+          cairo_stroke_preserve(c);
+          cairo_set_source_rgba(c,1.,0,0,.25);
+          cairo_fill(c);
+        }
+      }
+      else{
+        ptrdiff_t fragi;
+        int       frag_mvx;
+        int       frag_mvy;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            frag_mvx=OC_MV_X(frag_mvs[fragi]);
+            frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+            break;
+          }
+        }
+        if(bi<4){
+          switch(mb_modes[mbi]){
+            case OC_MODE_INTRA:{
+              if(_dec->telemetry_mbmode&0x02){
+                cairo_set_source_rgba(c,1.,0,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,0,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_NOMV:{
+              if(_dec->telemetry_mbmode&0x01){
+                cairo_set_source_rgba(c,0,0,1.,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,0,0,1.,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV:{
+              if(_dec->telemetry_mbmode&0x04){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x04){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST:{
+              if(_dec->telemetry_mbmode&0x08){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x08){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              if(_dec->telemetry_mbmode&0x10){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_move_to(c,x+8,y+2.5);
+                cairo_line_to(c,x+2.5,y+8);
+                cairo_line_to(c,x+8,y+13.5);
+                cairo_move_to(c,x+13.5,y+2.5);
+                cairo_line_to(c,x+8,y+8);
+                cairo_line_to(c,x+13.5,y+13.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x10){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_NOMV:{
+              if(_dec->telemetry_mbmode&0x20){
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,0,.25);
+                cairo_fill(c);
+              }
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              if(_dec->telemetry_mbmode&0x40){
+                cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                cairo_set_source_rgba(c,1.,1.,0,.5);
+                cairo_stroke(c);
+              }
+              if(_dec->telemetry_mv&0x40){
+                cairo_move_to(c,x+8+frag_mvx,y+8-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+8+frag_mvx*.66,y+8-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+8+frag_mvx*.33,y+8-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+8,y+8);
+                cairo_stroke(c);
+              }
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              if(_dec->telemetry_mbmode&0x80){
+                cairo_rectangle(c,x+2.5,y+2.5,4,4);
+                cairo_rectangle(c,x+9.5,y+2.5,4,4);
+                cairo_rectangle(c,x+2.5,y+9.5,4,4);
+                cairo_rectangle(c,x+9.5,y+9.5,4,4);
+                cairo_set_source_rgba(c,0,1.,0,.5);
+                cairo_stroke(c);
+              }
+              /*4mv is odd, coded in raster order.*/
+              fragi=mb_maps[mbi][0][0];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][1];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+12-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+12-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+12-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+12);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][2];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+4+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+4+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+4+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+4,y+4);
+                cairo_stroke(c);
+              }
+              fragi=mb_maps[mbi][0][3];
+              if(frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                frag_mvx=OC_MV_X(frag_mvs[fragi]);
+                frag_mvy=OC_MV_Y(frag_mvs[fragi]);
+                cairo_move_to(c,x+12+frag_mvx,y+4-frag_mvy);
+                cairo_set_source_rgba(c,1.,1.,1.,.9);
+                cairo_set_line_width(c,3.);
+                cairo_line_to(c,x+12+frag_mvx*.66,y+4-frag_mvy*.66);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,2.);
+                cairo_line_to(c,x+12+frag_mvx*.33,y+4-frag_mvy*.33);
+                cairo_stroke_preserve(c);
+                cairo_set_line_width(c,1.);
+                cairo_line_to(c,x+12,y+4);
+                cairo_stroke(c);
+              }
+            }break;
+          }
+        }
+      }
+      /*qii illustration.*/
+      if(_dec->telemetry_qi&0x2){
+        cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          int       qiv;
+          int       xp;
+          int       yp;
+          xp=x+(bi&1)*8;
+          yp=y+8-(bi&2)*4;
+          fragi=mb_maps[mbi][0][bi];
+          if(fragi>=0&&frags[fragi].coded){
+            qiv=qim[frags[fragi].qii];
+            cairo_set_line_width(c,3.);
+            cairo_set_source_rgba(c,0.,0.,0.,.5);
+            switch(qiv){
+              /*Double plus:*/
+              case 2:{
+                if((bi&1)^((bi&2)>>1)){
+                  cairo_move_to(c,xp+2.5,yp+1.5);
+                  cairo_line_to(c,xp+2.5,yp+3.5);
+                  cairo_move_to(c,xp+1.5,yp+2.5);
+                  cairo_line_to(c,xp+3.5,yp+2.5);
+                  cairo_move_to(c,xp+5.5,yp+4.5);
+                  cairo_line_to(c,xp+5.5,yp+6.5);
+                  cairo_move_to(c,xp+4.5,yp+5.5);
+                  cairo_line_to(c,xp+6.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+                else{
+                  cairo_move_to(c,xp+5.5,yp+1.5);
+                  cairo_line_to(c,xp+5.5,yp+3.5);
+                  cairo_move_to(c,xp+4.5,yp+2.5);
+                  cairo_line_to(c,xp+6.5,yp+2.5);
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+2.5,yp+6.5);
+                  cairo_move_to(c,xp+1.5,yp+5.5);
+                  cairo_line_to(c,xp+3.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }
+              }break;
+              /*Double minus:*/
+              case -2:{
+                cairo_move_to(c,xp+2.5,yp+2.5);
+                cairo_line_to(c,xp+5.5,yp+2.5);
+                cairo_move_to(c,xp+2.5,yp+5.5);
+                cairo_line_to(c,xp+5.5,yp+5.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,1.,1.,1.);
+              }break;
+              /*Plus:*/
+              case 1:{
+                if((bi&2)==0)yp-=2;
+                if((bi&1)==0)xp-=2;
+                cairo_move_to(c,xp+4.5,yp+2.5);
+                cairo_line_to(c,xp+4.5,yp+6.5);
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,.1,1.,.3,1.);
+                break;
+              }
+              /*Fall through.*/
+              /*Minus:*/
+              case -1:{
+                cairo_move_to(c,xp+2.5,yp+4.5);
+                cairo_line_to(c,xp+6.5,yp+4.5);
+                cairo_stroke_preserve(c);
+                cairo_set_source_rgba(c,1.,.3,.1,1.);
+              }break;
+              default:continue;
+            }
+            cairo_set_line_width(c,1.);
+            cairo_stroke(c);
+          }
+        }
+      }
+      col2++;
+      if((col2>>1)>=_dec->state.nhmbs){
+        col2=0;
+        row2+=2;
+      }
+    }
+    /*Bit usage indicator[s]:*/
+    if(_dec->telemetry_bits){
+      int widths[6];
+      int fpsn;
+      int fpsd;
+      int mult;
+      int fullw;
+      int padw;
+      int i;
+      fpsn=_dec->state.info.fps_numerator;
+      fpsd=_dec->state.info.fps_denominator;
+      mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
+      fullw=250.f*h*fpsd*mult/fpsn;
+      padw=w-24;
+      /*Header and coded block bits.*/
+      if(_dec->telemetry_frame_bytes<0||
+       _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
+        _dec->telemetry_frame_bytes=0;
+      }
+      if(_dec->telemetry_coding_bytes<0||
+       _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_coding_bytes=0;
+      }
+      if(_dec->telemetry_mode_bytes<0||
+       _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mode_bytes=0;
+      }
+      if(_dec->telemetry_mv_bytes<0||
+       _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_mv_bytes=0;
+      }
+      if(_dec->telemetry_qi_bytes<0||
+       _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_qi_bytes=0;
+      }
+      if(_dec->telemetry_dc_bytes<0||
+       _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
+        _dec->telemetry_dc_bytes=0;
+      }
+      widths[0]=padw*
+       (_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
+      widths[1]=padw*
+       (_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
+      widths[2]=padw*
+       (_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;
+      widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw;
+      widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw;
+      widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw;
+      for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w;
+      cairo_set_source_rgba(c,.0,.0,.0,.6);
+      cairo_rectangle(c,10,h-33,widths[0]+1,5);
+      cairo_rectangle(c,10,h-29,widths[1]+1,5);
+      cairo_rectangle(c,10,h-25,widths[2]+1,5);
+      cairo_rectangle(c,10,h-21,widths[3]+1,5);
+      cairo_rectangle(c,10,h-17,widths[4]+1,5);
+      cairo_rectangle(c,10,h-13,widths[5]+1,5);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,1,0,0);
+      cairo_rectangle(c,10.5,h-32.5,widths[0],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,1,0);
+      cairo_rectangle(c,10.5,h-28.5,widths[1],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,0,0,1);
+      cairo_rectangle(c,10.5,h-24.5,widths[2],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.6,.4,.0);
+      cairo_rectangle(c,10.5,h-20.5,widths[3],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.3,.3,.3);
+      cairo_rectangle(c,10.5,h-16.5,widths[4],4);
+      cairo_fill(c);
+      cairo_set_source_rgb(c,.5,.5,.8);
+      cairo_rectangle(c,10.5,h-12.5,widths[5],4);
+      cairo_fill(c);
+    }
+    /*Master qi indicator[s]:*/
+    if(_dec->telemetry_qi&0x1){
+      cairo_text_extents_t extents;
+      char                 buffer[10];
+      int                  p;
+      int                  y;
+      p=0;
+      y=h-7.5;
+      if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10;
+      buffer[p++]=48+_dec->state.qis[0]%10;
+      if(_dec->state.nqis>=2){
+        buffer[p++]=' ';
+        if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10;
+        buffer[p++]=48+_dec->state.qis[1]%10;
+      }
+      if(_dec->state.nqis==3){
+        buffer[p++]=' ';
+        if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10;
+        buffer[p++]=48+_dec->state.qis[2]%10;
+      }
+      buffer[p++]='\0';
+      cairo_select_font_face(c,"sans",
+       CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD);
+      cairo_set_font_size(c,18);
+      cairo_text_extents(c,buffer,&extents);
+      cairo_set_source_rgb(c,1,1,1);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_show_text(c,buffer);
+      cairo_set_source_rgb(c,0,0,0);
+      cairo_move_to(c,w-extents.x_advance-10,y);
+      cairo_text_path(c,buffer);
+      cairo_set_line_width(c,.8);
+      cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND);
+      cairo_stroke(c);
+    }
+    cairo_destroy(c);
+  }
+  /*Out of the Cairo plane into the telemetry YUV buffer.*/
+  _ycbcr[0].data=_dec->telemetry_frame_data;
+  _ycbcr[0].stride=_ycbcr[0].width;
+  _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride;
+  _ycbcr[1].stride=_ycbcr[1].width;
+  _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride;
+  _ycbcr[2].stride=_ycbcr[2].width;
+  y_row=_ycbcr[0].data;
+  u_row=_ycbcr[1].data;
+  v_row=_ycbcr[2].data;
+  rgb_row=data;
+  /*This is one of the few places it's worth handling chroma on a
+     case-by-case basis.*/
+  switch(_dec->state.info.pixel_fmt){
+    case TH_PF_420:{
+      for(y=0;y<h;y+=2){
+        unsigned char *y_row2;
+        unsigned char *rgb_row2;
+        y_row2=y_row+_ycbcr[0].stride;
+        rgb_row2=rgb_row+cstride;
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1]
+           +24966*rgb_row2[4*x+0]+4207500)/255000;
+          y_row2[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5]
+           +24966*rgb_row2[4*x+4]+4207500)/255000;
+          y_row2[x+1]=OC_CLAMP255(y);
+          u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -16436*(rgb_row[4*x+1]+rgb_row[4*x+5]
+           +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           +24808*(rgb_row[4*x+0]+rgb_row[4*x+4]
+           +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930;
+          v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6]
+           +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]
+            +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+           -6384*(rgb_row[4*x+0]+rgb_row[4*x+4]
+            +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride<<1;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride<<1;
+      }
+    }break;
+    case TH_PF_422:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x+=2){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          y_row[x]=OC_CLAMP255(y);
+          y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+           +24966*rgb_row[4*x+4]+4207500)/255000;
+          y_row[x+1]=OC_CLAMP255(y);
+          u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -32872*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930;
+          v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6])
+           -65744*(rgb_row[4*x+1]+rgb_row[4*x+5])
+           -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510;
+          u_row[x>>1]=OC_CLAMP255(u);
+          v_row[x>>1]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+    /*case TH_PF_444:*/
+    default:{
+      for(y=0;y<h;y++){
+        for(x=0;x<w;x++){
+          int y;
+          int u;
+          int v;
+          y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+           +24966*rgb_row[4*x+0]+4207500)/255000;
+          u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1]
+           +99232*rgb_row[4*x+0]+29032005)/225930;
+          v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1]
+           -25536*rgb_row[4*x+0]+45940035)/357510;
+          y_row[x]=OC_CLAMP255(y);
+          u_row[x]=OC_CLAMP255(u);
+          v_row[x]=OC_CLAMP255(v);
+        }
+        y_row+=_ycbcr[0].stride;
+        u_row+=_ycbcr[1].stride;
+        v_row+=_ycbcr[2].stride;
+        rgb_row+=cstride;
+      }
+    }break;
+  }
+  /*Finished.
+    Destroy the surface.*/
+  cairo_surface_destroy(cs);
+}
+#endif
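
The tail of the telemetry renderer above converts the Cairo ARGB surface back to Y'CbCr with fixed-point BT.601 arithmetic: the divisors 255000, 225930 and 357510 fold the scale factors, the 16/128 offsets and a +0.5 rounding term into integer constants. Below is a standalone per-pixel sketch of the same math, mirroring the TH_PF_444 branch; the helper name is hypothetical and OC_CLAMP255 is the clamping macro already used above.

static void rgb_to_ycbcr601(int _r,int _g,int _b,
 unsigned char *_y,unsigned char *_cb,unsigned char *_cr){
  int y;
  int cb;
  int cr;
  /*0.257*R+0.504*G+0.098*B+16.5, scaled by 255000.*/
  y=(65481*_r+128553*_g+24966*_b+4207500)/255000;
  /*-0.148*R-0.291*G+0.439*B+128.5, scaled by 225930.*/
  cb=(-33488*_r-65744*_g+99232*_b+29032005)/225930;
  /*0.439*R-0.368*G-0.071*B+128.5, scaled by 357510.*/
  cr=(157024*_r-131488*_g-25536*_b+45940035)/357510;
  *_y=(unsigned char)OC_CLAMP255(y);
  *_cb=(unsigned char)OC_CLAMP255(cb);
  *_cr=(unsigned char)OC_CLAMP255(cr);
}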
+
+int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
+ ogg_int64_t *_granpos){
+  int ret;
+  if(_dec==NULL||_op==NULL)return TH_EFAULT;
+  /*A completely empty packet indicates a dropped frame and is treated exactly
+     like an inter frame with no coded blocks.*/
+  if(_op->bytes==0){
+    _dec->state.frame_type=OC_INTER_FRAME;
+    _dec->state.ntotal_coded_fragis=0;
+  }
+  else{
+    oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+    ret=oc_dec_frame_header_unpack(_dec);
+    if(ret<0)return ret;
+    if(_dec->state.frame_type==OC_INTRA_FRAME)oc_dec_mark_all_intra(_dec);
+    else oc_dec_coded_flags_unpack(_dec);
+  }
+  /*If there have been no reference frames, and we need one, initialize one.*/
+  if(_dec->state.frame_type!=OC_INTRA_FRAME&&
+   (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+   _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
+    oc_dec_init_dummy_frame(_dec);
+  }
+  /*If this was an inter frame with no coded blocks...*/
+  if(_dec->state.ntotal_coded_fragis<=0){
+    /*Just update the granule position and return.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    return TH_DUPFRAME;
+  }
+  else{
+    th_ycbcr_buffer stripe_buf;
+    int             stripe_fragy;
+    int             refi;
+    int             pli;
+    int             notstart;
+    int             notdone;
+#ifdef HAVE_CAIRO
+    int             telemetry;
+    /*Save the current telemetry state.
+      This prevents it from being modified in the middle of decoding this
+       frame, which could cause us to skip calls to the striped decoding
+       callback.*/
+    telemetry=_dec->telemetry;
+#endif
+    /*Select a free buffer to use for the reconstructed version of this frame.*/
+    for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
+     refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+    _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    _dec->state.ref_frame_data[OC_FRAME_SELF]=
+     _dec->state.ref_frame_bufs[refi][0].data;
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_frame_bytes=_op->bytes;
+#endif
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      _dec->state.keyframe_num=_dec->state.curframe_num;
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=
+       _dec->telemetry_mode_bytes=
+       _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    else{
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mb_modes_unpack(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mode_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mv_unpack_and_frag_modes_fill(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    oc_dec_block_qis_unpack(_dec);
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_qi_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    oc_dec_residual_tokens_unpack(_dec);
+    /*Update granule position.
+      This must be done before the striped decode callbacks so that the
+       application knows what to do with the frame data.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    /*All of the rest of the operations -- DC prediction reversal,
+       reconstructing coded fragments, copying uncoded fragments, loop
+       filtering, extending borders, and out-of-loop post-processing -- should
+       be pipelined.
+      I.e., DC prediction reversal, reconstruction, and uncoded fragment
+       copying are done for one or two super block rows, then loop filtering is
+       run as far as it can, then bordering copying, then post-processing.
+      For 4:2:0 video a Minimum Codable Unit or MCU contains two luma super
+       block rows, and one chroma.
+      Otherwise, an MCU consists of one super block row from each plane.
+      Inside each MCU, we perform all of the steps on one color plane before
+       moving on to the next.
+      After reconstruction, the additional filtering stages introduce a delay
+       since they need some pixels from the next fragment row.
+      Thus the actual number of decoded rows available is slightly smaller for
+       the first MCU, and slightly larger for the last.
+
+      This entire process allows us to operate on the data while it is still in
+       cache, resulting in big performance improvements.
+      An application callback allows further application processing (blitting
+       to video memory, color conversion, etc.) to also use the data while it's
+       in cache.*/
+    oc_dec_pipeline_init(_dec,&_dec->pipe);
+    oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
+    notstart=0;
+    notdone=1;
+    for(stripe_fragy=0;notdone;stripe_fragy+=_dec->pipe.mcu_nvfrags){
+      int avail_fragy0;
+      int avail_fragy_end;
+      avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
+      notdone=stripe_fragy+_dec->pipe.mcu_nvfrags<avail_fragy_end;
+      for(pli=0;pli<3;pli++){
+        oc_fragment_plane *fplane;
+        int                frag_shift;
+        int                pp_offset;
+        int                sdelay;
+        int                edelay;
+        fplane=_dec->state.fplanes+pli;
+        /*Compute the first and last fragment row of the current MCU for this
+           plane.*/
+        frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
+        _dec->pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        _dec->pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         _dec->pipe.fragy0[pli]+(_dec->pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&_dec->pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&_dec->pipe,pli);
+        sdelay=edelay=0;
+        if(_dec->pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_state_loop_filter_frag_rows(&_dec->state,
+           _dec->pipe.bounding_values,OC_FRAME_SELF,pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+        }
+        /*To fill the borders, we have an additional two pixel delay, since a
+           fragment in the next row could filter its top edge, using two pixels
+           from a fragment in this row.
+          But there's no reason to delay a full fragment between the two.*/
+        oc_state_borders_fill_rows(&_dec->state,refi,pli,
+         (_dec->pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (_dec->pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+        /*Out-of-loop post-processing.*/
+        pp_offset=3*(pli!=0);
+        if(_dec->pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+          /*Perform de-blocking in one plane.*/
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
+           _dec->state.ref_frame_bufs[refi],pli,
+           _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          if(_dec->pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+            /*Perform de-ringing in one plane.*/
+            sdelay+=notstart;
+            edelay+=notdone;
+            oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
+             _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
+          }
+        }
+        /*If no post-processing is done, we still need to delay a row for the
+           loop filter, thanks to the strange filtering order VP3 chose.*/
+        else if(_dec->pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+        }
+        /*Compute the intersection of the available rows in all planes.
+          If chroma is sub-sampled, the effect of each of its delays is
+           doubled, but luma might have more post-processing filters enabled
+           than chroma, so we don't know up front which one is the limiting
+           factor.*/
+        avail_fragy0=OC_MINI(avail_fragy0,
+         _dec->pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy_end=OC_MINI(avail_fragy_end,
+         _dec->pipe.fragy_end[pli]-edelay<<frag_shift);
+      }
+#ifdef HAVE_CAIRO
+      if(_dec->stripe_cb.stripe_decoded!=NULL&&!telemetry){
+#else
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+#endif
+        /*The callback might want to use the FPU, so let's make sure they can.
+          We violate all kinds of ABI restrictions by not doing this until
+           now, but none of them actually matter since we don't use floating
+           point ourselves.*/
+        oc_restore_fpu(&_dec->state);
+        /*Make the callback, ensuring we flip the sense of the "start" and
+           "end" of the available region upside down.*/
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
+         _dec->state.fplanes[0].nvfrags-avail_fragy_end,
+         _dec->state.fplanes[0].nvfrags-avail_fragy0);
+      }
+      notstart=1;
+    }
+    /*Finish filling in the reference frame borders.*/
+    for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
+    /*Update the reference frame indices.*/
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
+    }
+    else{
+      /*Otherwise, just replace the previous reference frame.*/
+      _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
+    }
+    /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
+       gamma values, if nothing else).*/
+    oc_restore_fpu(&_dec->state);
+#ifdef HAVE_CAIRO
+    /*If telemetry ioctls are active, we need to draw to the output buffer.*/
+    if(telemetry){
+      oc_render_telemetry(_dec,stripe_buf,telemetry);
+      /*If we had a striped decoding callback, we skipped calling it above
+         (because the telemetry wasn't rendered yet).
+        Call it now with the whole frame.*/
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,
+         stripe_buf,0,_dec->state.fplanes[0].nvfrags);
+      }
+    }
+#endif
+#if defined(OC_DUMP_IMAGES)
+    /*We only dump images if there were some coded blocks.*/
+    oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
+#endif
+    return 0;
+  }
+}
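
The pipelining described above is what makes the striped-decode callback worthwhile: rows are handed to the application as soon as they are reconstructed and filtered, while the data is still in cache. A small sketch of registering such a callback follows; it assumes the th_stripe_callback struct and the TH_DECCTL_SET_STRIPE_CB request from theoradec.h (added elsewhere in this commit), and must run before the first th_decode_packetin() call.

static void on_stripe(void *_ctx,th_ycbcr_buffer _buf,
 int _yfrag0,int _yfrag_end){
  /*Fragment rows are 8 pixels tall, so roughly luma rows 8*_yfrag0 through
     8*_yfrag_end-1 of _buf are now safe to read or blit.*/
  (void)_ctx;(void)_buf;(void)_yfrag0;(void)_yfrag_end;
}

static void set_stripe_cb(th_dec_ctx *_dec){
  th_stripe_callback cb;
  cb.ctx=NULL;
  cb.stripe_decoded=on_stripe;
  th_decode_ctl(_dec,TH_DECCTL_SET_STRIPE_CB,&cb,sizeof(cb));
}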
+
+int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){
+  if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT;
+  oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf);
+  return 0;
+}
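
For reference, a minimal per-packet decode loop built only on the two entry points above; header parsing is assumed to have happened already, and display_frame() is a placeholder for whatever the application does with the planes.

static int decode_one_packet(th_dec_ctx *_dec,const ogg_packet *_op){
  th_ycbcr_buffer ycbcr;
  ogg_int64_t     granpos;
  int             ret;
  ret=th_decode_packetin(_dec,_op,&granpos);
  /*A zero-byte packet is a dropped frame: the previous output just repeats.*/
  if(ret==TH_DUPFRAME)return 0;
  if(ret<0)return ret;
  /*The planes point into the decoder's own frame buffer and should be
     consumed before the next packet is decoded.*/
  ret=th_decode_ycbcr_out(_dec,ycbcr);
  if(ret<0)return ret;
  /*display_frame(ycbcr,granpos);*/
  return 0;
}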

+ 27 - 0
modules/theoraplayer/native/theora/lib/defexp.awk

@@ -0,0 +1,27 @@
+# awk script to convert symbol export table formats
+
+# converts an msvc .def file to a darwin ld export-symbols-list file
+# we only support the most basic module definition syntax
+
+# skip comments
+/^\w*#.*/ {next}
+/^\w*;.*/ {next}
+
+# remember and propagate the library name
+/LIBRARY/ {name = $2; print "# export list for", name; next}
+
+# skip various other lines
+/^\w*NAME/ ||
+/^\w*VERSION/ ||
+/^\w*EXPORTS/ ||
+/^\w*HEAPSIZE/ ||
+/^\w*STACKSIZE/ ||
+/^\w*STUB/ {next}
+
+# todo: handle SECTIONS
+
+# for symbols, strip the semicolon and mangle the name
+/[a-zA-Z]+/ {sub(/\;/, ""); print "_" $1}
+
+# todo: warn if we see publicname=privatename mappings
+#       which other linkers don't support

+ 182 - 0
modules/theoraplayer/native/theora/lib/dequant.c

@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "dequant.h"
+#include "decint.h"
+
+int oc_quant_params_unpack(oc_pack_buf *_opb,th_quant_info *_qinfo){
+  th_quant_base *base_mats;
+  long           val;
+  int            nbase_mats;
+  int            sizes[64];
+  int            indices[64];
+  int            nbits;
+  int            bmi;
+  int            ci;
+  int            qti;
+  int            pli;
+  int            qri;
+  int            qi;
+  int            i;
+  val=oc_pack_read(_opb,3);
+  nbits=(int)val;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->loop_filter_limits[qi]=(unsigned char)val;
+  }
+  val=oc_pack_read(_opb,4);
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->ac_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,4);
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->dc_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,9);
+  nbase_mats=(int)val+1;
+  base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
+  if(base_mats==NULL)return TH_EFAULT;
+  for(bmi=0;bmi<nbase_mats;bmi++){
+    for(ci=0;ci<64;ci++){
+      val=oc_pack_read(_opb,8);
+      base_mats[bmi][ci]=(unsigned char)val;
+    }
+  }
+  nbits=oc_ilog(nbase_mats-1);
+  for(i=0;i<6;i++){
+    th_quant_ranges *qranges;
+    th_quant_base   *qrbms;
+    int             *qrsizes;
+    qti=i/3;
+    pli=i%3;
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    if(i>0){
+      val=oc_pack_read1(_opb);
+      if(!val){
+        int qtj;
+        int plj;
+        if(qti>0){
+          val=oc_pack_read1(_opb);
+          if(val){
+            qtj=qti-1;
+            plj=pli;
+          }
+          else{
+            qtj=(i-1)/3;
+            plj=(i-1)%3;
+          }
+        }
+        else{
+          qtj=(i-1)/3;
+          plj=(i-1)%3;
+        }
+        *qranges=*(_qinfo->qi_ranges[qtj]+plj);
+        continue;
+      }
+    }
+    val=oc_pack_read(_opb,nbits);
+    indices[0]=(int)val;
+    for(qi=qri=0;qi<63;){
+      val=oc_pack_read(_opb,oc_ilog(62-qi));
+      sizes[qri]=(int)val+1;
+      qi+=(int)val+1;
+      val=oc_pack_read(_opb,nbits);
+      indices[++qri]=(int)val;
+    }
+    /*Note: The caller is responsible for cleaning up any partially
+       constructed qinfo.*/
+    if(qi>63){
+      _ogg_free(base_mats);
+      return TH_EBADHEADER;
+    }
+    qranges->nranges=qri;
+    qranges->sizes=qrsizes=(int *)_ogg_malloc(qri*sizeof(qrsizes[0]));
+    if(qranges->sizes==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    memcpy(qrsizes,sizes,qri*sizeof(qrsizes[0]));
+    qrbms=(th_quant_base *)_ogg_malloc((qri+1)*sizeof(qrbms[0]));
+    if(qrbms==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    qranges->base_matrices=(const th_quant_base *)qrbms;
+    do{
+      bmi=indices[qri];
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      if(bmi>=nbase_mats){
+        _ogg_free(base_mats);
+        return TH_EBADHEADER;
+      }
+      memcpy(qrbms[qri],base_mats[bmi],sizeof(qrbms[qri]));
+    }
+    while(qri-->0);
+  }
+  _ogg_free(base_mats);
+  return 0;
+}
+
+void oc_quant_params_clear(th_quant_info *_qinfo){
+  int i;
+  for(i=6;i-->0;){
+    int qti;
+    int pli;
+    qti=i/3;
+    pli=i%3;
+    /*Clear any duplicate pointer references.*/
+    if(i>0){
+      int qtj;
+      int plj;
+      qtj=(i-1)/3;
+      plj=(i-1)%3;
+      if(_qinfo->qi_ranges[qti][pli].sizes==
+       _qinfo->qi_ranges[qtj][plj].sizes){
+        _qinfo->qi_ranges[qti][pli].sizes=NULL;
+      }
+      if(_qinfo->qi_ranges[qti][pli].base_matrices==
+       _qinfo->qi_ranges[qtj][plj].base_matrices){
+        _qinfo->qi_ranges[qti][pli].base_matrices=NULL;
+      }
+    }
+    if(qti>0){
+      if(_qinfo->qi_ranges[1][pli].sizes==
+       _qinfo->qi_ranges[0][pli].sizes){
+        _qinfo->qi_ranges[1][pli].sizes=NULL;
+      }
+      if(_qinfo->qi_ranges[1][pli].base_matrices==
+       _qinfo->qi_ranges[0][pli].base_matrices){
+        _qinfo->qi_ranges[1][pli].base_matrices=NULL;
+      }
+    }
+    /*Now free all the non-duplicate storage.*/
+    _ogg_free((void *)_qinfo->qi_ranges[qti][pli].sizes);
+    _ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices);
+  }
+}
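
A minimal sketch of how a caller might drive the two functions above when parsing a setup header: unpack into a zero-initialized th_quant_info and, on failure, release whatever was partially built, which is exactly the responsibility the comments above leave to the caller. memset() from <string.h> is assumed, and _ogg_free() is expected to ignore the NULL pointers left by the memset.

static int read_quant_params(oc_pack_buf *_opb,th_quant_info *_qinfo){
  int ret;
  memset(_qinfo,0,sizeof(*_qinfo));
  ret=oc_quant_params_unpack(_opb,_qinfo);
  /*Free any ranges that were allocated before the failure.*/
  if(ret<0)oc_quant_params_clear(_qinfo);
  return ret;
}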

+ 27 - 0
modules/theoraplayer/native/theora/lib/dequant.h

@@ -0,0 +1,27 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_dequant_H)
+# define _dequant_H (1)
+# include "quant.h"
+# include "bitpack.h"
+
+int oc_quant_params_unpack(oc_pack_buf *_opb,
+ th_quant_info *_qinfo);
+void oc_quant_params_clear(th_quant_info *_qinfo);
+
+#endif

+ 168 - 0
modules/theoraplayer/native/theora/lib/encapiwrapper.c

@@ -0,0 +1,168 @@
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "encint.h"
+#include "theora/theoraenc.h"
+
+
+
+static void th_enc_api_clear(th_api_wrapper *_api){
+  if(_api->encode)th_encode_free(_api->encode);
+  memset(_api,0,sizeof(*_api));
+}
+
+static void theora_encode_clear(theora_state *_te){
+  if(_te->i!=NULL)theora_info_clear(_te->i);
+  memset(_te,0,sizeof(*_te));
+}
+
+static int theora_encode_control(theora_state *_te,int _req,
+ void *_buf,size_t _buf_sz){
+  return th_encode_ctl(((th_api_wrapper *)_te->i->codec_setup)->encode,
+   _req,_buf,_buf_sz);
+}
+
+static ogg_int64_t theora_encode_granule_frame(theora_state *_te,
+ ogg_int64_t _gp){
+  return th_granule_frame(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+
+static double theora_encode_granule_time(theora_state *_te,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+
+static const oc_state_dispatch_vtable OC_ENC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_encode_clear,
+  (oc_state_control_func)theora_encode_control,
+  (oc_state_granule_frame_func)theora_encode_granule_frame,
+  (oc_state_granule_time_func)theora_encode_granule_time,
+};
+
+int theora_encode_init(theora_state *_te,theora_info *_ci){
+  th_api_info *apiinfo;
+  th_info      info;
+  ogg_uint32_t keyframe_frequency_force;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    We put them both in one malloc'd block so that when the API wrapper is
+     freed, the info struct goes with it.
+    This avoids having to figure out whether or not we need to free the info
+     struct in either theora_info_clear() or theora_clear().*/
+  apiinfo=(th_api_info *)_ogg_malloc(sizeof(*apiinfo));
+  if(apiinfo==NULL)return TH_EFAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  *&apiinfo->info=*_ci;
+  oc_theora_info2th_info(&info,_ci);
+  apiinfo->api.encode=th_encode_alloc(&info);
+  if(apiinfo->api.encode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_enc_api_clear;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _te->internal_encode=(void *)&OC_ENC_DISPATCH_VTBL;
+  _te->internal_decode=NULL;
+  _te->granulepos=0;
+  _te->i=&apiinfo->info;
+  _te->i->codec_setup=&apiinfo->api;
+  /*Set the precise requested keyframe frequency.*/
+  keyframe_frequency_force=_ci->keyframe_auto_p?
+   _ci->keyframe_frequency_force:_ci->keyframe_frequency;
+  th_encode_ctl(apiinfo->api.encode,
+   TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
+   &keyframe_frequency_force,sizeof(keyframe_frequency_force));
+  /*TODO: Additional codec setup using the extra fields in theora_info.*/
+  return 0;
+}
+
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_ycbcr_buffer  buf;
+  int              ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  buf[0].width=_yuv->y_width;
+  buf[0].height=_yuv->y_height;
+  buf[0].stride=_yuv->y_stride;
+  buf[0].data=_yuv->y;
+  buf[1].width=_yuv->uv_width;
+  buf[1].height=_yuv->uv_height;
+  buf[1].stride=_yuv->uv_stride;
+  buf[1].data=_yuv->u;
+  buf[2].width=_yuv->uv_width;
+  buf[2].height=_yuv->uv_height;
+  buf[2].stride=_yuv->uv_stride;
+  buf[2].data=_yuv->v;
+  ret=th_encode_ycbcr_in(api->encode,buf);
+  if(ret<0)return ret;
+  _te->granulepos=api->encode->state.granpos;
+  return ret;
+}
+
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_encode_packetout(api->encode,_last_p,_op);
+}
+
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx     *enc;
+  th_api_wrapper *api;
+  int             ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  enc=api->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
+    return TH_EINVAL;
+  }
+  /*Reset the state to make sure we output an info packet.*/
+  enc->packet_state=OC_PACKET_INFO_HDR;
+  ret=th_encode_flushheader(api->encode,NULL,_op);
+  return ret>=0?0:ret;
+}
+
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  oggpack_buffer  opb;
+  void           *buf;
+  int             packet_state;
+  int             ret;
+  packet_state=OC_PACKET_COMMENT_HDR;
+  oggpackB_writeinit(&opb);
+  ret=oc_state_flushheader(NULL,&packet_state,&opb,NULL,NULL,
+   th_version_string(),(th_comment *)_tc,_op);
+  if(ret>=0){
+    /*The oggpack_buffer's lifetime ends with this function, so we have to
+       copy out the packet contents.
+      Presumably the application knows it is supposed to free this.
+      This part works nothing like the Vorbis API, and the documentation on it
+       has been wrong for some time, claiming libtheora owned the memory.*/
+    buf=_ogg_malloc(_op->bytes);
+    if(buf==NULL){
+      _op->packet=NULL;
+      ret=TH_EFAULT;
+    }
+    else{
+      memcpy(buf,_op->packet,_op->bytes);
+      _op->packet=buf;
+      ret=0;
+    }
+  }
+  oggpack_writeclear(&opb);
+  return ret;
+}
+
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx     *enc;
+  th_api_wrapper *api;
+  int             ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  enc=api->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
+    return TH_EINVAL;
+  }
+  /*Reset the state to make sure we output a setup packet.*/
+  enc->packet_state=OC_PACKET_SETUP_HDR;
+  ret=th_encode_flushheader(api->encode,NULL,_op);
+  return ret>=0?0:ret;
+}

+ 379 - 0
modules/theoraplayer/native/theora/lib/encfrag.c

@@ -0,0 +1,379 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encfrag.c 17821 2011-02-09 22:08:34Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
+ const unsigned char *_src,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-128);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-(_ref1[j]+_ref2[j]>>1));
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
+  const unsigned char *src = _src;
+  unsigned dc;
+  unsigned sad;
+  int      i;
+  dc=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)dc+=src[j];
+    src+=_ystride;
+  }
+  dc=dc+32>>6;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
+    _src+=_ystride;
+  }
+  return sad;
+}
+
+static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    t0=_src[0]-_ref[0]+_src[4]-_ref[4];
+    t4=_src[0]-_ref[0]-_src[4]+_ref[4];
+    t1=_src[1]-_ref[1]+_src[5]-_ref[5];
+    t5=_src[1]-_ref[1]-_src[5]+_ref[5];
+    t2=_src[2]-_ref[2]+_src[6]-_ref[6];
+    t6=_src[2]-_ref[2]-_src[6]+_ref[6];
+    t3=_src[3]-_ref[3]+_src[7]-_ref[7];
+    t7=_src[3]-_ref[3]-_src[7]+_ref[7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+static void oc_diff_hadamard2(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    r=_ref1[0]+_ref2[0]>>1;
+    t4=_ref1[4]+_ref2[4]>>1;
+    t0=_src[0]-r+_src[4]-t4;
+    t4=_src[0]-r-_src[4]+t4;
+    r=_ref1[1]+_ref2[1]>>1;
+    t5=_ref1[5]+_ref2[5]>>1;
+    t1=_src[1]-r+_src[5]-t5;
+    t5=_src[1]-r-_src[5]+t5;
+    r=_ref1[2]+_ref2[2]>>1;
+    t6=_ref1[6]+_ref2[6]>>1;
+    t2=_src[2]-r+_src[6]-t6;
+    t6=_src[2]-r-_src[6]+t6;
+    r=_ref1[3]+_ref2[3]>>1;
+    t7=_ref1[7]+_ref2[7]>>1;
+    t3=_src[3]-r+_src[7]-t7;
+    t7=_src[3]-r-_src[7]+t7;
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+}
+
+static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;
+    /*Hadamard stage 1:*/
+    t0=_src[0]+_src[4];
+    t4=_src[0]-_src[4];
+    t1=_src[1]+_src[5];
+    t5=_src[1]-_src[5];
+    t2=_src[2]+_src[6];
+    t6=_src[2]-_src[6];
+    t3=_src[3]+_src[7];
+    t7=_src[3]-_src[7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
+  unsigned sad;
+  int      dc;
+  int      t0;
+  int      t1;
+  int      t2;
+  int      t3;
+  int      t4;
+  int      t5;
+  int      t6;
+  int      t7;
+  int      r;
+  int      i;
+  sad=dc=0;
+  for(i=0;i<8;i++){
+    /*Hadamard stage 1:*/
+    t0=_buf[i*8+0]+_buf[i*8+4];
+    t4=_buf[i*8+0]-_buf[i*8+4];
+    t1=_buf[i*8+1]+_buf[i*8+5];
+    t5=_buf[i*8+1]-_buf[i*8+5];
+    t2=_buf[i*8+2]+_buf[i*8+6];
+    t6=_buf[i*8+2]-_buf[i*8+6];
+    t3=_buf[i*8+3]+_buf[i*8+7];
+    t7=_buf[i*8+3]-_buf[i*8+7];
+    /*Hadamard stage 2:*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3:*/
+    r=abs(t0+t1)&-(i>0);
+    r+=abs(t0-t1);
+    r+=abs(t2+t3);
+    r+=abs(t2-t3);
+    r+=abs(t4+t5);
+    r+=abs(t4-t5);
+    r+=abs(t6+t7);
+    r+=abs(t6-t7);
+    sad+=r;
+  }
+  dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
+  *_dc=dc;
+  return sad;
+}
+
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ogg_int16_t buf[64];
+  oc_diff_hadamard(buf,_src,_ref,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  ogg_int16_t buf[64];
+  oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride){
+  ogg_int16_t buf[64];
+  oc_intra_hadamard(buf,_src,_ystride);
+  return oc_hadamard_sad(_dc,buf);
+}
+
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++,_mask>>=1){
+      if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    }
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  int i;
+  int j;
+  for(i=8;i-->0;){
+    for(j=0;j<8;j++)_dst[j]=_src1[j]+_src2[j]>>1;
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}

+ 121 - 0
modules/theoraplayer/native/theora/lib/encinfo.c

@@ -0,0 +1,121 @@
+#include <stdlib.h>
+#include <string.h>
+#include "state.h"
+#include "enquant.h"
+#include "huffenc.h"
+
+
+
+/*Packs a series of octets from a given byte array into the pack buffer.
+  _opb: The pack buffer to store the octets in.
+  _buf: The byte array containing the bytes to pack.
+  _len: The number of octets to pack.*/
+static void oc_pack_octets(oggpack_buffer *_opb,const char *_buf,int _len){
+  int i;
+  for(i=0;i<_len;i++)oggpackB_write(_opb,_buf[i],8);
+}
+
+
+
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op){
+  unsigned char *packet;
+  int            b_o_s;
+  if(_op==NULL)return TH_EFAULT;
+  switch(*_packet_state){
+    /*Codec info header.*/
+    case OC_PACKET_INFO_HDR:{
+      if(_state==NULL)return TH_EFAULT;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the info header.*/
+      oggpackB_write(_opb,0x80,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the codec bitstream version.*/
+      oggpackB_write(_opb,TH_VERSION_MAJOR,8);
+      oggpackB_write(_opb,TH_VERSION_MINOR,8);
+      oggpackB_write(_opb,TH_VERSION_SUB,8);
+      /*Describe the encoded frame.*/
+      oggpackB_write(_opb,_state->info.frame_width>>4,16);
+      oggpackB_write(_opb,_state->info.frame_height>>4,16);
+      oggpackB_write(_opb,_state->info.pic_width,24);
+      oggpackB_write(_opb,_state->info.pic_height,24);
+      oggpackB_write(_opb,_state->info.pic_x,8);
+      oggpackB_write(_opb,_state->info.pic_y,8);
+      oggpackB_write(_opb,_state->info.fps_numerator,32);
+      oggpackB_write(_opb,_state->info.fps_denominator,32);
+      oggpackB_write(_opb,_state->info.aspect_numerator,24);
+      oggpackB_write(_opb,_state->info.aspect_denominator,24);
+      oggpackB_write(_opb,_state->info.colorspace,8);
+      oggpackB_write(_opb,_state->info.target_bitrate,24);
+      oggpackB_write(_opb,_state->info.quality,6);
+      oggpackB_write(_opb,_state->info.keyframe_granule_shift,5);
+      oggpackB_write(_opb,_state->info.pixel_fmt,2);
+      /*Spare configuration bits.*/
+      oggpackB_write(_opb,0,3);
+      b_o_s=1;
+    }break;
+    /*Comment header.*/
+    case OC_PACKET_COMMENT_HDR:{
+      int vendor_len;
+      int i;
+      if(_tc==NULL)return TH_EFAULT;
+      vendor_len=strlen(_vendor);
+      oggpackB_reset(_opb);
+      /*Mark this packet as the comment header.*/
+      oggpackB_write(_opb,0x81,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the vendor string.*/
+      oggpack_write(_opb,vendor_len,32);
+      oc_pack_octets(_opb,_vendor,vendor_len);
+      oggpack_write(_opb,_tc->comments,32);
+      for(i=0;i<_tc->comments;i++){
+        if(_tc->user_comments[i]!=NULL){
+          oggpack_write(_opb,_tc->comment_lengths[i],32);
+          oc_pack_octets(_opb,_tc->user_comments[i],_tc->comment_lengths[i]);
+        }
+        else oggpack_write(_opb,0,32);
+      }
+      b_o_s=0;
+    }break;
+    /*Codec setup header.*/
+    case OC_PACKET_SETUP_HDR:{
+      int ret;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the setup header.*/
+      oggpackB_write(_opb,0x82,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the quantizer tables.*/
+      oc_quant_params_pack(_opb,_qinfo);
+      /*Write the huffman codes.*/
+      ret=oc_huff_codes_pack(_opb,_codes);
+      /*This should never happen, because we validate the tables when they
+         are set.
+        If you see this, there's a good chance memory is being corrupted.*/
+      if(ret<0)return ret;
+      b_o_s=0;
+    }break;
+    /*No more headers to emit.*/
+    default:return 0;
+  }
+  /*This is kind of fugly: we hand the user a buffer which they do not own.
+    We will overwrite it when the next packet is output, so the user better be
+     done with it by then.
+    Vorbis is little better: it hands back buffers that it will free the next
+     time the headers are requested, or when the encoder is cleared.
+    Hopefully libogg2 will make this much cleaner.*/
+  packet=oggpackB_get_buffer(_opb);
+  /*If there's no packet, malloc failed while writing.*/
+  if(packet==NULL)return TH_EFAULT;
+  _op->packet=packet;
+  _op->bytes=oggpackB_bytes(_opb);
+  _op->b_o_s=b_o_s;
+  _op->e_o_s=0;
+  _op->granulepos=0;
+  _op->packetno=*_packet_state+3;
+  return ++(*_packet_state)+3;
+}

+ 845 - 0
modules/theoraplayer/native/theora/lib/encint.h

@@ -0,0 +1,845 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encint.h 18223 2012-03-31 18:49:57Z gmaxwell $
+
+ ********************************************************************/
+#if !defined(_encint_H)
+# define _encint_H (1)
+# include "theora/theoraenc.h"
+# include "state.h"
+# include "mathops.h"
+# include "enquant.h"
+# include "huffenc.h"
+/*# define OC_COLLECT_METRICS*/
+
+
+
+typedef oc_mv                         oc_mv2[2];
+
+typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_enc_opt_data        oc_enc_opt_data;
+typedef struct oc_mb_enc_info         oc_mb_enc_info;
+typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_fr_state            oc_fr_state;
+typedef struct oc_qii_state           oc_qii_state;
+typedef struct oc_enc_pipeline_state  oc_enc_pipeline_state;
+typedef struct oc_mode_rd             oc_mode_rd;
+typedef struct oc_iir_filter          oc_iir_filter;
+typedef struct oc_frame_metrics       oc_frame_metrics;
+typedef struct oc_rc_state            oc_rc_state;
+typedef struct th_enc_ctx             oc_enc_ctx;
+typedef struct oc_token_checkpoint    oc_token_checkpoint;
+
+
+
+/*Encoder-specific accelerated functions.*/
+# if defined(OC_X86_ASM)
+#  if defined(_MSC_VER)
+#   include "x86_vc/x86enc.h"
+#  else
+#   include "x86/x86enc.h"
+#  endif
+# endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armenc.h"
+# endif
+
+# if !defined(oc_enc_accel_init)
+#  define oc_enc_accel_init oc_enc_accel_init_c
+# endif
+# if defined(OC_ENC_USE_VTABLE)
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  ((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  ((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride))
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  ((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask))
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  ((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride))
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  ((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant))
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  ((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis))
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  ((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant))
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  ((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  ((*(_enc)->opt_vtable.fdct8x8)(_y,_x))
+#  endif
+# else
+#  if !defined(oc_enc_frag_sub)
+#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
+  oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sub_128)
+#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
+  oc_enc_frag_sub_128_c(_diff,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_sad_thresh)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_sad2_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
+#  endif
+#  if !defined(oc_enc_frag_intra_sad)
+#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
+  oc_enc_frag_intra_sad_c(_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_satd2)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_intra_satd)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_c(_dc,_src,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_ssd)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_c(_src,_ref,_ystride)
+#  endif
+#  if !defined(oc_enc_frag_border_ssd)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask)
+#  endif
+#  if !defined(oc_enc_frag_copy2)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride)
+#  endif
+#  if !defined(oc_enc_enquant_table_init)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_c(_enquant,_dequant)
+#  endif
+#  if !defined(oc_enc_enquant_table_fixup)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_c(_enquant,_nqis)
+#  endif
+#  if !defined(oc_enc_quantize)
+#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant)
+#  endif
+#  if !defined(oc_enc_frag_recon_intra)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_c(_dst,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_frag_recon_inter)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
+#  endif
+#  if !defined(oc_enc_fdct8x8)
+#   define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x)
+#  endif
+# endif
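
Both halves of the conditional above implement the same dispatch trick: with OC_ENC_USE_VTABLE defined, each oc_enc_* macro routes through a per-context function pointer filled in by oc_enc_accel_init(); otherwise it binds straight to the C fallback so the indirection, and the table itself, can be compiled away. A toy reproduction of the pattern with purely hypothetical names:

typedef struct toy_enc toy_enc;

unsigned toy_sad_c(const unsigned char *_src,const unsigned char *_ref,int _n);

struct toy_enc{
  /*Filled in at init time with the best implementation for this CPU.*/
  unsigned (*sad)(const unsigned char *_src,const unsigned char *_ref,int _n);
};

#if defined(TOY_USE_VTABLE)
# define toy_sad(_enc,_src,_ref,_n) ((*(_enc)->sad)(_src,_ref,_n))
#else
# define toy_sad(_enc,_src,_ref,_n) toy_sad_c(_src,_ref,_n)
#endif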
+
+
+
+/*Constants for the packet-out state machine specific to the encoder.*/
+
+/*Next packet to emit: Data packet, but none are ready yet.*/
+#define OC_PACKET_EMPTY (0)
+/*Next packet to emit: Data packet, and one is ready.*/
+#define OC_PACKET_READY (1)
+
+/*All features enabled.*/
+#define OC_SP_LEVEL_SLOW          (0)
+/*Enable early skip.*/
+#define OC_SP_LEVEL_EARLY_SKIP    (1)
+/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
+#define OC_SP_LEVEL_FAST_ANALYSIS (2)
+/*Use SAD instead of SATD*/
+#define OC_SP_LEVEL_NOSATD        (3)
+/*Disable motion compensation.*/
+#define OC_SP_LEVEL_NOMC          (4)
+/*Maximum valid speed level.*/
+#define OC_SP_LEVEL_MAX           (4)
+
+
+/*The number of extra bits of precision at which to store rate metrics.*/
+# define OC_BIT_SCALE  (6)
+/*The number of extra bits of precision at which to store RMSE metrics.
+  This must be at least half OC_BIT_SCALE (rounded up).*/
+# define OC_RMSE_SCALE (5)
+/*The number of quantizer bins to partition statistics into.*/
+# define OC_LOGQ_BINS  (8)
+/*The number of SAD/SATD bins to partition statistics into.*/
+# define OC_COMP_BINS   (24)
+/*The number of bits of precision to drop from SAD and SATD scores
+   to assign them to a bin.*/
+# define OC_SAD_SHIFT  (6)
+# define OC_SATD_SHIFT (9)
+
+/*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
+   or the lambda parameter (via rd_iscale).
+  These are only equivalent within a single block; when more than one block is
+   being considered, the former is the interpretation used.*/
+
+/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/
+# define OC_RD_SCALE_BITS (12-OC_BIT_SCALE)
+# define OC_RD_ISCALE_BITS (11)
+
+/*This macro is applied to _ssd values with just 4 bits of headroom
+   ((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as
+   large as 16, and need additional fractional bits, our only recourse that
+   doesn't lose precision on blocks with very small SSDs is to use a wider
+   multiply.*/
+# if LONG_MAX>2147483647
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ ((unsigned)((unsigned long)(_ssd)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# else
+#  define OC_RD_SCALE(_ssd,_rd_scale) \
+ (((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \
+ +(((_ssd)&(1<<OC_RD_SCALE_BITS)-1)*(_rd_scale) \
+ +((1<<OC_RD_SCALE_BITS)>>1)>>OC_RD_SCALE_BITS))
+# endif
+# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \
+ ((_ssd)*(_rd_scale)+((1<<OC_RD_SCALE_BITS-4)>>1)>>OC_RD_SCALE_BITS-4)
+# define OC_RD_ISCALE(_lambda,_rd_iscale) \
+ ((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1)>>OC_RD_ISCALE_BITS)
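
As a quick sanity check on the fixed-point arithmetic above: with OC_BIT_SCALE at 6, OC_RD_SCALE_BITS is also 6, so _rd_scale is a Q6 factor (64 means 1.0, 96 means 1.5) and the (1<<6)>>1 term rounds before the shift. Either branch of the macro gives the same results here.

#include <assert.h>

static void rd_scale_check(void){
  assert(OC_RD_SCALE(1000,64)==1000); /*(64000+32)>>6==1000*/
  assert(OC_RD_SCALE(1000,96)==1500); /*(96000+32)>>6==1500*/
}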
+
+
+/*The bits used for each of the MB mode codebooks.*/
+extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
+
+/*The bits used for each of the MV codebooks.*/
+extern const unsigned char OC_MV_BITS[2][64];
+
+/*The minimum value that can be stored in a SB run for each codeword.
+  The last entry is the upper bound on the length of a single SB run.*/
+extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
+/*The bits used for each SB run codeword.*/
+extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
+
+/*The bits used for each block run length (starting with 1).*/
+extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
+
+
+
+/*Encoder specific functions with accelerated variants.*/
+struct oc_enc_opt_vtable{
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
+  unsigned (*frag_sad)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_sad_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
+  unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+  unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
+  unsigned (*frag_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_border_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+  void     (*frag_copy2)(unsigned char *_dst,
+   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*enquant_table_init)(void *_enquant,
+   const ogg_uint16_t _dequant[64]);
+  void     (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
+  int      (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+   const ogg_uint16_t _dequant[64],const void *_enquant);
+  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void     (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+};
+
+
+/*Encoder specific data that varies according to which variants of the above
+   functions are used.*/
+struct oc_enc_opt_data{
+  /*The size of a single quantizer table.
+    This must be a multiple of enquant_table_alignment.*/
+  size_t               enquant_table_size;
+  /*The alignment required for the quantizer tables.
+    This must be a positive power of two.*/
+  int                  enquant_table_alignment;
+};
+
+
+void oc_enc_accel_init(oc_enc_ctx *_enc);
+
+
+
+/*Encoder-specific macroblock information.*/
+struct oc_mb_enc_info{
+  /*Neighboring macro blocks that have MVs available from the current frame.*/
+  unsigned      cneighbors[4];
+  /*Neighboring macro blocks to use for MVs from the previous frame.*/
+  unsigned      pneighbors[4];
+  /*The number of current-frame neighbors.*/
+  unsigned char ncneighbors;
+  /*The number of previous-frame neighbors.*/
+  unsigned char npneighbors;
+  /*Flags indicating which MB modes have been refined.*/
+  unsigned char refined;
+  /*Motion vectors for a macro block for the current frame and the
+     previous two frames.
+    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
+     can be used to estimate constant velocity and constant acceleration
+     predictors.
+    Uninitialized MVs are (0,0).*/
+  oc_mv2        analysis_mv[3];
+  /*Current unrefined analysis MVs.*/
+  oc_mv         unref_mv[2];
+  /*Unrefined block MVs.*/
+  oc_mv         block_mv[4];
+  /*Refined block MVs.*/
+  oc_mv         ref_mv[4];
+  /*Minimum motion estimation error from the analysis stage.*/
+  ogg_uint16_t  error[2];
+  /*MB error for half-pel refinement for each frame type.*/
+  unsigned      satd[2];
+  /*Block error for half-pel refinement.*/
+  unsigned      block_satd[4];
+};
+
+
+
+/*State machine to estimate the opportunity cost of coding a MB mode.*/
+struct oc_mode_scheme_chooser{
+  /*Pointers to a list containing the index of each mode in the mode

+     alphabet used by each scheme.
+    The first entry points to the dynamic scheme0_ranks, while the remaining 7
+     point to the constant entries stored in OC_MODE_SCHEMES.*/
+  const unsigned char *mode_ranks[8];
+  /*The ranks for each mode when coded with scheme 0.
+    These are optimized so that the more frequent modes have lower ranks.*/
+  unsigned char        scheme0_ranks[OC_NMODES];
+  /*The list of modes, sorted in descending order of frequency, that
+    corresponds to the ranks above.*/
+  unsigned char        scheme0_list[OC_NMODES];
+  /*The number of times each mode has been chosen so far.*/
+  unsigned             mode_counts[OC_NMODES];
+  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
+  unsigned char        scheme_list[8];
+  /*The number of bits used by each mode coding scheme.*/
+  ptrdiff_t            scheme_bits[8];
+};
+
+
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
+
+
+
+/*State to track coded block flags and their bit cost.
+  We use opportunity cost to measure the bits required to code or skip the next
+   block, using the cheaper of the cost to code it fully or partially, so long
+   as both are possible.*/
+struct oc_fr_state{
+  /*The number of bits required for the coded block flags so far this frame.*/
+  ptrdiff_t  bits;
+  /*The length of the current run for the partial super block flag, not
+     including the current super block.*/
+  unsigned   sb_partial_count:16;
+  /*The length of the current run for the full super block flag, not
+     including the current super block.*/
+  unsigned   sb_full_count:16;
+  /*The length of the coded block flag run when the current super block
+     started.*/
+  unsigned   b_coded_count_prev:6;
+  /*The coded block flag when the current super block started.*/
+  signed int b_coded_prev:2;
+  /*The length of the current coded block flag run.*/
+  unsigned   b_coded_count:6;
+  /*The current coded block flag.*/
+  signed int b_coded:2;
+  /*The number of blocks processed in the current super block.*/
+  unsigned   b_count:5;
+  /*Whether or not it is cheaper to code the current super block partially,
+     even if it could still be coded fully.*/
+  unsigned   sb_prefer_partial:1;
+  /*Whether the last super block was coded partially.*/
+  signed int sb_partial:2;
+  /*The number of bits required for the flags for the current super block.*/
+  unsigned   sb_bits:6;
+  /*Whether the last non-partial super block was coded fully.*/
+  signed int sb_full:2;
+};
+
+
+
+struct oc_qii_state{
+  ptrdiff_t  bits;
+  unsigned   qi01_count:14;
+  signed int qi01:2;
+  unsigned   qi12_count:14;
+  signed int qi12:2;
+};
+
+
+
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+  /*DCT coefficient storage.
+    This is kept off the stack because a) gcc can't align things on the stack
+     reliably on ARM, and b) it avoids (unintentional) data hazards between
+     ARM and NEON code.*/
+  OC_ALIGN16(ogg_int16_t dct_data[64*3]);
+  OC_ALIGN16(signed char bounding_values[256]);
+  oc_fr_state         fr[3];
+  oc_qii_state        qs[3];
+  /*Skip SSD storage for the current MCU in each plane.*/
+  unsigned           *skip_ssd[3];
+  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+  ptrdiff_t          *coded_fragis[3];
+  ptrdiff_t          *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  /*The starting fragment for the current MCU in each plane.*/
+  ptrdiff_t           froffset[3];
+  /*The starting row for the current MCU in each plane.*/
+  int                 fragy0[3];
+  /*The ending row for the current MCU in each plane.*/
+  int                 fragy_end[3];
+  /*The starting superblock for the current MCU in each plane.*/
+  unsigned            sbi0[3];
+  /*The ending superblock for the current MCU in each plane.*/
+  unsigned            sbi_end[3];
+  /*The number of tokens for zzi=1 for each color plane.*/
+  int                 ndct_tokens1[3];
+  /*The outstanding eob_run count for zzi=1 for each color plane.*/
+  int                 eob_run1[3];
+  /*Whether or not the loop filter is enabled.*/
+  int                 loop_filter;
+};
+
+
+
+/*Statistics used to estimate R-D cost of a block in a given coding mode.
+  See modedec.h for more details.*/
+struct oc_mode_rd{
+  /*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/
+  ogg_int16_t rate;
+  /*The expected square root of the sum of squared errors, shifted by
+     OC_RMSE_SCALE.*/
+  ogg_int16_t rmse;
+};
+
+# if defined(OC_COLLECT_METRICS)
+#  include "collect.h"
+# endif
+
+
+
+/*A 2nd order low-pass Bessel follower.
+  We use this for rate control because it has fast reaction time, but is
+   critically damped.*/
+struct oc_iir_filter{
+  ogg_int32_t c[2];
+  ogg_int64_t g;
+  ogg_int32_t x[2];
+  ogg_int32_t y[2];
+};
+
+
+
+/*The 2-pass metrics associated with a single frame.*/
+struct oc_frame_metrics{
+  /*The log base 2 of the scale factor for this frame in Q24 format.*/
+  ogg_int32_t   log_scale;
+  /*The number of application-requested duplicates of this frame.*/
+  unsigned      dup_count:31;
+  /*The frame type from pass 1.*/
+  unsigned      frame_type:1;
+  /*The frame activity average from pass 1.*/
+  unsigned      activity_avg;
+};
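Here "Q24" just means 24 fractional bits: a linear scale factor s corresponds to log_scale = log2(s)*2^24, rounded. A hypothetical helper, purely to illustrate the fixed-point convention and not the encoder's actual conversion (which lives in the rate-control code and may round differently):

    #include <math.h>
    #include <stdio.h>

    /*Q24: 24 fractional bits, i.e. log_scale = log2(scale) scaled by 1<<24.*/
    static long log_scale_q24(double scale){
      return lround(log2(scale)*(1<<24));
    }

    int main(void){
      /*A scale factor of 2.0 has log2 equal to 1.0: exactly 1<<24 in Q24.*/
      printf("%ld\n",log_scale_q24(2.0));
      return 0;
    }

(Link with -lm.)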
+
+
+
+/*Rate control state information.*/
+struct oc_rc_state{
+  /*The target average bits per frame.*/
+  ogg_int64_t        bits_per_frame;
+  /*The current buffer fullness (bits available to be used).*/
+  ogg_int64_t        fullness;
+  /*The target buffer fullness.
+    This is where we'd like to be by the last keyframe that appears in the next
+     buf_delay frames.*/
+  ogg_int64_t        target;
+  /*The maximum buffer fullness (total size of the buffer).*/
+  ogg_int64_t        max;
+  /*The log of the number of pixels in a frame in Q57 format.*/
+  ogg_int64_t        log_npixels;
+  /*The exponent used in the rate model in Q8 format.*/
+  unsigned           exp[2];
+  /*The number of frames to distribute the buffer usage over.*/
+  int                buf_delay;
+  /*The total drop count from the previous frame.
+    This includes duplicates explicitly requested via the
+     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
+  ogg_uint32_t       prev_drop_count;
+  /*The log of an estimated scale factor used to obtain the real framerate, for
+     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+  ogg_int64_t        log_drop_scale;
+  /*The log of estimated scale factor for the rate model in Q57 format.*/
+  ogg_int64_t        log_scale[2];
+  /*The log of the target quantizer level in Q57 format.*/
+  ogg_int64_t        log_qtarget;
+  /*Will we drop frames to meet bitrate target?*/
+  unsigned char      drop_frames;
+  /*Do we respect the maximum buffer fullness?*/
+  unsigned char      cap_overflow;
+  /*Can the reservoir go negative?*/
+  unsigned char      cap_underflow;
+  /*Second-order lowpass filters to track scale and VFR.*/
+  oc_iir_filter      scalefilter[2];
+  int                inter_count;
+  int                inter_delay;
+  int                inter_delay_target;
+  oc_iir_filter      vfrfilter;
+  /*Two-pass mode state.
+    0 => 1-pass encoding.
+    1 => 1st pass of 2-pass encoding.
+    2 => 2nd pass of 2-pass encoding.*/
+  int                twopass;
+  /*Buffer for current frame metrics.*/
+  unsigned char      twopass_buffer[48];
+  /*The number of bytes in the frame metrics buffer.
+    When 2-pass encoding is enabled, this is set to 0 after each frame is
+     submitted, and must be non-zero before the next frame will be accepted.*/
+  int                twopass_buffer_bytes;
+  int                twopass_buffer_fill;
+  /*Whether or not to force the next frame to be a keyframe.*/
+  unsigned char      twopass_force_kf;
+  /*The metrics for the previous frame.*/
+  oc_frame_metrics   prev_metrics;
+  /*The metrics for the current frame.*/
+  oc_frame_metrics   cur_metrics;
+  /*The buffered metrics for future frames.*/
+  oc_frame_metrics  *frame_metrics;
+  int                nframe_metrics;
+  int                cframe_metrics;
+  /*The index of the current frame in the circular metric buffer.*/
+  int                frame_metrics_head;
+  /*The frame count of each type (keyframes, delta frames, and dup frames);
+     32 bits limits us to 2.268 years at 60 fps.*/
+  ogg_uint32_t       frames_total[3];
+  /*The number of frames of each type yet to be processed.*/
+  ogg_uint32_t       frames_left[3];
+  /*The sum of the scale values for each frame type.*/
+  ogg_int64_t        scale_sum[2];
+  /*The start of the window over which the current scale sums are taken.*/
+  int                scale_window0;
+  /*The end of the window over which the current scale sums are taken.*/
+  int                scale_window_end;
+  /*The frame count of each type in the current 2-pass window; this does not
+     include dup frames.*/
+  int                nframes[3];
+  /*The total accumulated estimation bias.*/
+  ogg_int64_t        rate_bias;
+};
+
+
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
+void oc_rc_state_clear(oc_rc_state *_rc);
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc);
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
+int oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial,int _droppable);
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
+
+
+
+/*The internal encoder state.*/
+struct th_enc_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state          state;
+  /*Buffer in which to assemble packets.*/
+  oggpack_buffer           opb;
+  /*Encoder-specific macroblock information.*/
+  oc_mb_enc_info          *mb_info;
+  /*DC coefficients after prediction.*/
+  ogg_int16_t             *frag_dc;
+  /*The list of coded macro blocks, in coded order.*/
+  unsigned                *coded_mbis;
+  /*The number of coded macro blocks.*/
+  size_t                   ncoded_mbis;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and becomes
+     positive when a frame has been processed and data packets are ready.*/
+  int                      packet_state;
+  /*The maximum distance between keyframes.*/
+  ogg_uint32_t             keyframe_frequency_force;
+  /*The number of duplicates to produce for the next frame.*/
+  ogg_uint32_t             dup_count;
+  /*The number of duplicates remaining to be emitted for the current frame.*/
+  ogg_uint32_t             nqueued_dups;
+  /*The number of duplicates emitted for the last frame.*/
+  ogg_uint32_t             prev_dup_count;
+  /*The current speed level.*/
+  int                      sp_level;
+  /*Whether or not VP3 compatibility mode has been enabled.*/
+  unsigned char            vp3_compatible;
+  /*Whether or not any INTER frames have been coded.*/
+  unsigned char            coded_inter_frame;
+  /*Whether or not previous frame was dropped.*/
+  unsigned char            prevframe_dropped;
+  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
+     coefficients, and luma and chroma tokens.
+    The actual Huffman table used for a given coefficient depends not only on
+     the choice made here, but also its index in the zig-zag ordering.*/
+  unsigned char            huff_idxs[2][2][2];
+  /*Current count of bits used by each MV coding mode.*/
+  size_t                   mv_bits[2];
+  /*The mode scheme chooser for estimating mode coding costs.*/
+  oc_mode_scheme_chooser   chooser;
+  /*Temporary encoder state for the analysis pipeline.*/
+  oc_enc_pipeline_state    pipe;
+  /*The number of vertical super blocks in an MCU.*/
+  int                      mcu_nvsbs;
+  /*The SSD error for skipping each fragment in the current MCU.*/
+  unsigned                *mcu_skip_ssd;
+  /*The masking scale factors for chroma blocks in the current MCU.*/
+  ogg_uint16_t            *mcu_rd_scale;
+  ogg_uint16_t            *mcu_rd_iscale;
+  /*The DCT token lists for each coefficient and each plane.*/
+  unsigned char          **dct_tokens[3];
+  /*The extra bits associated with each DCT token.*/
+  ogg_uint16_t           **extra_bits[3];
+  /*The number of DCT tokens for each coefficient for each plane.*/
+  ptrdiff_t                ndct_tokens[3][64];
+  /*Pending EOB runs for each coefficient for each plane.*/
+  ogg_uint16_t             eob_run[3][64];
+  /*The offset of the first DCT token for each coefficient for each plane.*/
+  unsigned char            dct_token_offs[3][64];
+  /*The last DC coefficient for each plane and reference frame.*/
+  int                      dc_pred_last[3][4];
+#if defined(OC_COLLECT_METRICS)
+  /*Fragment SAD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_sad;
+  /*Fragment SATD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_satd;
+  /*Fragment SSD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_ssd;
+#endif
+  /*The R-D optimization parameter.*/
+  int                      lambda;
+  /*The average block "activity" of the previous frame.*/
+  unsigned                 activity_avg;
+  /*The average MB luma of the previous frame.*/
+  unsigned                 luma_avg;
+  /*The huffman tables in use.*/
+  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+  /*The quantization parameters in use.*/
+  th_quant_info            qinfo;
+  /*The original DC coefficients saved off from the dequantization tables.*/
+  ogg_uint16_t             dequant_dc[64][3][2];
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t      *dequant[3][3][2];
+  /*Condensed quantization tables.*/
+  void                    *enquant[3][3][2];
+  /*The full set of quantization tables.*/
+  void                    *enquant_tables[64][3][2];
+  /*Storage for the quantization tables.*/
+  unsigned char           *enquant_table_data;
+  /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
+    This is used to parameterize the rate control decisions.
+    They are kept in the log domain to simplify later processing.
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
+  ogg_int64_t              log_qavg[2][64];
+  /*The "average" quantizer futher partitioned by color plane.
+    This is used to parameterize mode decision.
+    These are DCT domain quantizers, and so are scaled by an additional factor
+     of 4 from the pixel domain.*/
+  ogg_int16_t              log_plq[64][3][2];
+  /*The R-D scale factors to apply to chroma blocks for a given frame type
+     (INTRA or INTER) and qi value.
+    The first is the "D" modifier (rd_scale), while the second is the "lambda"
+     modifier (rd_iscale).*/
+  ogg_uint16_t             chroma_rd_scale[2][64][2];
+  /*The interpolated mode decision R-D lookup tables for the current
+     quantizers, color plane, and quantization type.*/
+  oc_mode_rd               mode_rd[3][3][2][OC_COMP_BINS];
+  /*The buffer state used to drive rate control.*/
+  oc_rc_state              rc;
+# if defined(OC_ENC_USE_VTABLE)
+  /*Table for encoder acceleration functions.*/
+  oc_enc_opt_vtable        opt_vtable;
+# endif
+  /*Table for encoder data used by accelerated functions.*/
+  oc_enc_opt_data          opt_data;
+};
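The packet_state field above is what drives the public API's behaviour: it starts negative while header packets remain, hits zero once the encoder will accept a frame, and goes positive when data packets are ready. A hedged usage sketch (not part of the library source; error handling omitted, and the 320x240 4:2:0 frame size and quality setting are arbitrary assumptions) showing the corresponding call sequence:

    #include <stdlib.h>
    #include <theora/theoraenc.h>

    int main(void){
      th_info         info;
      th_comment      tc;
      th_enc_ctx     *enc;
      ogg_packet      op;
      th_ycbcr_buffer ycbcr;
      int             pli;
      th_info_init(&info);
      info.frame_width=320;
      info.frame_height=240;
      info.pic_width=320;
      info.pic_height=240;
      info.pic_x=info.pic_y=0;
      info.colorspace=TH_CS_UNSPECIFIED;
      info.pixel_fmt=TH_PF_420;
      info.fps_numerator=30;
      info.fps_denominator=1;
      info.quality=48;
      info.target_bitrate=0;
      enc=th_encode_alloc(&info);
      if(enc==NULL)return 1;
      th_comment_init(&tc);
      /*packet_state<0: flush header packets until none remain.*/
      while(th_encode_flushheader(enc,&tc,&op)>0){/*write op to the stream*/}
      /*packet_state==0: the encoder now accepts frame input (4:2:0 planes).*/
      for(pli=0;pli<3;pli++){
        ycbcr[pli].width=pli?160:320;
        ycbcr[pli].height=pli?120:240;
        ycbcr[pli].stride=ycbcr[pli].width;
        ycbcr[pli].data=calloc(ycbcr[pli].width*ycbcr[pli].height,1);
      }
      th_encode_ycbcr_in(enc,ycbcr);
      /*packet_state>0: drain the data packet(s) for the submitted frame.*/
      while(th_encode_packetout(enc,0,&op)>0){/*write op to the stream*/}
      for(pli=0;pli<3;pli++)free(ycbcr[pli].data);
      th_comment_clear(&tc);
      th_encode_free(enc);
      return 0;
    }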
+
+
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
+
+
+
+/*Perform fullpel motion search for a single MB against both reference frames.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
+/*Refine a MB MV for one frame.*/
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
+/*Refine the block MVs.*/
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
+
+
+
+/*Used to rollback a tokenlog transaction when we retroactively decide to skip
+   a fragment.
+  A checkpoint is taken right before each token is added.*/
+struct oc_token_checkpoint{
+  /*The color plane the token was added to.*/
+  unsigned char pli;
+  /*The zig-zag index the token was added to.*/
+  unsigned char zzi;
+  /*The outstanding EOB run count before the token was added.*/
+  ogg_uint16_t  eob_run;
+  /*The token count before the token was added.*/
+  ptrdiff_t     ndct_tokens;
+};
+
+
+
+void oc_enc_tokenize_start(oc_enc_ctx *_enc);
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n);
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
+ int _pli,int _fragy0,int _frag_yend);
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1);
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
+
+
+
+/*Utility routine to encode one of the header packets.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op);
+
+
+
+/*Default pure-C implementations of encoder-specific accelerated functions.*/
+void oc_enc_accel_init_c(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif

+ 1836 - 0
modules/theoraplayer/native/theora/lib/encode.c

@@ -0,0 +1,1836 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encode.c 17821 2011-02-09 22:08:34Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+#include "dequant.h"
+
+
+
+/*The default quantization parameters used by VP3.1.*/
+static const int OC_VP31_RANGE_SIZES[1]={63};
+static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTRA_C[2]={
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTER[2]={
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_VP31_QUANT_INFO={
+  {
+    220,200,190,180,170,170,160,160,
+    150,150,140,140,130,130,120,120,
+    110,110,100,100, 90, 90, 90, 80,
+     80, 80, 70, 70, 70, 60, 60, 60,
+     60, 50, 50, 50, 50, 40, 40, 40,
+     40, 40, 30, 30, 30, 30, 30, 30,
+     30, 20, 20, 20, 20, 20, 20, 20,
+     20, 10, 10, 10, 10, 10, 10, 10
+  },
+  {
+    500,450,400,370,340,310,285,265,
+    245,225,210,195,185,180,170,160,
+    150,145,135,130,125,115,110,107,
+    100, 96, 93, 89, 85, 82, 75, 74,
+     70, 68, 64, 60, 57, 56, 52, 50,
+     49, 45, 44, 43, 40, 38, 37, 35,
+     33, 32, 30, 29, 28, 25, 24, 22,
+     21, 19, 18, 17, 15, 13, 12, 10
+  },
+  {
+    30,25,20,20,15,15,14,14,
+    13,13,12,12,11,11,10,10,
+     9, 9, 8, 8, 7, 7, 7, 7,
+     6, 6, 6, 6, 5, 5, 5, 5,
+     4, 4, 4, 4, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_Y},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C}
+    },
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER}
+    }
+  }
+};
+
+/*The current default quantization parameters.*/
+static const int OC_DEF_QRANGE_SIZES[3]={32,16,15};
+static const th_quant_base OC_DEF_BASES_INTRA_Y[4]={
+  {
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+  },
+  {
+     15, 12, 12, 15, 18, 20, 20, 21,
+     13, 13, 14, 17, 18, 21, 21, 20,
+     14, 14, 15, 18, 20, 21, 21, 21,
+     14, 16, 17, 19, 20, 21, 21, 21,
+     16, 17, 20, 21, 21, 21, 21, 21,
+     18, 19, 20, 21, 21, 21, 21, 21,
+     20, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     16, 12, 11, 16, 20, 25, 27, 28,
+     13, 13, 14, 18, 21, 28, 28, 27,
+     14, 13, 16, 20, 25, 28, 28, 28,
+     14, 16, 19, 22, 27, 29, 29, 28,
+     17, 19, 25, 28, 28, 30, 30, 29,
+     20, 24, 27, 28, 29, 30, 30, 29,
+     27, 28, 29, 29, 30, 30, 30, 30,
+     29, 29, 29, 29, 30, 30, 30, 29
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTRA_C[4]={
+  {
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19
+  },
+  {
+     18, 18, 21, 25, 26, 26, 26, 26,
+     18, 20, 22, 26, 26, 26, 26, 26,
+     21, 22, 25, 26, 26, 26, 26, 26,
+     25, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26
+  },
+  {
+     17, 18, 22, 31, 36, 36, 36, 36,
+     18, 20, 24, 34, 36, 36, 36, 36,
+     22, 24, 33, 36, 36, 36, 36, 36,
+     31, 34, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTER[4]={
+  {
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     18, 18, 18, 21, 23, 24, 25, 27,
+     18, 18, 21, 23, 24, 25, 27, 28,
+     18, 21, 23, 24, 25, 27, 28, 29,
+     21, 23, 24, 25, 27, 28, 29, 29,
+     23, 24, 25, 27, 28, 29, 29, 29,
+     24, 25, 27, 28, 29, 29, 29, 30,
+     25, 27, 28, 29, 29, 29, 30, 30,
+     27, 28, 29, 29, 29, 30, 30, 30
+  },
+  {
+     17, 17, 17, 20, 23, 26, 28, 32,
+     17, 17, 20, 23, 26, 28, 32, 34,
+     17, 20, 23, 26, 28, 32, 34, 37,
+     20, 23, 26, 28, 32, 34, 37, 37,
+     23, 26, 28, 32, 34, 37, 37, 37,
+     26, 28, 32, 34, 37, 37, 37, 41,
+     28, 32, 34, 37, 37, 37, 41, 42,
+     32, 34, 37, 37, 37, 41, 42, 42
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_DEF_QUANT_INFO={
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    15,12, 9, 8, 6, 6, 5, 5,
+     5, 5, 5, 5, 5, 5, 5, 5,
+     4, 4, 4, 4, 4, 4, 3, 3,
+     3, 3, 3, 3, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     2, 2, 2, 2, 2, 2, 2, 0,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_Y},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C}
+    },
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER}
+    }
+  }
+};
+
+
+
+/*The Huffman codes used for macro block modes.*/
+
+const unsigned char OC_MODE_BITS[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {1,2,3,4,5,6,7,7},
+  /*Codebook 1: a fixed-length code.*/
+  {3,3,3,3,3,3,3,3}
+};
+
+static const unsigned char OC_MODE_CODES[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {0x00,0x02,0x06,0x0E,0x1E,0x3E,0x7E,0x7F},
+  /*Codebook 1: a fixed-length code.*/
+  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07}
+};
+
+
+/*The Huffman codes used for motion vectors.*/
+
+const unsigned char OC_MV_BITS[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+      8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,7,7,7,7,7,7,7,7,6,6,6,6,4,4,3,
+    3,
+    3,4,4,6,6,6,6,7,7,7,7,7,7,7,7,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).
+    This wastes a code word (0x01, negative zero), or a bit (0x00, positive
+     zero, requires only 5 bits to uniquely decode), but is hopefully not used
+     very often.*/
+  {
+      6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+  }
+};
+
+static const unsigned char OC_MV_CODES[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+         0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF3,
+    0xF1,0xEF,0xED,0xEB,0xE9,0xE7,0xE5,0xE3,
+    0xE1,0x6F,0x6D,0x6B,0x69,0x67,0x65,0x63,
+    0x61,0x2F,0x2D,0x2B,0x29,0x09,0x07,0x02,
+    0x00,
+    0x01,0x06,0x08,0x28,0x2A,0x2C,0x2E,0x60,
+    0x62,0x64,0x66,0x68,0x6A,0x6C,0x6E,0xE0,
+    0xE2,0xE4,0xE6,0xE8,0xEA,0xEC,0xEE,0xF0,
+    0xF2,0xF4,0xF6,0xF8,0xFA,0xFC,0xFE
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).*/
+  {
+         0x3F,0x3D,0x3B,0x39,0x37,0x35,0x33,
+    0x31,0x2F,0x2D,0x2B,0x29,0x27,0x25,0x23,
+    0x21,0x1F,0x1D,0x1B,0x19,0x17,0x15,0x13,
+    0x11,0x0F,0x0D,0x0B,0x09,0x07,0x05,0x03,
+    0x00,
+    0x02,0x04,0x06,0x08,0x0A,0x0C,0x0E,0x10,
+    0x12,0x14,0x16,0x18,0x1A,0x1C,0x1E,0x20,
+    0x22,0x24,0x26,0x28,0x2A,0x2C,0x2E,0x30,
+    0x32,0x34,0x36,0x38,0x3A,0x3C,0x3E
+  }
+};
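A small illustration of codebook 1 above (not part of the library source): since it is literally a 5-bit magnitude followed by a sign bit, the table entry for a component value in [-31,31] can be reproduced arithmetically and checked against OC_MV_CODES[1]:

    #include <assert.h>
    #include <stdlib.h>

    /*Codebook 1 entry for an MV component dv in [-31,31]:
       5-bit magnitude, then a sign bit that is set for negative values.*/
    static int mv_code1(int dv){
      return abs(dv)<<1|(dv<0);
    }

    int main(void){
      /*Matches OC_MV_CODES[1][dv+31]: -3 -> 0x07, +3 -> 0x06, 0 -> 0x00.*/
      assert(mv_code1(-3)==0x07);
      assert(mv_code1(3)==0x06);
      assert(mv_code1(0)==0x00);
      return 0;
    }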
+
+
+
+/*Super block run coding scheme:
+   Codeword             Run Length
+   0                       1
+   10x                     2-3
+   110x                    4-5
+   1110xx                  6-9
+   11110xxx                10-17
+   111110xxxx              18-33
+   111111xxxxxxxxxxxx      34-4129*/
+const ogg_uint16_t    OC_SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
+static const unsigned OC_SB_RUN_CODE_PREFIX[7]={
+  0,4,0xC,0x38,0xF0,0x3E0,0x3F000
+};
+const unsigned char   OC_SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};
+
+
+/*Writes the bit pattern for the run length of a super block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run, which must be positive.
+  _flag:      The current flag.
+  _done:      Whether or not more flags are to be encoded.*/
+static void oc_sb_run_pack(oggpack_buffer *_opb,ptrdiff_t _run_count,
+ int _flag,int _done){
+  int i;
+  if(_run_count>=4129){
+    do{
+      oggpackB_write(_opb,0x3FFFF,18);
+      _run_count-=4129;
+      if(_run_count>0)oggpackB_write(_opb,_flag,1);
+      else if(!_done)oggpackB_write(_opb,!_flag,1);
+    }
+    while(_run_count>=4129);
+    if(_run_count<=0)return;
+  }
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  oggpackB_write(_opb,OC_SB_RUN_CODE_PREFIX[i]+_run_count-OC_SB_RUN_VAL_MIN[i],
+   OC_SB_RUN_CODE_NBITS[i]);
+}
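A worked example of the run coding above: a run of 7 super blocks falls in the 6-9 bucket, so oc_sb_run_pack() writes OC_SB_RUN_CODE_PREFIX[3]+7-6 = 0x39 in 6 bits, i.e. the prefix 1110 followed by the 2-bit offset 01. The lookup can be reproduced standalone (illustrative only, using local copies of the tables above; runs longer than 4129 are split by oc_sb_run_pack() itself):

    #include <assert.h>

    static const unsigned short SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
    static const unsigned SB_RUN_CODE_PREFIX[7]={0,4,0xC,0x38,0xF0,0x3E0,0x3F000};
    static const unsigned char SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};

    /*Returns the codeword for a single run of 1..4129 super blocks and
       stores its length in *nbits.*/
    static unsigned sb_run_code(int run_count,int *nbits){
      int i;
      for(i=0;run_count>=SB_RUN_VAL_MIN[i+1];i++);
      *nbits=SB_RUN_CODE_NBITS[i];
      return SB_RUN_CODE_PREFIX[i]+run_count-SB_RUN_VAL_MIN[i];
    }

    int main(void){
      int nbits;
      assert(sb_run_code(1,&nbits)==0x00&&nbits==1);     /*"0"*/
      assert(sb_run_code(7,&nbits)==0x39&&nbits==6);     /*"111001"*/
      assert(sb_run_code(4129,&nbits)==0x3FFFF&&nbits==18);
      return 0;
    }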
+
+
+
+/*Block run coding scheme:
+   Codeword             Run Length
+   0x                      1-2
+   10x                     3-4
+   110x                    5-6
+   1110xx                  7-10
+   11110xx                 11-14
+   11111xxxx               15-30*/
+const unsigned char OC_BLOCK_RUN_CODE_NBITS[30]={
+  2,2,3,3,4,4,6,6,6,6,7,7,7,7,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
+};
+static const ogg_uint16_t  OC_BLOCK_RUN_CODE_PATTERN[30]={
+        0x000,0x001,0x004,0x005,0x00C,0x00D,0x038,
+  0x039,0x03A,0x03B,0x078,0x079,0x07A,0x07B,0x1F0,
+  0x1F1,0x1F2,0x1F3,0x1F4,0x1F5,0x1F6,0x1F7,0x1F8,
+  0x1F9,0x1FA,0x1FB,0x1FC,0x1FD,0x1FE,0x1FF
+};
+
+
+/*Writes the bit pattern for the run length of a block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run.
+              This must be positive, and no more than 30.*/
+static void oc_block_run_pack(oggpack_buffer *_opb,int _run_count){
+  oggpackB_write(_opb,OC_BLOCK_RUN_CODE_PATTERN[_run_count-1],
+   OC_BLOCK_RUN_CODE_NBITS[_run_count-1]);
+}
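Analogously to the super block runs above, a block run of 8 falls in the 7-10 bucket, so oc_block_run_pack() writes OC_BLOCK_RUN_CODE_PATTERN[7] = 0x039 in OC_BLOCK_RUN_CODE_NBITS[7] = 6 bits: the prefix 1110 followed by the 2-bit offset 01.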
+
+
+
+static void oc_enc_frame_header_pack(oc_enc_ctx *_enc){
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,_enc->state.frame_type,1);
+  /*Write out the current qi list.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  if(_enc->state.nqis>1){
+    oggpackB_write(&_enc->opb,1,1);
+    oggpackB_write(&_enc->opb,_enc->state.qis[1],6);
+    if(_enc->state.nqis>2){
+      oggpackB_write(&_enc->opb,1,1);
+      oggpackB_write(&_enc->opb,_enc->state.qis[2],6);
+    }
+    else oggpackB_write(&_enc->opb,0,1);
+  }
+  else oggpackB_write(&_enc->opb,0,1);
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    /*Key frames have 3 unused configuration bits, holdovers from the VP3 days.
+      Most of the other unused bits in the VP3 headers were eliminated.
+      Monty kept these to leave us some wiggle room for future expansion,
+       though a single bit in all frames would have been far more useful.*/
+    oggpackB_write(&_enc->opb,0,3);
+  }
+}
+
+/*Writes the bit flags for whether or not each super block is partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.
+  Return: The number of partially coded SBs.*/
+static unsigned oc_enc_partial_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  flag=sb_flags[0].coded_partially;
+  oggpackB_write(&_enc->opb,flag,1);
+  sbi=npartial=0;
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially!=flag)break;
+      run_count++;
+      npartial+=flag;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+  return npartial;
+}
+
+/*Writes the coded/not coded flags for each super block that is not partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.*/
+static void oc_enc_coded_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  /*Skip partially coded super blocks; their flags have already been coded.*/
+  for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
+  flag=sb_flags[sbi].coded_fully;
+  oggpackB_write(&_enc->opb,flag,1);
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(sb_flags[sbi].coded_fully!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+}
+
+static void oc_enc_coded_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  const oc_fragment *frags;
+  unsigned           npartial;
+  int                run_count;
+  int                flag;
+  int                pli;
+  unsigned           sbi;
+  npartial=oc_enc_partial_sb_flags_pack(_enc);
+  if(npartial<_enc->state.nsbs)oc_enc_coded_sb_flags_pack(_enc);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  frags=_enc->state.frags;
+  for(sbi=0;sbi<nsbs&&!sb_flags[sbi].coded_partially;sbi++);
+  /*If there's at least one partial SB, store individual coded block flags.*/
+  if(sbi<nsbs){
+    flag=frags[sb_maps[sbi][0][0]].coded;
+    oggpackB_write(&_enc->opb,flag,1);
+    run_count=0;
+    nsbs=sbi=0;
+    for(pli=0;pli<3;pli++){
+      nsbs+=_enc->state.fplanes[pli].nsbs;
+      for(;sbi<nsbs;sbi++){
+        int       quadi;
+        int       bi;
+        ptrdiff_t fragi;
+        if(sb_flags[sbi].coded_partially){
+          for(quadi=0;quadi<4;quadi++){
+            for(bi=0;bi<4;bi++){
+              fragi=sb_maps[sbi][quadi][bi];
+              if(fragi>=0){
+                if(frags[fragi].coded!=flag){
+                  oc_block_run_pack(&_enc->opb,run_count);
+                  flag=!flag;
+                  run_count=1;
+                }
+                else run_count++;
+              }
+            }
+          }
+        }
+      }
+    }
+    /*Flush any trailing block coded run.*/
+    if(run_count>0)oc_block_run_pack(&_enc->opb,run_count);
+  }
+}
+
+static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
+  const unsigned char *mode_codes;
+  const unsigned char *mode_bits;
+  const unsigned char *mode_ranks;
+  unsigned            *coded_mbis;
+  size_t               ncoded_mbis;
+  const signed char   *mb_modes;
+  unsigned             mbii;
+  int                  scheme;
+  int                  mb_mode;
+  scheme=_enc->chooser.scheme_list[0];
+  /*Encode the best scheme.*/
+  oggpackB_write(&_enc->opb,scheme,3);
+  /*If the chosen scheme is scheme 0, send the mode frequency ordering.*/
+  if(scheme==0){
+    for(mb_mode=0;mb_mode<OC_NMODES;mb_mode++){
+      oggpackB_write(&_enc->opb,_enc->chooser.scheme0_ranks[mb_mode],3);
+    }
+  }
+  mode_ranks=_enc->chooser.mode_ranks[scheme];
+  mode_bits=OC_MODE_BITS[scheme+1>>3];
+  mode_codes=OC_MODE_CODES[scheme+1>>3];
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    int rank;
+    rank=mode_ranks[mb_modes[coded_mbis[mbii]]];
+    oggpackB_write(&_enc->opb,mode_codes[rank],mode_bits[rank]);
+  }
+}
+
+static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,oc_mv _mv){
+  int dx;
+  int dy;
+  dx=OC_MV_X(_mv);
+  dy=OC_MV_Y(_mv);
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][dx+31],OC_MV_BITS[_mv_scheme][dx+31]);
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][dy+31],OC_MV_BITS[_mv_scheme][dy+31]);
+}
+
+static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
+  const unsigned     *coded_mbis;
+  size_t              ncoded_mbis;
+  const oc_mb_map    *mb_maps;
+  const signed char  *mb_modes;
+  const oc_fragment  *frags;
+  const oc_mv        *frag_mvs;
+  unsigned            mbii;
+  int                 mv_scheme;
+  /*Choose the coding scheme.*/
+  mv_scheme=_enc->mv_bits[1]<_enc->mv_bits[0];
+  oggpackB_write(&_enc->opb,mv_scheme,1);
+  /*Encode the motion vectors.
+    Macro blocks are iterated in Hilbert scan order, but the MVs within the
+     macro block are coded in raster order.*/
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    ptrdiff_t fragi;
+    unsigned  mbi;
+    int       bi;
+    mbi=coded_mbis[mbii];
+    switch(mb_modes[mbi]){
+      case OC_MODE_INTER_MV:
+      case OC_MODE_GOLDEN_MV:{
+        for(bi=0;;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
+            /*Only code a single MV for this macro block.*/
+            break;
+          }
+        }
+      }break;
+      case OC_MODE_INTER_MV_FOUR:{
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi]);
+            /*Keep coding all the MVs for this macro block.*/
+          }
+        }
+      }break;
+    }
+  }
+}
+
+static void oc_enc_block_qis_pack(oc_enc_ctx *_enc){
+  const oc_fragment *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  ptrdiff_t          run_count;
+  ptrdiff_t          nqi0;
+  int                flag;
+  if(_enc->state.nqis<=1)return;
+  ncoded_fragis=_enc->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  coded_fragis=_enc->state.coded_fragis;
+  frags=_enc->state.frags;
+  flag=!!frags[coded_fragis[0]].qii;
+  oggpackB_write(&_enc->opb,flag,1);
+  nqi0=0;
+  for(fragii=0;fragii<ncoded_fragis;){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      if(!!frags[coded_fragis[fragii]].qii!=flag)break;
+      run_count++;
+      nqi0+=!flag;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+  if(_enc->state.nqis<3||nqi0>=ncoded_fragis)return;
+  for(fragii=0;!frags[coded_fragis[fragii]].qii;fragii++);
+  flag=frags[coded_fragis[fragii]].qii-1;
+  oggpackB_write(&_enc->opb,flag,1);
+  while(fragii<ncoded_fragis){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      int qii;
+      qii=frags[coded_fragis[fragii]].qii;
+      if(!qii)continue;
+      if(qii-1!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+}
+
+/*Counts the tokens of each type used for the given range of coefficient
+   indices in zig-zag order.
+  _zzi_start:      The first zig-zag index to include.
+  _zzi_end:        The first zig-zag index to not include.
+  _token_counts_y: Returns the token counts for the Y' plane.
+  _token_counts_c: Returns the token counts for the Cb and Cr planes.*/
+static void oc_enc_count_tokens(oc_enc_ctx *_enc,int _zzi_start,int _zzi_end,
+ ptrdiff_t _token_counts_y[32],ptrdiff_t _token_counts_c[32]){
+  const unsigned char *dct_tokens;
+  ptrdiff_t            ndct_tokens;
+  int                  pli;
+  int                  zzi;
+  ptrdiff_t            ti;
+  memset(_token_counts_y,0,32*sizeof(*_token_counts_y));
+  memset(_token_counts_c,0,32*sizeof(*_token_counts_c));
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+    dct_tokens=_enc->dct_tokens[0][zzi];
+    ndct_tokens=_enc->ndct_tokens[0][zzi];
+    for(ti=_enc->dct_token_offs[0][zzi];ti<ndct_tokens;ti++){
+      _token_counts_y[dct_tokens[ti]]++;
+    }
+  }
+  for(pli=1;pli<3;pli++){
+    for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+      dct_tokens=_enc->dct_tokens[pli][zzi];
+      ndct_tokens=_enc->ndct_tokens[pli][zzi];
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ndct_tokens;ti++){
+        _token_counts_c[dct_tokens[ti]]++;
+      }
+    }
+  }
+}
+
+/*Computes the number of bits used for each of the potential Huffman codes for
+   the given list of token counts.
+  The bits are added to whatever the current bit counts are.*/
+static void oc_enc_count_bits(oc_enc_ctx *_enc,int _hgi,
+ const ptrdiff_t _token_counts[32],size_t _bit_counts[16]){
+  int huffi;
+  int huff_offs;
+  int token;
+  huff_offs=_hgi<<4;
+  for(huffi=0;huffi<16;huffi++){
+    for(token=0;token<32;token++){
+      _bit_counts[huffi]+=
+       _token_counts[token]*_enc->huff_codes[huffi+huff_offs][token].nbits;
+    }
+  }
+}
+
+/*Returns the Huffman index using the fewest bits.*/
+static int oc_select_huff_idx(size_t _bit_counts[16]){
+  int best_huffi;
+  int huffi;
+  best_huffi=0;
+  for(huffi=1;huffi<16;huffi++)if(_bit_counts[huffi]<_bit_counts[best_huffi]){
+    best_huffi=huffi;
+  }
+  return best_huffi;
+}
+
+static void oc_enc_huff_group_pack(oc_enc_ctx *_enc,
+ int _zzi_start,int _zzi_end,const int _huff_idxs[2]){
+  int zzi;
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+    int pli;
+    for(pli=0;pli<3;pli++){
+      const unsigned char *dct_tokens;
+      const ogg_uint16_t  *extra_bits;
+      ptrdiff_t            ndct_tokens;
+      const th_huff_code  *huff_codes;
+      ptrdiff_t            ti;
+      dct_tokens=_enc->dct_tokens[pli][zzi];
+      extra_bits=_enc->extra_bits[pli][zzi];
+      ndct_tokens=_enc->ndct_tokens[pli][zzi];
+      huff_codes=_enc->huff_codes[_huff_idxs[pli+1>>1]];
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ndct_tokens;ti++){
+        int token;
+        int neb;
+        token=dct_tokens[ti];
+        oggpackB_write(&_enc->opb,huff_codes[token].pattern,
+         huff_codes[token].nbits);
+        neb=OC_DCT_TOKEN_EXTRA_BITS[token];
+        if(neb)oggpackB_write(&_enc->opb,extra_bits[ti],neb);
+      }
+    }
+  }
+}
+
+static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
+  static const unsigned char  OC_HUFF_GROUP_MIN[6]={0,1,6,15,28,64};
+  static const unsigned char *OC_HUFF_GROUP_MAX=OC_HUFF_GROUP_MIN+1;
+  ptrdiff_t token_counts_y[32];
+  ptrdiff_t token_counts_c[32];
+  size_t    bits_y[16];
+  size_t    bits_c[16];
+  int       huff_idxs[2];
+  int       frame_type;
+  int       hgi;
+  frame_type=_enc->state.frame_type;
+  /*Choose which Huffman tables to use for the DC token list.*/
+  oc_enc_count_tokens(_enc,0,1,token_counts_y,token_counts_c);
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  oc_enc_count_bits(_enc,0,token_counts_y,bits_y);
+  oc_enc_count_bits(_enc,0,token_counts_c,bits_c);
+  huff_idxs[0]=oc_select_huff_idx(bits_y);
+  huff_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the DC token list with the chosen tables.*/
+  oggpackB_write(&_enc->opb,huff_idxs[0],4);
+  oggpackB_write(&_enc->opb,huff_idxs[1],4);
+  _enc->huff_idxs[frame_type][0][0]=(unsigned char)huff_idxs[0];
+  _enc->huff_idxs[frame_type][0][1]=(unsigned char)huff_idxs[1];
+  oc_enc_huff_group_pack(_enc,0,1,huff_idxs);
+  /*Choose which Huffman tables to use for the AC token lists.*/
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  for(hgi=1;hgi<5;hgi++){
+    oc_enc_count_tokens(_enc,OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MAX[hgi],
+     token_counts_y,token_counts_c);
+    oc_enc_count_bits(_enc,hgi,token_counts_y,bits_y);
+    oc_enc_count_bits(_enc,hgi,token_counts_c,bits_c);
+  }
+  huff_idxs[0]=oc_select_huff_idx(bits_y);
+  huff_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the AC token lists using the chosen tables.*/
+  oggpackB_write(&_enc->opb,huff_idxs[0],4);
+  oggpackB_write(&_enc->opb,huff_idxs[1],4);
+  _enc->huff_idxs[frame_type][1][0]=(unsigned char)huff_idxs[0];
+  _enc->huff_idxs[frame_type][1][1]=(unsigned char)huff_idxs[1];
+  for(hgi=1;hgi<5;hgi++){
+    huff_idxs[0]+=16;
+    huff_idxs[1]+=16;
+    oc_enc_huff_group_pack(_enc,
+     OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MAX[hgi],huff_idxs);
+  }
+}
+
+/*Packs an explicit drop frame, instead of using the more efficient 0-byte
+   packet.
+  This is only enabled in VP3-compatibility mode, even though it is not
+   strictly required for VP3 compatibility (VP3 could be encoded in AVI, which
+   also supports dropping frames by inserting 0 byte packets).
+  However, almost every _Theora_ player used to get this wrong (and many still
+   do), and it wasn't until we started shipping a post-VP3 encoder that
+   actually used non-VP3 features that this began to be discovered and fixed,
+   despite being in the standard since 2004.
+  The pack buffer must be reset before calling this function.*/
+static void oc_enc_drop_frame_pack(oc_enc_ctx *_enc){
+  unsigned nsbs;
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,OC_INTER_FRAME,1);
+  /*Write out the current qi list.
+    We always use just 1 qi, to avoid wasting bits on the others.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  oggpackB_write(&_enc->opb,0,1);
+  /*Coded block flags: everything is uncoded.*/
+  nsbs=_enc->state.nsbs;
+  /*No partially coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*No fully coded SBs.*/
+  oggpackB_write(&_enc->opb,0,1);
+  oc_sb_run_pack(&_enc->opb,nsbs,0,1);
+  /*MB modes: just need to write which scheme to use.
+    Since we have no coded MBs, we can pick any of them except 0, which would
+     require writing out an additional mode list.*/
+  oggpackB_write(&_enc->opb,7,3);
+  /*MVs: just need to write which scheme to use.
+    We can pick either one, since we have no MVs.*/
+  oggpackB_write(&_enc->opb,1,1);
+  /*Write the chosen DC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][0][1],4);
+  /*Write the chosen AC token tables.*/
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][0],4);
+  oggpackB_write(&_enc->opb,_enc->huff_idxs[OC_INTER_FRAME][1][1],4);
+}
+
+static void oc_enc_frame_pack(oc_enc_ctx *_enc){
+  oggpackB_reset(&_enc->opb);
+  /*Only proceed if we have some coded blocks.*/
+  if(_enc->state.ntotal_coded_fragis>0){
+    oc_enc_frame_header_pack(_enc);
+    if(_enc->state.frame_type==OC_INTER_FRAME){
+      /*Coded block flags, MB modes, and MVs are only needed for delta frames.*/
+      oc_enc_coded_flags_pack(_enc);
+      oc_enc_mb_modes_pack(_enc);
+      oc_enc_mvs_pack(_enc);
+    }
+    oc_enc_block_qis_pack(_enc);
+    oc_enc_tokenize_finish(_enc);
+    oc_enc_residual_tokens_pack(_enc);
+  }
+  /*If there are no coded blocks, we can drop this frame simply by emitting a
+     0 byte packet.
+    We emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  else if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
+  /*Success: Mark the packet as ready to be flushed.*/
+  _enc->packet_state=OC_PACKET_READY;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_collect(_enc);
+#endif
+}
+
+
+void oc_enc_accel_init_c(oc_enc_ctx *_enc){
+  /*The implementations prefixed with oc_enc_ are encoder-specific.
+    The rest we re-use from the decoder.*/
+# if defined(OC_ENC_USE_VTABLE)
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
+  _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
+  _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
+  _enc->opt_vtable.frag_intra_sad=oc_enc_frag_intra_sad_c;
+  _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
+  _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
+  _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
+  _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c;
+  _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c;
+  _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+  _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
+  _enc->opt_vtable.quantize=oc_enc_quantize_c;
+  _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+# endif
+  _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
+  _enc->opt_data.enquant_table_alignment=16;
+}
+
+/*Initialize the macro block neighbor lists for MC analysis.
+  This assumes that the entire mb_info memory region has been initialized with
+   zeros.*/
+static void oc_enc_mb_info_init(oc_enc_ctx *_enc){
+  oc_mb_enc_info    *embs;
+  const signed char *mb_modes;
+  unsigned           nhsbs;
+  unsigned           nvsbs;
+  unsigned           nhmbs;
+  unsigned           nvmbs;
+  unsigned           sby;
+  mb_modes=_enc->state.mb_modes;
+  embs=_enc->mb_info;
+  nhsbs=_enc->state.fplanes[0].nhsbs;
+  nvsbs=_enc->state.fplanes[0].nvsbs;
+  nhmbs=_enc->state.nhmbs;
+  nvmbs=_enc->state.nvmbs;
+  for(sby=0;sby<nvsbs;sby++){
+    unsigned sbx;
+    for(sbx=0;sbx<nhsbs;sbx++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++){
+        /*Because of the Hilbert curve ordering the macro blocks are
+           visited in, the available neighbors change depending on where in
+           a super block the macro block is located.
+          Only the first three vectors are used in the median calculation
+           for the optimal predictor, and so the most important should be
+           listed first.
+          Additional vectors are used, so there will always be at least 3,
+           except for in the upper-left most macro block.*/
+        /*The number of current neighbors for each macro block position.*/
+        static const unsigned char NCNEIGHBORS[4]={4,3,2,4};
+        /*The offset of each current neighbor in the X direction.*/
+        static const signed char   CDX[4][4]={
+          {-1,0,1,-1},
+          {-1,0,-1,},
+          {-1,-1},
+          {-1,0,0,1}
+        };
+        /*The offset of each current neighbor in the Y direction.*/
+        static const signed char   CDY[4][4]={
+          {0,-1,-1,-1},
+          {0,-1,-1},
+          {0,-1},
+          {0,-1,1,-1}
+        };
+        /*The offset of each previous neighbor in the X direction.*/
+        static const signed char   PDX[4]={-1,0,1,0};
+        /*The offset of each previous neighbor in the Y direction.*/
+        static const signed char   PDY[4]={0,-1,0,1};
+        unsigned mbi;
+        int      mbx;
+        int      mby;
+        unsigned nmbi;
+        int      nmbx;
+        int      nmby;
+        int      ni;
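+        /*Macro blocks are indexed four to a super block, with super blocks in
+           raster order ('<<' binds looser than '+', so this is
+           ((sby*nhsbs+sbx)<<2)+quadi).*/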
+        mbi=(sby*nhsbs+sbx<<2)+quadi;
+        if(mb_modes[mbi]==OC_MODE_INVALID)continue;
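+        /*Convert the quadrant index to macro block coordinates; quadi 0...3
+           map to the (x,y) offsets (0,0),(0,1),(1,1),(1,0) within the super
+           block.*/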
+        mbx=2*sbx+(quadi>>1);
+        mby=2*sby+(quadi+1>>1&1);
+        /*Fill in the neighbors with current motion vectors available.*/
+        for(ni=0;ni<NCNEIGHBORS[quadi];ni++){
+          nmbx=mbx+CDX[quadi][ni];
+          nmby=mby+CDY[quadi][ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
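+          /*Map the neighbor's (x,y) coordinates back to a macro block index
+             via OC_MB_MAP.*/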
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].cneighbors[embs[mbi].ncneighbors++]=nmbi;
+        }
+        /*Fill in the neighbors with previous motion vectors available.*/
+        for(ni=0;ni<4;ni++){
+          nmbx=mbx+PDX[ni];
+          nmby=mby+PDY[ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].pneighbors[embs[mbi].npneighbors++]=nmbi;
+        }
+      }
+    }
+  }
+}
+
+static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  if(_codes==NULL)_codes=TH_VP31_HUFF_CODES;
+  /*Validate the codes.*/
+  oggpackB_reset(&_enc->opb);
+  ret=oc_huff_codes_pack(&_enc->opb,_codes);
+  if(ret<0)return ret;
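+  /*The packed bits are not used by this function; packing here serves only
+     to reject codes that cannot be packed before they are installed.*/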
+  memcpy(_enc->huff_codes,_codes,sizeof(_enc->huff_codes));
+  return 0;
+}
+
+static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  unsigned char *etd;
+  size_t         ets;
+  int            align;
+  int            qii;
+  int            qi;
+  int            pli;
+  int            qti;
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->state.dequant_tables[qi][pli][qti]=
+     _enc->state.dequant_table_data[qi][pli][qti];
+  }
+  /*Initialize the dequantization tables.*/
+  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+  /*And save off the DC values.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
+  }
+  /*Set up storage for the quantization tables.*/
+  etd=_enc->enquant_table_data;
+  ets=_enc->opt_data.enquant_table_size;
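+  /*Round etd up to the required alignment: both '-' operators bind tighter
+     than '&', so this is (-(ptrdiff_t)etd)&(alignment-1), the number of
+     bytes needed to reach the next aligned address.*/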
+  align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
+  etd+=align;
+  /*Set up the main tables.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->enquant_tables[qi][pli][qti]=etd;
+    oc_enc_enquant_table_init(_enc,etd,
+     _enc->state.dequant_tables[qi][pli][qti]);
+    etd+=ets;
+  }
+  /*Set up storage for the local copies we modify for each frame.*/
+  for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
+    _enc->enquant[pli][qii][qti]=etd;
+    etd+=ets;
+  }
+}
+
+/*Updates the encoder state after the quantization parameters have been
+   changed.*/
+static void oc_enc_quant_params_updated(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  oc_enc_enquant_tables_init(_enc,_qinfo);
+  memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
+   sizeof(_enc->state.loop_filter_limits));
+  oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale,
+   _enc->state.dequant_tables,_enc->state.info.pixel_fmt);
+}
+
+/*Sets the quantization parameters to use.
+  This may only be called before the setup header is written.
+  If it is called multiple times, only the last call has any effect.
+  _qinfo: The quantization parameters.
+          These are described in more detail in theoraenc.h.
+          This can be NULL, in which case the default quantization parameters
+           will be used.*/
+static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  th_quant_info old_qinfo;
+  int           ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
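+  /*Save the current parameters so a failed clone below can be rolled back
+     without disturbing the encoder state.*/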
+  memcpy(&old_qinfo,&_enc->qinfo,sizeof(old_qinfo));
+  ret=oc_quant_params_clone(&_enc->qinfo,_qinfo);
+  if(ret<0){
+    oc_quant_params_clear(&_enc->qinfo);
+    memcpy(&_enc->qinfo,&old_qinfo,sizeof(old_qinfo));
+    return ret;
+  }
+  else oc_quant_params_clear(&old_qinfo);
+  oc_enc_quant_params_updated(_enc,_qinfo);
+  return 0;
+}
+
+static void oc_enc_clear(oc_enc_ctx *_enc);
+
+static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
+  th_info   info;
+  size_t    mcu_nmbs;
+  ptrdiff_t mcu_ncfrags;
+  ptrdiff_t mcu_nfrags;
+  int       hdec;
+  int       vdec;
+  int       ret;
+  int       pli;
+  /*Clean up the requested settings.*/
+  memcpy(&info,_info,sizeof(info));
+  info.version_major=TH_VERSION_MAJOR;
+  info.version_minor=TH_VERSION_MINOR;
+  info.version_subminor=TH_VERSION_SUB;
+  if(info.quality>63)info.quality=63;
+  if(info.quality<0)info.quality=32;
+  if(info.target_bitrate<0)info.target_bitrate=0;
+  /*Initialize the shared encoder/decoder state.*/
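+  /*The encoder tracks more reference frames than the decoder: SELF/PREV/GOLD
+     plus the original-frame copies (IO/PREV_ORIG/GOLD_ORIG) referenced in
+     th_encode_ycbcr_in() below.*/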
+  ret=oc_state_init(&_enc->state,&info,6);
+  if(ret<0)return ret;
+  oc_enc_accel_init(_enc);
+  _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
+  _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
+  _enc->coded_mbis=
+   (unsigned *)_ogg_malloc(_enc->state.nmbs*sizeof(*_enc->coded_mbis));
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  /*If chroma is sub-sampled in the vertical direction, we have to encode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _enc->mcu_nvsbs=1<<vdec;
+  mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
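+  /*Each macro block contributes 8>>(hdec+vdec) chroma fragments ('<<' binds
+     looser than '-', so the shift below is by 3-(hdec+vdec)).*/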
+  mcu_ncfrags=mcu_nmbs<<3-(hdec+vdec);
+  mcu_nfrags=4*mcu_nmbs+mcu_ncfrags;
+  _enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
+   mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
+  _enc->mcu_rd_scale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_scale));
+  _enc->mcu_rd_iscale=(ogg_uint16_t *)_ogg_malloc(
+   (mcu_ncfrags>>1)*sizeof(*_enc->mcu_rd_iscale));
+  for(pli=0;pli<3;pli++){
+    _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
+    _enc->extra_bits[pli]=(ogg_uint16_t **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
+  }
+#if defined(OC_COLLECT_METRICS)
+  _enc->frag_sad=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_sad));
+  _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
+  _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
+#endif
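+  /*Room for the 64 main enquant tables plus 3 per-frame working copies, for
+     each of 3 planes and 2 quantization types (intra/inter), with slack for
+     alignment.*/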
+  _enc->enquant_table_data=(unsigned char *)_ogg_malloc(
+   (64+3)*3*2*_enc->opt_data.enquant_table_size
+   +_enc->opt_data.enquant_table_alignment-1);
+  _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
+  _enc->state.qis[0]=_enc->state.info.quality;
+  _enc->state.nqis=1;
+  _enc->activity_avg=90<<12;
+  _enc->luma_avg=128<<8;
+  oc_rc_state_init(&_enc->rc,_enc);
+  oggpackB_writeinit(&_enc->opb);
+  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
+  memset(_enc->qinfo.qi_ranges,0,sizeof(_enc->qinfo.qi_ranges));
+  /*Reset the packet-out state machine.*/
+  _enc->packet_state=OC_PACKET_INFO_HDR;
+  _enc->dup_count=0;
+  _enc->nqueued_dups=0;
+  _enc->prev_dup_count=0;
+  /*Enable speed optimizations up through early skip by default.*/
+  _enc->sp_level=OC_SP_LEVEL_EARLY_SKIP;
+  /*Disable VP3 compatibility by default.*/
+  _enc->vp3_compatible=0;
+  /*No INTER frames coded yet.*/
+  _enc->coded_inter_frame=0;
+  if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL
+   ||_enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL
+   ||_enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL
+   ||_enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL
+   ||_enc->extra_bits[2]==NULL
+#if defined(OC_COLLECT_METRICS)
+   ||_enc->frag_sad==NULL||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
+#endif
+   ||oc_enc_set_quant_params(_enc,NULL)<0){
+    oc_enc_clear(_enc);
+    return TH_EFAULT;
+  }
+  oc_mode_scheme_chooser_init(&_enc->chooser);
+  oc_enc_mb_info_init(_enc);
+  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
+  return 0;
+}
+
+static void oc_enc_clear(oc_enc_ctx *_enc){
+  int pli;
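+  /*This is also called when oc_enc_init() fails part-way through, so some of
+     the pointers freed below may still be NULL.*/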
+  oc_rc_state_clear(&_enc->rc);
+  oggpackB_writeclear(&_enc->opb);
+  oc_quant_params_clear(&_enc->qinfo);
+  _ogg_free(_enc->enquant_table_data);
+#if defined(OC_COLLECT_METRICS)
+  /*Save the collected metrics from this run.
+    Use tools/process_modedec_stats to actually generate modedec.h from the
+     resulting file.*/
+  oc_mode_metrics_dump();
+  _ogg_free(_enc->frag_ssd);
+  _ogg_free(_enc->frag_satd);
+  _ogg_free(_enc->frag_sad);
+#endif
+  for(pli=3;pli-->0;){
+    oc_free_2d(_enc->extra_bits[pli]);
+    oc_free_2d(_enc->dct_tokens[pli]);
+  }
+  _ogg_free(_enc->mcu_rd_iscale);
+  _ogg_free(_enc->mcu_rd_scale);
+  _ogg_free(_enc->mcu_skip_ssd);
+  _ogg_free(_enc->coded_mbis);
+  _ogg_free(_enc->frag_dc);
+  _ogg_free(_enc->mb_info);
+  oc_state_clear(&_enc->state);
+}
+
+static void oc_enc_drop_frame(th_enc_ctx *_enc){
+  /*Use the previous frame's reconstruction.*/
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=
+   _enc->state.ref_frame_idx[OC_FRAME_PREV];
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_data[OC_FRAME_PREV];
+  /*Flag motion vector analysis about the frame drop.*/
+  _enc->prevframe_dropped=1;
+  /*Zero the packet.*/
+  oggpackB_reset(&_enc->opb);
+  /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+  if(_enc->vp3_compatible)oc_enc_drop_frame_pack(_enc);
+}
+
+static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTRA_FRAME,
+     _enc->state.curframe_num>0);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
+  oc_enc_analyze_intra(_enc,_recode);
+  oc_enc_frame_pack(_enc);
+  /*On the first frame, the previous call was an initial dry-run to prime
+     feed-forward statistics.*/
+  if(!_recode&&_enc->state.curframe_num==0){
+    if(_enc->state.info.target_bitrate>0){
+      oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+                             OC_INTRA_FRAME,_enc->state.qis[0],1,0);
+    }
+    oc_enc_compress_keyframe(_enc,1);
+  }
+}
+
+static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTER_FRAME,1);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTER_FRAME);
+  if(oc_enc_analyze_inter(_enc,_enc->rc.twopass!=2,_recode)){
+    /*Mode analysis thinks this should have been a keyframe; start over.*/
+    oc_enc_compress_keyframe(_enc,1);
+  }
+  else{
+    oc_enc_frame_pack(_enc);
+    if(!_enc->coded_inter_frame){
+      /*On the first INTER frame, the previous call was an initial dry-run to
+         prime feed-forward statistics.*/
+      _enc->coded_inter_frame=1;
+      if(_enc->state.info.target_bitrate>0){
+        /*Rate control also needs to prime.*/
+        oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+         OC_INTER_FRAME,_enc->state.qis[0],1,0);
+      }
+      oc_enc_compress_frame(_enc,1);
+    }
+  }
+}
+
+/*Set the granule position for the next packet to output based on the current
+   internal state.*/
+static void oc_enc_set_granpos(oc_enc_ctx *_enc){
+  unsigned dup_offs;
+  /*Add an offset for the number of duplicate frames we've emitted so far.*/
+  dup_offs=_enc->prev_dup_count-_enc->nqueued_dups;
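+  /*For example, with keyframe_granule_shift=6, granpos_bias=0, last keyframe
+     8 and current frame 11, an inter frame gets granpos (8<<6)+3+dup_offs.*/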
+  /*If the current frame was a keyframe, use it for the high part.*/
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    _enc->state.granpos=(_enc->state.curframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)+dup_offs;
+  }
+  /*Otherwise use the last keyframe in the high part and put the current frame
+     in the low part.*/
+  else{
+    _enc->state.granpos=
+     (_enc->state.keyframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)
+     +_enc->state.curframe_num-_enc->state.keyframe_num+dup_offs;
+  }
+}
+
+
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  oc_enc_ctx *enc;
+  if(_info==NULL)return NULL;
+  enc=oc_aligned_malloc(sizeof(*enc),16);
+  if(enc==NULL||oc_enc_init(enc,_info)<0){
+    oc_aligned_free(enc);
+    return NULL;
+  }
+  return enc;
+}
+
+void th_encode_free(th_enc_ctx *_enc){
+  if(_enc!=NULL){
+    oc_enc_clear(_enc);
+    oc_aligned_free(_enc);
+  }
+}
+
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  switch(_req){
+    case TH_ENCCTL_SET_HUFFMAN_CODES:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_huff_table)*TH_NHUFFMAN_TABLES){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_huffman_codes(_enc,(const th_huff_table *)_buf);
+    }break;
+    case TH_ENCCTL_SET_QUANT_PARAMS:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_quant_info)){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_quant_params(_enc,(th_quant_info *)_buf);
+    }break;
+    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
+      ogg_uint32_t keyframe_frequency_force;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
+      keyframe_frequency_force=*(ogg_uint32_t *)_buf;
+      if(keyframe_frequency_force<=0)keyframe_frequency_force=1;
+      if(_enc->packet_state==OC_PACKET_INFO_HDR){
+        /*It's still early enough to enlarge keyframe_granule_shift.*/
+        _enc->state.info.keyframe_granule_shift=OC_CLAMPI(
+         _enc->state.info.keyframe_granule_shift,
+         OC_ILOG_32(keyframe_frequency_force-1),31);
+      }
+      _enc->keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
+       (ogg_uint32_t)1U<<_enc->state.info.keyframe_granule_shift);
+      *(ogg_uint32_t *)_buf=_enc->keyframe_frequency_force;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_VP3_COMPATIBLE:{
+      int vp3_compatible;
+      int ret;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
+      /*Try this before we change anything else, because it can fail.*/
+      ret=oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO);
+      /*If we can't allocate enough memory, don't change any of the state.*/
+      if(ret==TH_EFAULT)return ret;
+      vp3_compatible=*(int *)_buf;
+      _enc->vp3_compatible=vp3_compatible;
+      if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
+      if(ret<0)vp3_compatible=0;
+      if(_enc->state.info.pixel_fmt!=TH_PF_420||
+       _enc->state.info.pic_width<_enc->state.info.frame_width||
+       _enc->state.info.pic_height<_enc->state.info.frame_height||
+      /*If we have more than 4095 super blocks, VP3's RLE coding might
+         overflow.
+        We could overcome this by ensuring we flip the coded/not-coded flags on
+         at least one super block in the frame, but we pick the simple solution
+         of just telling the user the stream will be incompatible instead.
+        It's unlikely the old VP3 codec would be able to decode streams at this
+         resolution in real time in the first place.*/
+       _enc->state.nsbs>4095){
+        vp3_compatible=0;
+      }
+      *(int *)_buf=vp3_compatible;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL_MAX:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=OC_SP_LEVEL_MAX;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_SPLEVEL:{
+      int speed;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(speed))return TH_EINVAL;
+      speed=*(int *)_buf;
+      if(speed<0||speed>OC_SP_LEVEL_MAX)return TH_EINVAL;
+      _enc->sp_level=speed;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=_enc->sp_level;
+      return 0;
+    }
+    case TH_ENCCTL_SET_DUP_COUNT:{
+      int dup_count;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(dup_count))return TH_EINVAL;
+      dup_count=*(int *)_buf;
+      if(dup_count>=_enc->keyframe_frequency_force)return TH_EINVAL;
+      _enc->dup_count=OC_MAXI(dup_count,0);
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_QUALITY:{
+      int qi;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate>0)return TH_EINVAL;
+      qi=*(int *)_buf;
+      if(qi<0||qi>63)return TH_EINVAL;
+      _enc->state.info.quality=qi;
+      _enc->state.qis[0]=(unsigned char)qi;
+      _enc->state.nqis=1;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_BITRATE:{
+      long bitrate;
+      int  reset;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      bitrate=*(long *)_buf;
+      if(bitrate<=0)return TH_EINVAL;
+      reset=_enc->state.info.target_bitrate<=0;
+      _enc->state.info.target_bitrate=bitrate>INT_MAX?INT_MAX:bitrate;
+      if(reset)oc_rc_state_init(&_enc->rc,_enc);
+      else oc_enc_rc_resize(_enc);
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_FLAGS:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.drop_frames=set&TH_RATECTL_DROP_FRAMES;
+      _enc->rc.cap_overflow=set&TH_RATECTL_CAP_OVERFLOW;
+      _enc->rc.cap_underflow=set&TH_RATECTL_CAP_UNDERFLOW;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_BUFFER:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.buf_delay=set;
+      oc_enc_rc_resize(_enc);
+      *(int *)_buf=_enc->rc.buf_delay;
+      return 0;
+    }break;
+    case TH_ENCCTL_2PASS_OUT:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=1||
+       _buf_sz!=sizeof(unsigned char *)){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_out(_enc,(unsigned char **)_buf);
+    }break;
+    case TH_ENCCTL_2PASS_IN:{
+      if(_enc==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=2){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
+    }break;
+    case TH_ENCCTL_SET_COMPAT_CONFIG:{
+      unsigned char buf[7];
+      oc_pack_buf   opb;
+      th_quant_info qinfo;
+      th_huff_code  huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+      int           ret;
+      int           i;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+      oc_pack_readinit(&opb,_buf,_buf_sz);
+      /*Validate the setup packet header.*/
+      for(i=0;i<7;i++)buf[i]=(unsigned char)oc_pack_read(&opb,8);
+      if(!(buf[0]&0x80)||memcmp(buf+1,"theora",6)!=0)return TH_ENOTFORMAT;
+      if(buf[0]!=0x82)return TH_EBADHEADER;
+      /*Read its contents.*/
+      ret=oc_quant_params_unpack(&opb,&qinfo);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      ret=oc_huff_codes_unpack(&opb,huff_codes);
+      if(ret<0){
+        oc_quant_params_clear(&qinfo);
+        return ret;
+      }
+      /*Install the new state.*/
+      oc_quant_params_clear(&_enc->qinfo);
+      memcpy(&_enc->qinfo,&qinfo,sizeof(qinfo));
+      oc_enc_quant_params_updated(_enc,&qinfo);
+      memcpy(_enc->huff_codes,huff_codes,sizeof(_enc->huff_codes));
+      return 0;
+    }
+#if defined(OC_COLLECT_METRICS)
+    case TH_ENCCTL_SET_METRICS_FILE:{
+      OC_MODE_METRICS_FILENAME=(const char *)_buf;
+      return 0;
+    }
+#endif
+    default:return TH_EIMPL;
+  }
+}
+
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
+  if(_enc==NULL)return TH_EFAULT;
+  return oc_state_flushheader(&_enc->state,&_enc->packet_state,&_enc->opb,
+   &_enc->qinfo,(const th_huff_table *)_enc->huff_codes,th_version_string(),
+   _tc,_op);
+}
+
+static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
+ ogg_int32_t _pic_x,ogg_int32_t _pic_y,
+ ogg_int32_t _pic_width,ogg_int32_t _pic_height){
+  unsigned char *dst;
+  int            dstride;
+  ogg_uint32_t   frame_width;
+  ogg_uint32_t   frame_height;
+  ogg_uint32_t   y;
+  frame_width=_dst->width;
+  frame_height=_dst->height;
+  /*If we have _no_ data, just encode a dull green.*/
+  if(_pic_width==0||_pic_height==0){
+    dst=_dst->data;
+    dstride=_dst->stride;
+    for(y=0;y<frame_height;y++){
+      memset(dst,0,frame_width*sizeof(*dst));
+      dst+=dstride;
+    }
+  }
+  /*Otherwise, copy what we do have, and add our own padding.*/
+  else{
+    unsigned char *dst_data;
+    unsigned char *src_data;
+    unsigned char *src;
+    int            sstride;
+    ogg_uint32_t   x;
+    /*Step 1: Copy the data we do have.*/
+    dstride=_dst->stride;
+    sstride=_src->stride;
+    dst_data=_dst->data;
+    src_data=_src->data;
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride+_pic_x;
+    src=src_data+_pic_y*(ptrdiff_t)sstride+_pic_x;
+    for(y=0;y<_pic_height;y++){
+      memcpy(dst,src,_pic_width);
+      dst+=dstride;
+      src+=sstride;
+    }
+    /*Step 2: Perform a low-pass extension into the padding region.*/
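+    /*Each padding pixel gets a (1,2,1)/4 average of the three nearest pixels
+       in the adjacent filled row or column; '+' binds tighter than '>>', so
+       the +2>>2 rounds and scales the whole sum, and the &-(...) masks clamp
+       at the picture edges.*/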
+    /*Left side.*/
+    for(x=_pic_x;x-->0;){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x;
+      for(y=0;y<_pic_height;y++){
+        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]
+         +(dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Right side.*/
+    for(x=_pic_x+_pic_width;x<frame_width;x++){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x-1;
+      for(y=0;y<_pic_height;y++){
+        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]
+         +(dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Top.*/
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride;
+    for(y=_pic_y;y-->0;){
+      for(x=0;x<frame_width;x++){
+        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]
+         +dst[x+(x+1<frame_width)]+2>>2;
+      }
+      dst-=dstride;
+    }
+    /*Bottom.*/
+    dst=dst_data+(_pic_y+_pic_height)*(ptrdiff_t)dstride;
+    for(y=_pic_y+_pic_height;y<frame_height;y++){
+      for(x=0;x<frame_width;x++){
+        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]
+         +(dst-dstride)[x+(x+1<frame_width)]+2>>2;
+      }
+      dst+=dstride;
+    }
+  }
+}
+
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
+  th_ycbcr_buffer img;
+  int             frame_width;
+  int             frame_height;
+  int             pic_width;
+  int             pic_height;
+  int             pic_x;
+  int             pic_y;
+  int             cframe_width;
+  int             cframe_height;
+  int             cpic_width;
+  int             cpic_height;
+  int             cpic_x;
+  int             cpic_y;
+  int             hdec;
+  int             vdec;
+  int             pli;
+  int             refi;
+  int             drop;
+  /*Step 1: validate parameters.*/
+  if(_enc==NULL||_img==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
+  if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  frame_width=_enc->state.info.frame_width;
+  frame_height=_enc->state.info.frame_height;
+  pic_x=_enc->state.info.pic_x;
+  pic_y=_enc->state.info.pic_y;
+  pic_width=_enc->state.info.pic_width;
+  pic_height=_enc->state.info.pic_height;
+  cframe_width=frame_width>>hdec;
+  cframe_height=frame_height>>vdec;
+  cpic_x=pic_x>>hdec;
+  cpic_y=pic_y>>vdec;
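+  /*The chroma picture region must cover the whole luma picture region, so
+     its left/top edges round down and its right/bottom edges round up.*/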
+  cpic_width=(pic_x+pic_width+hdec>>hdec)-cpic_x;
+  cpic_height=(pic_y+pic_height+vdec>>vdec)-cpic_y;
+  /*Flip the input buffer upside down.*/
+  oc_ycbcr_buffer_flip(img,_img);
+  if(img[0].width!=frame_width||img[0].height!=frame_height||
+   img[1].width!=cframe_width||img[2].width!=cframe_width||
+   img[1].height!=cframe_height||img[2].height!=cframe_height){
+    /*The buffer does not match the frame size.
+      Check to see if it matches the picture size.*/
+    if(img[0].width!=pic_width||img[0].height!=pic_height||
+     img[1].width!=cpic_width||img[2].width!=cpic_width||
+     img[1].height!=cpic_height||img[2].height!=cpic_height){
+      /*It doesn't; we don't know how to handle it.*/
+      return TH_EINVAL;
+    }
+    /*Adjust the pointers to address a full frame.
+      We still only use the picture region, however.*/
+    img[0].data-=pic_y*(ptrdiff_t)img[0].stride+pic_x;
+    img[1].data-=cpic_y*(ptrdiff_t)img[1].stride+cpic_x;
+    img[2].data-=cpic_y*(ptrdiff_t)img[2].stride+cpic_x;
+  }
+  /*Step 2: Update the buffer state.*/
+  if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV]=
+     _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    _enc->state.ref_frame_data[OC_FRAME_PREV]=
+     _enc->state.ref_frame_data[OC_FRAME_SELF];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _enc->state.keyframe_num=_enc->state.curframe_num;
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_idx[OC_FRAME_SELF];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_data[OC_FRAME_SELF];
+    }
+  }
+  if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_idx[OC_FRAME_IO];
+    _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_data[OC_FRAME_IO];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new input frame becomes both the previous and gold
+         original-reference frames.*/
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_idx[OC_FRAME_IO];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_data[OC_FRAME_IO];
+    }
+  }
+  /*Select a free buffer to use for the incoming frame.*/
+  for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_IO]=
+   _enc->state.ref_frame_bufs[refi][0].data;
+  /*Step 3: Copy the input to our internal buffer.
+    This lets us add padding, so we don't have to worry about dereferencing
+     possibly invalid addresses, and allows us to use the same strides and
+     fragment offsets for both the input frame and the reference frames.*/
+  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+0,img+0,
+   pic_x,pic_y,pic_width,pic_height);
+  oc_state_borders_fill_rows(&_enc->state,refi,0,0,frame_height);
+  oc_state_borders_fill_caps(&_enc->state,refi,0);
+  for(pli=1;pli<3;pli++){
+    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[refi]+pli,img+pli,
+     cpic_x,cpic_y,cpic_width,cpic_height);
+    oc_state_borders_fill_rows(&_enc->state,refi,pli,0,cframe_height);
+    oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  }
+  /*Select a free buffer to use for the reconstructed version of this frame.*/
+  for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_bufs[refi][0].data;
+  _enc->state.curframe_num+=_enc->prev_dup_count+1;
+  /*Step 4: Compress the frame.*/
+  /*Start with a keyframe, and don't allow the generation of invalid files that
+     overflow the keyframe_granule_shift.*/
+  if(_enc->rc.twopass_force_kf||_enc->state.curframe_num==0||
+   _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
+   _enc->keyframe_frequency_force){
+    oc_enc_compress_keyframe(_enc,0);
+    drop=0;
+  }
+  else{
+    oc_enc_compress_frame(_enc,0);
+    drop=1;
+  }
+  oc_restore_fpu(&_enc->state);
+  /*drop currently indicates if the frame is droppable.*/
+  if(_enc->state.info.target_bitrate>0){
+    drop=oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     _enc->state.frame_type,_enc->state.qis[0],0,drop);
+  }
+  else drop=0;
+  /*drop now indicates if the frame was dropped.*/
+  if(drop)oc_enc_drop_frame(_enc);
+  else _enc->prevframe_dropped=0;
+  _enc->packet_state=OC_PACKET_READY;
+  _enc->prev_dup_count=_enc->nqueued_dups=_enc->dup_count;
+  _enc->dup_count=0;
+#if defined(OC_DUMP_IMAGES)
+  oc_enc_set_granpos(_enc);
+  oc_state_dump_frame(&_enc->state,OC_FRAME_IO,"src");
+  oc_state_dump_frame(&_enc->state,OC_FRAME_SELF,"rec");
+#endif
+  return 0;
+}
+
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  unsigned char *packet;
+  if(_enc==NULL||_op==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_READY){
+    _enc->packet_state=OC_PACKET_EMPTY;
+    if(_enc->rc.twopass!=1){
+      packet=oggpackB_get_buffer(&_enc->opb);
+      /*If there's no packet, malloc failed while writing; it's lost forever.*/
+      if(packet==NULL)return TH_EFAULT;
+      _op->packet=packet;
+      _op->bytes=oggpackB_bytes(&_enc->opb);
+    }
+    /*For the first pass in 2-pass mode, don't emit any packet data.*/
+    else{
+      _op->packet=NULL;
+      _op->bytes=0;
+    }
+  }
+  else if(_enc->packet_state==OC_PACKET_EMPTY){
+    if(_enc->nqueued_dups>0){
+      _enc->nqueued_dups--;
+      /*Emit an inter frame with no coded blocks in VP3-compatibility mode.*/
+      if(_enc->vp3_compatible){
+        oggpackB_reset(&_enc->opb);
+        oc_enc_drop_frame_pack(_enc);
+        packet=oggpackB_get_buffer(&_enc->opb);
+        /*If there's no packet, malloc failed while writing; it's lost
+           forever.*/
+        if(packet==NULL)return TH_EFAULT;
+        _op->packet=packet;
+        _op->bytes=oggpackB_bytes(&_enc->opb);
+      }
+      /*Otherwise emit a 0-byte packet.*/
+      else{
+        _op->packet=NULL;
+        _op->bytes=0;
+      }
+    }
+    else{
+      if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+      return 0;
+    }
+  }
+  else return 0;
+  _last_p=_last_p&&_enc->nqueued_dups<=0;
+  _op->b_o_s=0;
+  _op->e_o_s=_last_p;
+  oc_enc_set_granpos(_enc);
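+  /*The +3 accounts for the three header packets (info, comment, setup) that
+     precede the first video packet.*/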
+  _op->packetno=th_granule_frame(_enc,_enc->state.granpos)+3;
+  _op->granulepos=_enc->state.granpos;
+  if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+  return 1+_enc->nqueued_dups;
+}

Some files were not shown because too many files changed in this diff