10 years ago · c858515785
--- a/drivers/SCsub
+++ b/drivers/SCsub
@@ -38,8 +38,8 @@ if (env["vorbis"]=="yes"):
 
				 if (env["tools"]=="yes"):
			
 
				 	SConscript("convex_decomp/SCsub");
			
 
				 
			
 
				-if env["theora"]=="yes":
			
 
				-	SConscript("theoraplayer/SCsub")
			
 
				+#if env["theora"]=="yes":
			
 
				+#	SConscript("theoraplayer/SCsub")
			
 
				 if (env["theora"]=="yes"):
			
 
				 	SConscript("theora/SCsub");
			
 
				 if (env['speex']=='yes'):
			
--- a/drivers/pulseaudio/audio_driver_pulseaudio.cpp
+++ b/drivers/pulseaudio/audio_driver_pulseaudio.cpp
@@ -82,6 +82,17 @@ Error AudioDriverPulseAudio::init() {
 
				 	return OK;
			
 
				 }
			
 
				 
			
 
				+// Returns the PulseAudio output latency in seconds.
				+// The value is queried once and cached in `latency`; as long as no valid
				+// measurement has been obtained 0 is returned and the query is retried on
				+// the next call.
				+float AudioDriverPulseAudio::get_latency() {
				+
				+	if (latency==0 && pulse) { //only do this once since it's approximate anyway
				+		int error_code=0;
				+		pa_usec_t palat = pa_simple_get_latency( pulse,&error_code);
				+		//pa_simple_get_latency() reports failure as (pa_usec_t)-1; converting
				+		//that value would cache an absurdly large latency permanently
				+		if (palat!=(pa_usec_t)-1)
				+			latency=double(palat)/1000000.0;
				+	}
				+
				+	return latency;
				+}
			
 
				+
			
 
				 void AudioDriverPulseAudio::thread_func(void* p_udata) {
			
 
				 
			
 
				     AudioDriverPulseAudio* ad = (AudioDriverPulseAudio*)p_udata;
			
@@ -121,6 +132,7 @@ void AudioDriverPulseAudio::thread_func(void* p_udata) {
 
				             ad->exit_thread = true;
			
 
				             break;
			
 
				         }
			
 
				+
			
 
				     }
			
 
				 
			
 
				     ad->thread_exited = true;
			
@@ -185,6 +197,7 @@ AudioDriverPulseAudio::AudioDriverPulseAudio() {
 
				 	mutex = NULL;
			
 
				     thread = NULL;
			
 
				     pulse = NULL;
			
 
				+    latency=0;
			
 
				 }
			
 
				 
			
 
				 AudioDriverPulseAudio::~AudioDriverPulseAudio() {
			
--- a/drivers/pulseaudio/audio_driver_pulseaudio.h
+++ b/drivers/pulseaudio/audio_driver_pulseaudio.h
@@ -58,6 +58,8 @@ class AudioDriverPulseAudio : public AudioDriverSW {
 
				 	mutable bool exit_thread;
			
 
				 	bool pcm_open;
			
 
				 
			
 
				+	float latency;
			
 
				+
			
 
				 public:
			
 
				 
			
 
				 	const char* get_name() const {
			
@@ -72,6 +74,9 @@ public:
 
				 	virtual void unlock();
			
 
				 	virtual void finish();
			
 
				 
			
 
				+	virtual float get_latency();
			
 
				+
			
 
				+
			
 
				     AudioDriverPulseAudio();
			
 
				     ~AudioDriverPulseAudio();
			
 
				 };
			
--- a/drivers/register_driver_types.cpp
+++ b/drivers/register_driver_types.cpp
@@ -43,7 +43,10 @@
 
				 #endif
			
 
				 
			
 
				 #ifdef THEORA_ENABLED
			
 
				-//#include "theora/video_stream_theora.h"
			
 
				+#include "theora/video_stream_theora.h"
			
 
				+#endif
			
 
				+
			
 
				+#ifdef THEORAPLAYER_ENABLED
			
 
				 #include "theoraplayer/video_stream_theoraplayer.h"
			
 
				 #endif
			
 
				 
			
@@ -90,7 +93,10 @@ static ResourceFormatLoaderAudioStreamSpeex *speex_stream_loader=NULL;
 
				 #endif
			
 
				 
			
 
				 #ifdef THEORA_ENABLED
			
 
				-//static ResourceFormatLoaderVideoStreamTheora* theora_stream_loader = NULL;
			
 
				+static ResourceFormatLoaderVideoStreamTheora* theora_stream_loader = NULL;
			
 
				+#endif
			
 
				+
			
 
				+#ifdef THEORAPLAYER_ENABLED
			
 
				 static ResourceFormatLoaderVideoStreamTheoraplayer* theoraplayer_stream_loader = NULL;
			
 
				 #endif
			
 
				 
			
@@ -205,9 +211,12 @@ void register_driver_types() {
 
				 #endif
			
 
				 
			
 
				 #ifdef THEORA_ENABLED
			
 
				-	//theora_stream_loader = memnew( ResourceFormatLoaderVideoStreamTheora );
			
 
				-	//ResourceLoader::add_resource_format_loader(theora_stream_loader);
			
 
				-	//ObjectTypeDB::register_type<VideoStreamTheora>();
			
 
				+	theora_stream_loader = memnew( ResourceFormatLoaderVideoStreamTheora );
			
 
				+	ResourceLoader::add_resource_format_loader(theora_stream_loader);
			
 
				+	ObjectTypeDB::register_type<VideoStreamTheora>();
			
 
				+#endif
			
 
				+
			
 
				+#ifdef THEORAPLAYER_ENABLED
			
 
				 	theoraplayer_stream_loader = memnew( ResourceFormatLoaderVideoStreamTheoraplayer );
			
 
				 	ResourceLoader::add_resource_format_loader(theoraplayer_stream_loader);
			
 
				 	ObjectTypeDB::register_type<VideoStreamTheoraplayer>();
			
@@ -244,7 +253,10 @@ void unregister_driver_types() {
 
				 #endif
			
 
				 
			
 
				 #ifdef THEORA_ENABLED
			
 
				-	//memdelete (theora_stream_loader);
			
 
				+	memdelete (theora_stream_loader);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef THEORAPLAYER_ENABLED
			
 
				 	memdelete (theoraplayer_stream_loader);
			
 
				 #endif
			
 
				 
			
--- a/drivers/theora/video_stream_theora.cpp
+++ b/drivers/theora/video_stream_theora.cpp
@@ -1,16 +1,12 @@
 
				 #ifdef THEORA_ENABLED
			
 
				-#if 0
			
 
				+
			
 
				 #include "video_stream_theora.h"
			
 
				 #include "os/os.h"
			
 
				 #include "yuv2rgb.h"
			
 
				+#include "globals.h"
			
 
				 
			
 
				 
			
 
				-AudioStream::UpdateMode VideoStreamTheora::get_update_mode() const {
			
 
				-
			
 
				-	return UPDATE_IDLE;
			
 
				-};
			
 
				-
			
 
				-int VideoStreamTheora::	buffer_data() {
			
 
				+int VideoStreamPlaybackTheora::	buffer_data() {
			
 
				   char *buffer=ogg_sync_buffer(&oy,4096);
			
 
				   int bytes=file->get_buffer((uint8_t*)buffer, 4096);
			
 
				 
			
@@ -18,33 +14,13 @@ int VideoStreamTheora::	buffer_data() {
 
				   return(bytes);
			
 
				 }
			
 
				 
			
 
				-int VideoStreamTheora::queue_page(ogg_page *page){
			
 
				+// Hands an Ogg page to the Theora stream state when theora_p is set and to
			
 
				+// the Vorbis stream state when vorbis_p is set. Always returns 0.
			
 
				+int VideoStreamPlaybackTheora::queue_page(ogg_page *page){
			
 
				   if(theora_p)ogg_stream_pagein(&to,page);
			
 
				   if(vorbis_p)ogg_stream_pagein(&vo,page);
			
 
				   return 0;
			
 
				 }
			
 
				 
			
 
				-Image VideoStreamTheora::peek_frame() const {
			
 
				-
			
 
				-	if (frames_pending == 0)
			
 
				-		return Image();
			
 
				-	return Image(size.x, size.y, 0, format, frame_data);
			
 
				-};
			
 
				-
			
 
				-Image VideoStreamTheora::pop_frame() {
			
 
				-
			
 
				-	Image ret = peek_frame();
			
 
				-	frames_pending = 0;
			
 
				-
			
 
				-	return ret;
			
 
				-};
			
 
				-
			
 
				-int VideoStreamTheora::get_pending_frame_count() const {
			
 
				-
			
 
				-	return frames_pending;
			
 
				-};
			
 
				-
			
 
				-void VideoStreamTheora::video_write(void){
			
 
				+void VideoStreamPlaybackTheora::video_write(void){
			
 
				 	th_ycbcr_buffer yuv;
			
 
				 	int y_offset, uv_offset;
			
 
				 	th_decode_ycbcr_out(td,yuv);
			
@@ -78,25 +54,31 @@ void VideoStreamTheora::video_write(void){
 
				 
			
 
				 	int pitch = 4;
			
 
				 	frame_data.resize(size.x * size.y * pitch);
			
 
				-	DVector<uint8_t>::Write w = frame_data.write();
			
 
				-	char* dst = (char*)w.ptr();
			
 
				+	{
			
 
				+		DVector<uint8_t>::Write w = frame_data.write();
			
 
				+		char* dst = (char*)w.ptr();
			
 
				 
			
 
				-	uv_offset=(ti.pic_x/2)+(yuv[1].stride)*(ti.pic_y/2);
			
 
				+		uv_offset=(ti.pic_x/2)+(yuv[1].stride)*(ti.pic_y/2);
			
 
				 
			
 
				-	if (px_fmt == TH_PF_444) {
			
 
				+		if (px_fmt == TH_PF_444) {
			
 
				 
			
 
				-		yuv444_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[1].data, (uint8_t*)yuv[2].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				+			yuv444_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[1].data, (uint8_t*)yuv[2].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				 
			
 
				-	} else if (px_fmt == TH_PF_422) {
			
 
				+		} else if (px_fmt == TH_PF_422) {
			
 
				 
			
 
				-		yuv422_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[1].data, (uint8_t*)yuv[2].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				+			yuv422_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[1].data, (uint8_t*)yuv[2].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				 
			
 
				-	} else if (px_fmt == TH_PF_420) {
			
 
				+		} else if (px_fmt == TH_PF_420) {
			
 
				 
			
 
				-		yuv420_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[2].data, (uint8_t*)yuv[1].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				-	};
			
 
				+			yuv420_2_rgb8888((uint8_t*)dst, (uint8_t*)yuv[0].data, (uint8_t*)yuv[2].data, (uint8_t*)yuv[1].data, size.x, size.y, yuv[0].stride, yuv[1].stride, size.x<<2, 0);
			
 
				+		};
			
 
				 
			
 
				-	format = Image::FORMAT_RGBA;
			
 
				+		format = Image::FORMAT_RGBA;
			
 
				+	}
			
 
				+
			
 
				+	Image img(size.x,size.y,0,Image::FORMAT_RGBA,frame_data); //zero copy image creation
			
 
				+
			
 
				+	texture->set_data(img); //zero copy send to visual server
			
 
				 
			
 
				 	/*
			
 
				 
			
@@ -194,7 +176,7 @@ void VideoStreamTheora::video_write(void){
 
				 	frames_pending = 1;
			
 
				 }
			
 
				 
			
 
				-void VideoStreamTheora::clear() {
			
 
				+void VideoStreamPlaybackTheora::clear() {
			
 
				 
			
 
				 	if (file_name == "")
			
 
				 		return;
			
@@ -218,7 +200,7 @@ void VideoStreamTheora::clear() {
 
				 	}
			
 
				 	ogg_sync_clear(&oy);
			
 
				 
			
 
				-	file_name = "";
			
 
				+	//file_name = "";
			
 
				 
			
 
				 	theora_p = 0;
			
 
				 	vorbis_p = 0;
			
@@ -229,7 +211,7 @@ void VideoStreamTheora::clear() {
 
				 	playing = false;
			
 
				 };
			
 
				 
			
 
				-void VideoStreamTheora::set_file(const String& p_file) {
			
 
				+void VideoStreamPlaybackTheora::set_file(const String& p_file) {
			
 
				 
			
 
				 	ogg_packet op;
			
 
				 	th_setup_info    *ts = NULL;
			
@@ -241,7 +223,7 @@ void VideoStreamTheora::set_file(const String& p_file) {
 
				 	file = FileAccess::open(p_file, FileAccess::READ);
			
 
				 	ERR_FAIL_COND(!file);
			
 
				 
			
 
				-	audio_frames_wrote = 0;
			
 
				+
			
 
				 
			
 
				 	ogg_sync_init(&oy);
			
 
				 
			
@@ -386,6 +368,8 @@ void VideoStreamTheora::set_file(const String& p_file) {
 
				 		size.x = w;
			
 
				 		size.y = h;
			
 
				 
			
 
				+		texture->create(w,h,Image::FORMAT_RGBA,Texture::FLAG_FILTER|Texture::FLAG_VIDEO_SURFACE);
			
 
				+
			
 
				 	}else{
			
 
				 		/* tear down the partial theora setup */
			
 
				 		th_info_clear(&ti);
			
@@ -399,7 +383,7 @@ void VideoStreamTheora::set_file(const String& p_file) {
 
				 		vorbis_block_init(&vd,&vb);
			
 
				 		fprintf(stderr,"Ogg logical stream %lx is Vorbis %d channel %ld Hz audio.\n",
			
 
				 				vo.serialno,vi.channels,vi.rate);
			
 
				-		_setup(vi.channels, vi.rate);
			
 
				+		//_setup(vi.channels, vi.rate);
			
 
				 	}else{
			
 
				 		/* tear down the partial vorbis setup */
			
 
				 		vorbis_info_clear(&vi);
			
@@ -411,227 +395,299 @@ void VideoStreamTheora::set_file(const String& p_file) {
 
				 	time=0;
			
 
				 };
			
 
				 
			
 
				-float VideoStreamTheora::get_time() const {
			
 
				+// Returns the clock that video frames are compared against: the accumulated
			
 
				+// `time` minus the audio server's output delay and minus delay_compensation.
			
 
				+float VideoStreamPlaybackTheora::get_time() const {
			
 
				 
			
 
				 	//print_line("total: "+itos(get_total())+" todo: "+itos(get_todo()));
			
 
				 	//return MAX(0,time-((get_total())/(float)vi.rate));
			
 
				-	return time-((get_total())/(float)vi.rate);
			
 
				+	return time-AudioServer::get_singleton()->get_output_delay()-delay_compensation;//-((get_total())/(float)vi.rate);
			
 
				 };
			
 
				 
			
 
				-void VideoStreamTheora::update() {
			
 
				+// Hands out the texture object owned by this playback instance.
				+Ref<Texture> VideoStreamPlaybackTheora::get_texture() {
				+	Ref<Texture> frame_texture=texture;
				+	return frame_texture;
				+}
			
 
				+
			
 
				+// Advances playback by p_delta seconds.
				+// Does nothing while not playing or while the last decoded frame is still
				+// ahead of get_time(). Otherwise it loops: decoded Vorbis audio is converted
				+// to 16 bit and handed to mix_callback, Theora packets are decoded until a
				+// frame at or after get_time() is found, and more pages are read from the
				+// file when no such frame is available yet. Playback is stopped when the
				+// file reports EOF. The resulting frame is uploaded through video_write().
				+void VideoStreamPlaybackTheora::update(float p_delta) {
				 
				 	if (!playing) {
				 		//printf("not playing\n");
				 		return;
				 	};
				 
				-	double ctime =AudioServer::get_singleton()->get_mix_time();
				+	//double ctime =AudioServer::get_singleton()->get_mix_time();
				 
				-	if (last_update_time) {
				-		double delta = (ctime-last_update_time);
				-		time+=delta;
				-		//print_line("delta: "+rtos(delta));
				-	}
				-	last_update_time=ctime;
				+	//print_line("play "+rtos(p_delta));
				+	time+=p_delta;
				 
				+	if (videobuf_time>get_time())
				+		return; //no new frames need to be produced
				 
				-	int audio_todo = get_todo();
				-	ogg_packet op;
				-	int audio_pending = 0;
				+	bool frame_done=false;
				 
				+	while (!frame_done) {
				+		//a frame needs to be produced
				 
				-	while (vorbis_p && audio_todo) {
				-		int ret;
				-		float **pcm;
				-
				-		/* if there's pending, decoded audio, grab it */
				-		if ((ret=vorbis_synthesis_pcmout(&vd,&pcm))>0) {
				-
				-			audio_pending = ret;
				-			int16_t* out = get_write_buffer();
				-			int count = 0;
				-			int to_read = MIN(ret, audio_todo);
				-			for (int i=0; i<to_read; i++) {
				-
				-				for(int j=0;j<vi.channels;j++){
				-					int val=Math::fast_ftoi(pcm[j][i]*32767.f);
				-					if(val>32767)val=32767;
				-					if(val<-32768)val=-32768;
				-					out[count++] = val;
				-				};
				-			};
				-			int tr = vorbis_synthesis_read(&vd, to_read);
				-			audio_todo -= to_read;
				-			audio_frames_wrote += to_read;
				-			write(to_read);
				-			audio_pending -= to_read;
				-			if (audio_todo==0)
				-				buffering=false;
				+		ogg_packet op;
				+		bool audio_pending = false;
				 
				 
				-		} else {
				+		while (vorbis_p) {
				+			int ret;
				+			float **pcm;
				+
				+			bool buffer_full=false;
				+
				+			/* if there's pending, decoded audio, grab it */
				+			if ((ret=vorbis_synthesis_pcmout(&vd,&pcm))>0) {
				+
				+
				+
				+				const int AUXBUF_LEN=4096;
				+				int to_read = ret;
				+				int16_t aux_buffer[AUXBUF_LEN];
				+
				+				while(to_read) {
				+
				+					int m = MIN(AUXBUF_LEN/vi.channels,to_read);
				+
				+					//frames of this batch already handed to the mixer; pcm is not
				+					//advanced inside this loop (vorbis_synthesis_read() is only
				+					//called afterwards), so every chunk must index past them
				+					const int ofs = ret-to_read;
				+
				+					int count = 0;
				+
				+					for(int j=0;j<m;j++){
				+						for(int i=0;i<vi.channels;i++){
				+
				+							int val=Math::fast_ftoi(pcm[i][ofs+j]*32767.f);
				+							if(val>32767)val=32767;
				+							if(val<-32768)val=-32768;
				+							aux_buffer[count++] = val;
				+						}
				+					}
				+
				+					if (mix_callback) {
				+						int mixed = mix_callback(mix_udata,aux_buffer,m);
				+						to_read-=mixed;
				+						if (mixed!=m) { //could mix no more
				+							buffer_full=true;
				+							break;
				+						}
				+					} else {
				+						to_read-=m; //just pretend we sent the audio
				+					}
				+
				 
				-			/* no pending audio; is there a pending packet to decode? */
				-			if (ogg_stream_packetout(&vo,&op)>0){
				-				if(vorbis_synthesis(&vb,&op)==0) { /* test for success! */
				-					vorbis_synthesis_blockin(&vd,&vb);
				 				}
				-			} else {  /* we need more data; break out to suck in another page */
				-				//printf("need moar data\n");
				+
				+
				+				//tell the decoder how many frames were actually consumed
				+				int tr = vorbis_synthesis_read(&vd, ret-to_read);
				+
				+				audio_pending=true;
				+
				+
				+			} else {
				+
				+				/* no pending audio; is there a pending packet to decode? */
				+				if (ogg_stream_packetout(&vo,&op)>0){
				+					if(vorbis_synthesis(&vb,&op)==0) { /* test for success! */
				+						vorbis_synthesis_blockin(&vd,&vb);
				+					}
				+				} else {  /* we need more data; break out to suck in another page */
				+					//printf("need moar data\n");
				+					break;
				+				};
				+			}
				+
				+			if (buffer_full)
				 				break;
				-			};
				 		}
				-	}
				 
				-	while(theora_p && !videobuf_ready){
				-		/* theora is one in, one out... */
				-		if(ogg_stream_packetout(&to,&op)>0){
				+		while(theora_p && !frame_done){
				+			/* theora is one in, one out... */
				+			if(ogg_stream_packetout(&to,&op)>0){
				 
				 
				-			if(pp_inc){
				-				pp_level+=pp_inc;
				-				th_decode_ctl(td,TH_DECCTL_SET_PPLEVEL,&pp_level,
				-							  sizeof(pp_level));
				-				pp_inc=0;
				-			}
				-			/*HACK: This should be set after a seek or a gap, but we might not have
				-			a granulepos for the first packet (we only have them for the last
				-			packet on a page), so we just set it as often as we get it.
				-			To do this right, we should back-track from the last packet on the
				-			page and compute the correct granulepos for the first packet after
				-			a seek or a gap.*/
				-			if(op.granulepos>=0){
				-				th_decode_ctl(td,TH_DECCTL_SET_GRANPOS,&op.granulepos,
				-							  sizeof(op.granulepos));
				-			}
				-			ogg_int64_t videobuf_granulepos;
				-			if(th_decode_packetin(td,&op,&videobuf_granulepos)==0){
				-				videobuf_time=th_granule_time(td,videobuf_granulepos);
				-				//printf("frame time %f, play time %f, ready %i\n", (float)videobuf_time, get_time(), videobuf_ready);
				-
				-				/* is it already too old to be useful?  This is only actually
				-				 useful cosmetically after a SIGSTOP.  Note that we have to
				-				 decode the frame even if we don't show it (for now) due to
				-				 keyframing.  Soon enough libtheora will be able to deal
				-				 with non-keyframe seeks.  */
				-
				-				if(videobuf_time>=get_time())
				-					videobuf_ready=1;
				-				else{
				-					/*If we are too slow, reduce the pp level.*/
				-					pp_inc=pp_level>0?-1:0;
				+				if(pp_inc){
				+					pp_level+=pp_inc;
				+					th_decode_ctl(td,TH_DECCTL_SET_PPLEVEL,&pp_level,
				+								  sizeof(pp_level));
				+					pp_inc=0;
				+				}
				+				/*HACK: This should be set after a seek or a gap, but we might not have
				+				a granulepos for the first packet (we only have them for the last
				+				packet on a page), so we just set it as often as we get it.
				+				To do this right, we should back-track from the last packet on the
				+				page and compute the correct granulepos for the first packet after
				+				a seek or a gap.*/
				+				if(op.granulepos>=0){
				+					th_decode_ctl(td,TH_DECCTL_SET_GRANPOS,&op.granulepos,
				+								  sizeof(op.granulepos));
				+				}
				+				ogg_int64_t videobuf_granulepos;
				+				if(th_decode_packetin(td,&op,&videobuf_granulepos)==0){
				+					videobuf_time=th_granule_time(td,videobuf_granulepos);
				+
				+					//printf("frame time %f, play time %f, ready %i\n", (float)videobuf_time, get_time(), videobuf_ready);
				+
				+					/* is it already too old to be useful?  This is only actually
				+					 useful cosmetically after a SIGSTOP.  Note that we have to
				+					 decode the frame even if we don't show it (for now) due to
				+					 keyframing.  Soon enough libtheora will be able to deal
				+					 with non-keyframe seeks.  */
				+
				+					if(videobuf_time>=get_time())
				+						frame_done=true;
				+					else{
				+						/*If we are too slow, reduce the pp level.*/
				+						pp_inc=pp_level>0?-1:0;
				+					}
				 				}
				-			}
				-
				-		} else
				-			break;
				-	}
				 
				-	if (/*!videobuf_ready && */ audio_pending == 0 && file->eof_reached()) {
				-		printf("video done, stopping\n");
				-		stop();
				-		return;
				-	};
				+			} else
				+				break;
				+		}
				 
				-	if (!videobuf_ready || audio_todo > 0){
				-		/* no data yet for somebody.  Grab another page */
				+		if (file && /*!videobuf_ready && */ file->eof_reached()) {
				+			printf("video done, stopping\n");
				+			stop();
				+			return;
				+		};
				+	#if 0
				+		if (!videobuf_ready || audio_todo > 0){
				+			/* no data yet for somebody.  Grab another page */
				 
				-		buffer_data();
				-		while(ogg_sync_pageout(&oy,&og)>0){
				-			queue_page(&og);
				+			buffer_data();
				+			while(ogg_sync_pageout(&oy,&og)>0){
				+				queue_page(&og);
				+			}
				 		}
				-	}
				+	#else
				+		if (!frame_done){
				+			//what's the point of waiting for audio to grab a page?
				 
				-	/* If playback has begun, top audio buffer off immediately. */
				-	//if(stateflag) audio_write_nonblocking();
				+			buffer_data();
				+			while(ogg_sync_pageout(&oy,&og)>0){
				+				queue_page(&og);
				+			}
				+		}
				+	#endif
				+		/* If playback has begun, top audio buffer off immediately. */
				+		//if(stateflag) audio_write_nonblocking();
				 
				-	/* are we at or past time for this video frame? */
				-	if(videobuf_ready && videobuf_time<=get_time()){
				+		/* are we at or past time for this video frame? */
				+		if(videobuf_ready && videobuf_time<=get_time()){
				 
				-		video_write();
				-		videobuf_ready=0;
				-	} else {
				-		//printf("frame at %f not ready (time %f), ready %i\n", (float)videobuf_time, get_time(), videobuf_ready);
				-	}
				+			//video_write();
				+			//videobuf_ready=0;
				+		} else {
				+			//printf("frame at %f not ready (time %f), ready %i\n", (float)videobuf_time, get_time(), videobuf_ready);
				+		}
				 
				-	float tdiff=videobuf_time-get_time();
				-	/*If we have lots of extra time, increase the post-processing level.*/
				-	if(tdiff>ti.fps_denominator*0.25/ti.fps_numerator){
				-		pp_inc=pp_level<pp_level_max?1:0;
				-	}
				-	else if(tdiff<ti.fps_denominator*0.05/ti.fps_numerator){
				-		pp_inc=pp_level>0?-1:0;
				+		float tdiff=videobuf_time-get_time();
				+		/*If we have lots of extra time, increase the post-processing level.*/
				+		if(tdiff>ti.fps_denominator*0.25/ti.fps_numerator){
				+			pp_inc=pp_level<pp_level_max?1:0;
				+		}
				+		else if(tdiff<ti.fps_denominator*0.05/ti.fps_numerator){
				+			pp_inc=pp_level>0?-1:0;
				+		}
				 	}
				-};
				 
				-bool VideoStreamTheora::_can_mix() const {
				+	video_write();
				 
				-	return !buffering;
				 };
			
 
				 
			
 
				-void VideoStreamTheora::play() {
			
 
				+
			
 
				+// Starts playback. The clock is reset to 0 only when playback was not
			
 
				+// already running. The "audio/video_delay_compensation_ms" setting is read
			
 
				+// on every call and stored in seconds.
			
 
				+void VideoStreamPlaybackTheora::play() {
			
 
				 
			
 
				 	if (!playing)
			
 
				-		last_update_time=0;
			
 
				+		time=0;
			
 
				 	playing = true;
			
 
				+	delay_compensation=Globals::get_singleton()->get("audio/video_delay_compensation_ms");
			
 
				+	delay_compensation/=1000.0;
			
 
				+
			
 
				 };
			
 
				 
			
 
				-void VideoStreamTheora::stop() {
			
 
				+// Stops playback and resets the clock to 0. When playback was running the
			
 
				+// decoder state is cleared and file_name is opened again through set_file().
			
 
				+void VideoStreamPlaybackTheora::stop() {
			
 
				 
			
 
				+	if (playing) {
			
 
				+		clear();
			
 
				+		set_file(file_name); //reset
			
 
				+	}
			
 
				 	playing = false;
			
 
				-	last_update_time=0;
			
 
				+	time=0;
			
 
				 };
			
 
				 
			
 
				-bool VideoStreamTheora::is_playing() const {
			
 
				+// Returns the current value of the `playing` flag.
			
 
				+bool VideoStreamPlaybackTheora::is_playing() const {
			
 
				 
			
 
				 	return playing;
			
 
				 };
			
 
				 
			
 
				-void VideoStreamTheora::set_paused(bool p_paused) {
			
 
				+// Pausing has no state of its own: it is stored as the inverse of p_paused
			
 
				+// in the `playing` flag.
			
 
				+void VideoStreamPlaybackTheora::set_paused(bool p_paused) {
			
 
				 
			
 
				 	playing = !p_paused;
			
 
				 };
			
 
				 
			
 
				-bool VideoStreamTheora::is_paused(bool p_paused) const {
				+// Reports whether playback is paused. set_paused() stores the inverse of
				+// the requested state in `playing`, so the paused state is the negation of
				+// that flag (returning `playing` itself gave the opposite answer).
				+// p_paused is not used.
				+bool VideoStreamPlaybackTheora::is_paused(bool p_paused) const {
				 
				-	return playing;
				+	return !playing;
				 };
			
 
				 
			
 
				-void VideoStreamTheora::set_loop(bool p_enable) {
			
 
				+// Looping is not implemented: p_enable is ignored.
			
 
				+void VideoStreamPlaybackTheora::set_loop(bool p_enable) {
			
 
				 
			
 
				 };
			
 
				 
			
 
				-bool VideoStreamTheora::has_loop() const {
			
 
				+// Always returns false.
			
 
				+bool VideoStreamPlaybackTheora::has_loop() const {
			
 
				 
			
 
				 	return false;
			
 
				 };
			
 
				 
			
 
				-float VideoStreamTheora::get_length() const {
			
 
				+// The stream length is not computed; always returns 0.
			
 
				+float VideoStreamPlaybackTheora::get_length() const {
			
 
				 
			
 
				 	return 0;
			
 
				 };
			
 
				 
			
 
				-String VideoStreamTheora::get_stream_name() const {
			
 
				+// Always returns an empty string.
			
 
				+String VideoStreamPlaybackTheora::get_stream_name() const {
			
 
				 
			
 
				 	return "";
			
 
				 };
			
 
				 
			
 
				-int VideoStreamTheora::get_loop_count() const {
			
 
				+// Always returns 0.
			
 
				+int VideoStreamPlaybackTheora::get_loop_count() const {
			
 
				 
			
 
				 	return 0;
			
 
				 };
			
 
				 
			
 
				-float VideoStreamTheora::get_pos() const {
			
 
				+// Returns the playback position, which is whatever get_time() reports.
			
 
				+float VideoStreamPlaybackTheora::get_pos() const {
			
 
				 
			
 
				 	return get_time();
			
 
				 };
			
 
				 
			
 
				-void VideoStreamTheora::seek_pos(float p_time) {
			
 
				+// Seeking is not implemented: p_time is ignored and nothing happens.
			
 
				+void VideoStreamPlaybackTheora::seek_pos(float p_time) {
			
 
				 
			
 
				 	// no
			
 
				 };
			
 
				 
			
 
				-VideoStreamTheora::VideoStreamTheora() {
			
 
				+// Stores the audio mix callback together with the opaque pointer that is
				+// to be passed along with it.
				+void VideoStreamPlaybackTheora::set_mix_callback(AudioMixCallback p_callback,void *p_userdata) {
				+	mix_udata=p_userdata;
				+	mix_callback=p_callback;
				+}
			
 
				+
			
 
				+// Returns the channel count stored in the Vorbis info struct `vi`.
				+int VideoStreamPlaybackTheora::get_channels() const{
				+	const int channel_count=vi.channels;
				+	return channel_count;
				+}
			
 
				+
			
 
				+// Audio track selection is not implemented: p_idx is ignored.
			
 
				+void VideoStreamPlaybackTheora::set_audio_track(int p_idx) {
			
 
				+
			
 
				+
			
 
				+}
			
 
				+
			
 
				+// Returns the sample rate stored in the Vorbis info struct `vi`.
				+int VideoStreamPlaybackTheora::get_mix_rate() const{
				+	const int sample_rate=vi.rate;
				+	return sample_rate;
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+VideoStreamPlaybackTheora::VideoStreamPlaybackTheora() {
			
 
				 
			
 
				 	file = NULL;
			
 
				 	theora_p = 0;
			
@@ -640,11 +696,15 @@ VideoStreamTheora::VideoStreamTheora() {
 
				 	playing = false;
			
 
				 	frames_pending = 0;
			
 
				 	videobuf_time = 0;
			
 
				-	last_update_time =0;
			
 
				+
			
 
				 	buffering=false;
			
 
				+	texture = Ref<ImageTexture>( memnew(ImageTexture ));
			
 
				+	mix_callback=NULL;
			
 
				+	mix_udata=NULL;
			
 
				+	delay_compensation=0;
			
 
				 };
			
 
				 
			
 
				-VideoStreamTheora::~VideoStreamTheora() {
			
 
				+VideoStreamPlaybackTheora::~VideoStreamPlaybackTheora() {
			
 
				 
			
 
				 	clear();
			
 
				 
			
@@ -653,10 +713,16 @@ VideoStreamTheora::~VideoStreamTheora() {
 
				 };
			
 
				 
			
 
				 
			
 
				-RES ResourceFormatLoaderVideoStreamTheora::load(const String &p_path,const String& p_original_path) {
			
 
				+// Creates a VideoStreamTheora resource and passes p_path to its set_file().
			
 
				+// When r_error is given it is set to ERR_FILE_CANT_OPEN on entry and to OK
			
 
				+// before returning; no check between the two assignments is visible here.
			
 
				+RES ResourceFormatLoaderVideoStreamTheora::load(const String &p_path,const String& p_original_path, Error *r_error) {
			
 
				+	if (r_error)
			
 
				+		*r_error=ERR_FILE_CANT_OPEN;
			
 
				 
			
 
				 	VideoStreamTheora *stream = memnew(VideoStreamTheora);
			
 
				 	stream->set_file(p_path);
			
 
				+
			
 
				+	if (r_error)
			
 
				+		*r_error=OK;
			
 
				+
			
 
				 	return Ref<VideoStreamTheora>(stream);
			
 
				 }
			
 
				 
			
@@ -666,16 +732,16 @@ void ResourceFormatLoaderVideoStreamTheora::get_recognized_extensions(List<Strin
 
				 	p_extensions->push_back("ogv");
			
 
				 }
			
 
				+// True when p_type is "VideoStream" or "VideoStreamTheora".
			
 
				 bool ResourceFormatLoaderVideoStreamTheora::handles_type(const String& p_type) const {
			
 
				-	return (p_type=="AudioStream" || p_type=="VideoStreamTheora");
			
 
				+	return (p_type=="VideoStream" || p_type=="VideoStreamTheora");
			
 
				 }
			
 
				 
			
 
				+// Returns "VideoStreamTheora" when the lower-cased extension of p_path is
			
 
				+// "ogm" or "ogv", otherwise an empty string.
			
 
				 String ResourceFormatLoaderVideoStreamTheora::get_resource_type(const String &p_path) const {
			
 
				 
			
 
				 	String exl=p_path.extension().to_lower();
			
 
				 	if (exl=="ogm" || exl=="ogv")
			
 
				-		return "AudioStreamTheora";
			
 
				+		return "VideoStreamTheora";
			
 
				 	return "";
			
 
				 }
			
 
				 
			
 
				 #endif
			
 
				-#endif
			
 
				+
			
--- a/drivers/theora/video_stream_theora.h
+++ b/drivers/theora/video_stream_theora.h
@@ -10,9 +10,9 @@
 
				 #include "io/resource_loader.h"
			
 
				 #include "scene/resources/video_stream.h"
			
 
				 
			
 
				-class VideoStreamTheora : public VideoStream {
			
 
				+class VideoStreamPlaybackTheora : public VideoStreamPlayback {
			
 
				 
			
 
				-	OBJ_TYPE(VideoStreamTheora, VideoStream);
			
 
				+	OBJ_TYPE(VideoStreamPlaybackTheora, VideoStreamPlayback);
			
 
				 
			
 
				 	enum {
			
 
				 		MAX_FRAMES = 4,
			
@@ -58,16 +58,17 @@ class VideoStreamTheora : public VideoStream {
 
				 
			
 
				 	double last_update_time;
			
 
				 	double time;
			
 
				+	double delay_compensation;
			
 
				 
			
 
				-protected:
			
 
				+	Ref<ImageTexture> texture;
			
 
				 
			
 
				-	virtual UpdateMode get_update_mode() const;
			
 
				-	virtual void update();
			
 
				+	AudioMixCallback mix_callback;
			
 
				+	void* mix_udata;
			
 
				 
			
 
				-	void clear();
			
 
				-
			
 
				-	virtual bool _can_mix() const;
			
 
				+protected:
			
 
				 
			
 
				+	void clear();
			
 
				+	
			
 
				 public:
			
 
				 
			
 
				 	virtual void play();
			
@@ -92,12 +93,36 @@ public:
 
				 
			
 
				 	void set_file(const String& p_file);
			
 
				 
			
 
				-	int get_pending_frame_count() const;
			
 
				-	Image pop_frame();
			
 
				-	Image peek_frame() const;
			
 
				+	virtual Ref<Texture> get_texture();
			
 
				+	virtual void update(float p_delta);
			
 
				+
			
 
				+	virtual void set_mix_callback(AudioMixCallback p_callback,void *p_userdata);
			
 
				+	virtual int get_channels() const;
			
 
				+	virtual int get_mix_rate() const;
			
 
				+
			
 
				+	virtual void set_audio_track(int p_idx);
			
 
				+
			
 
				+	VideoStreamPlaybackTheora();
			
 
				+	~VideoStreamPlaybackTheora();
			
 
				+};
			
 
				+
			
 
				+
			
 
				+
			
 
				+// Resource that only remembers a file path; the decoding itself is done by
			
 
				+// the VideoStreamPlaybackTheora objects it creates.
			
 
				+class VideoStreamTheora : public VideoStream {
			
 
				+
			
 
				+	OBJ_TYPE(VideoStreamTheora,VideoStream);
			
 
				+
			
 
				+	// path handed to every playback created by instance_playback()
			
 
				+	String file;
			
 
				+public:
			
 
				+
			
 
				+	// Creates a new VideoStreamPlaybackTheora and calls set_file(file) on it.
			
 
				+	Ref<VideoStreamPlayback> instance_playback() {
			
 
				+		Ref<VideoStreamPlaybackTheora> pb = memnew( VideoStreamPlaybackTheora );
			
 
				+		pb->set_file(file);
			
 
				+		return pb;
			
 
				+	}
			
 
				+
			
 
				+	// Stores the path only; nothing is opened here.
			
 
				+	void set_file(const String& p_file) { file=p_file; }
			
 
				 
			
 
				-	VideoStreamTheora();
			
 
				-	~VideoStreamTheora();
			
 
				 };
			
 
				 
			
 
				 class ResourceFormatLoaderVideoStreamTheora : public ResourceFormatLoader {
			
--- a/drivers/theoraplayer/SCsub
+++ b/drivers/theoraplayer/SCsub
@@ -1,106 +0,0 @@
 
				-Import("env")
			
 
				-
			
 
				-import string
			
 
				-
			
 
				-sources = string.split("""
			
 
				-src/TheoraVideoClip.cpp
			
 
				-src/FFmpeg/TheoraVideoClip_FFmpeg.cpp
			
 
				-src/TheoraAsync.cpp
			
 
				-src/TheoraAudioInterface.cpp
			
 
				-src/TheoraException.cpp
			
 
				-src/TheoraWorkerThread.cpp
			
 
				-src/TheoraVideoManager.cpp
			
 
				-src/TheoraTimer.cpp
			
 
				-src/TheoraUtil.cpp
			
 
				-src/TheoraDataSource.cpp
			
 
				-src/TheoraAudioPacketQueue.cpp
			
 
				-src/TheoraFrameQueue.cpp
			
 
				-src/Theora/TheoraVideoClip_Theora.cpp
			
 
				-src/YUV/yuv_util.c
			
 
				-src/YUV/libyuv/src/row_any.cc
			
 
				-src/YUV/libyuv/src/compare_common.cc
			
 
				-src/YUV/libyuv/src/scale_neon.cc
			
 
				-src/YUV/libyuv/src/planar_functions.cc
			
 
				-src/YUV/libyuv/src/compare.cc
			
 
				-src/YUV/libyuv/src/scale_mips.cc
			
 
				-src/YUV/libyuv/src/scale_posix.cc
			
 
				-src/YUV/libyuv/src/row_posix.cc
			
 
				-src/YUV/libyuv/src/row_win.cc
			
 
				-src/YUV/libyuv/src/compare_neon.cc
			
 
				-src/YUV/libyuv/src/convert_from_argb.cc
			
 
				-src/YUV/libyuv/src/mjpeg_validate.cc
			
 
				-src/YUV/libyuv/src/convert_from.cc
			
 
				-src/YUV/libyuv/src/rotate_neon.cc
			
 
				-src/YUV/libyuv/src/row_neon.cc
			
 
				-src/YUV/libyuv/src/rotate_mips.cc
			
 
				-src/YUV/libyuv/src/compare_posix.cc
			
 
				-src/YUV/libyuv/src/row_mips.cc
			
 
				-src/YUV/libyuv/src/scale.cc
			
 
				-src/YUV/libyuv/src/scale_argb.cc
			
 
				-src/YUV/libyuv/src/mjpeg_decoder.cc
			
 
				-src/YUV/libyuv/src/scale_win.cc
			
 
				-src/YUV/libyuv/src/scale_common.cc
			
 
				-src/YUV/libyuv/src/scale_argb_neon.cc
			
 
				-src/YUV/libyuv/src/row_common.cc
			
 
				-src/YUV/libyuv/src/convert.cc
			
 
				-src/YUV/libyuv/src/format_conversion.cc
			
 
				-src/YUV/libyuv/src/rotate_argb.cc
			
 
				-src/YUV/libyuv/src/rotate.cc
			
 
				-src/YUV/libyuv/src/convert_argb.cc
			
 
				-src/YUV/libyuv/src/cpu_id.cc
			
 
				-src/YUV/libyuv/src/video_common.cc
			
 
				-src/YUV/libyuv/src/convert_to_argb.cc
			
 
				-src/YUV/libyuv/src/compare_win.cc
			
 
				-src/YUV/libyuv/src/convert_to_i420.cc
			
 
				-src/YUV/libyuv/src/convert_jpeg.cc
			
 
				-src/YUV/libyuv/yuv_libyuv.c
			
 
				-src/YUV/android/cpu-features.c
			
 
				-src/YUV/C/yuv420_grey_c.c
			
 
				-src/YUV/C/yuv420_yuv_c.c
			
 
				-src/YUV/C/yuv420_rgb_c.c
			
 
				-src/TheoraVideoFrame.cpp
			
 
				-""")
			
 
				-
			
 
				-env_theora = env.Clone()
			
 
				-
			
 
				-if env["platform"] == "iphone":
			
 
				-	sources.append("src/AVFoundation/TheoraVideoClip_AVFoundation.mm")
			
 
				-	env.Append(LINKFLAGS=['-framework', 'CoreVideo', '-framework', 'CoreMedia', '-framework', 'AVFoundation'])
			
 
				-	if env["target"] == "release":
			
 
				-		env_theora.Append(CPPFLAGS=["-D_IOS", "-D__ARM_NEON__", "-fstrict-aliasing", "-fmessage-length=210", "-fdiagnostics-show-note-include-stack", "-fmacro-backtrace-limit=0", "-fcolor-diagnostics", "-Wno-trigraphs", "-fpascal-strings", "-fvisibility=hidden", "-fvisibility-inlines-hidden"])
			
 
				-
			
 
				-env_theora.Append(CPPFLAGS=["-D_LIB", "-D__THEORA"]) # removed -D_YUV_C
			
 
				-env_theora.Append(CPPFLAGS=["-D_YUV_LIBYUV"])
			
 
				-#env_theora.Append(CPPFLAGS=["-D_YUV_C"])
			
 
				-
			
 
				-if env["platform"] == "iphone":
			
 
				-	env_theora.Append(CPPFLAGS=["-D__AVFOUNDATION"])
			
 
				-else:
			
 
				-	pass
			
 
				-	#env_theora.Append(CPPFLAGS=["-D__FFMPEG"])
			
 
				-
			
 
				-if env["platform"] == "android":
			
 
				-	env_theora.Append(CPPFLAGS=["-D_ANDROID"])
			
 
				-
			
 
				-if env["platform"] == "winrt":
			
 
				-		env_theora.Append(CPPFLAGS=["-D_WINRT"])
			
 
				-
			
 
				-env_theora.Append(CPPPATH=["#drivers/theoraplayer/include/theoraplayer", "#drivers/theoraplayer/src/YUV", "#drivers/theoraplayer/src/YUV/libyuv/include", "#drivers/theoraplayer/src/Theora", "#drivers/theoraplayer/src/AVFoundation"])
			
 
				-
			
 
				-objs = []
			
 
				-
			
 
				-env_theora.add_source_files(objs, ["video_stream_theoraplayer.cpp"])
			
 
				-
			
 
				-if env['use_theoraplayer_binary'] == "yes":
			
 
				-	if env["platform"] == "iphone":
			
 
				-		env.Append(LIBPATH=['#drivers/theoraplayer/lib/ios'])
			
 
				-		env.Append(LIBS=['theoraplayer', 'ogg', 'theora', 'tremor'])
			
 
				-	if env["platform"] == "windows":
			
 
				-		env.Append(LIBPATH=['#drivers/theoraplayer/lib/windows'])
			
 
				-		env.Append(LINKFLAGS=['libtheoraplayer_static.lib', 'libogg.lib', 'libtheora.lib', 'libvorbis.lib'])
			
 
				-else:
			
 
				-	env_theora.add_source_files(objs, sources)
			
 
				-
			
 
				-env.drivers_sources += objs
			
 
				-
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraAsync.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraAsync.h
@@ -1,51 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraAsync_h
			
 
				-#define _TheoraAsync_h
			
 
				-
			
 
				-#ifndef _WIN32
			
 
				-#include <pthread.h>
			
 
				-#endif
			
 
				-
			
 
				-/// @note Based on hltypes::Thread
			
 
				-class TheoraMutex
			
 
				-{
			
 
				-public:
			
 
				-	TheoraMutex();
			
 
				-	~TheoraMutex();
			
 
				-	void lock();
			
 
				-	void unlock();
			
 
				-		
			
 
				-protected:
			
 
				-	void* mHandle;
			
 
				-		
			
 
				-};
			
 
				-
			
 
				-/// @note Based on hltypes::Thread
			
 
				-class TheoraThread
			
 
				-{
			
 
				-	TheoraMutex mRunningMutex;
			
 
				-public:
			
 
				-	TheoraThread();
			
 
				-	virtual ~TheoraThread();
			
 
				-	void start();
			
 
				-	void stop();
			
 
				-	void resume();
			
 
				-	void pause();
			
 
				-	bool isRunning();
			
 
				-	virtual void execute() = 0;
			
 
				-	void join();
			
 
				-		
			
 
				-protected:
			
 
				-	void* mId;
			
 
				-	volatile bool mRunning;
			
 
				-		
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraAudioInterface.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraAudioInterface.h
@@ -1,51 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraAudioInterface_h
			
 
				-#define _TheoraAudioInterface_h
			
 
				-
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-class TheoraVideoClip;
			
 
				-
			
 
				-
			
 
				-/**
			
 
				- This is the class that serves as an interface between the library's audio
			
 
				- output and the audio playback library of your choice.
			
 
				- The class gets mono or stereo PCM data in in floating point data
			
 
				- */
			
 
				-class TheoraPlayerExport TheoraAudioInterface
			
 
				-{
			
 
				-public:
			
 
				-	//! PCM frequency, usualy 44100 Hz
			
 
				-	int mFreq;
			
 
				-	//! Mono or stereo
			
 
				-	int mNumChannels;
			
 
				-	//! Pointer to the parent TheoraVideoClip object
			
 
				-	TheoraVideoClip* mClip;
			
 
				-	
			
 
				-	TheoraAudioInterface(TheoraVideoClip* owner, int nChannels, int freq);
			
 
				-	virtual ~TheoraAudioInterface();
			
 
				-    //! A function that the TheoraVideoClip object calls once more audio packets are decoded
			
 
				-    /*!
			
 
				-	 \param data contains one or two channels of float PCM data in the range [-1,1]
			
 
				-	 \param nSamples contains the number of samples that the data parameter contains in each channel
			
 
				-	 */
			
 
				-	virtual void insertData(float* data, int nSamples)=0;	
			
 
				-};
			
 
				-
			
 
				-class TheoraPlayerExport TheoraAudioInterfaceFactory
			
 
				-{
			
 
				-public:
			
 
				-	//! VideoManager calls this when creating a new TheoraVideoClip object
			
 
				-	virtual TheoraAudioInterface* createInstance(TheoraVideoClip* owner, int nChannels, int freq) = 0;
			
 
				-};
			
 
				-
			
 
				-
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraAudioPacketQueue.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraAudioPacketQueue.h
@@ -1,48 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraAudioPacketQueue_h
			
 
				-#define _TheoraAudioPacketQueue_h
			
 
				-
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-class TheoraAudioInterface;
			
 
				-/**
			
 
				- This is an internal structure which TheoraVideoClip_Theora uses to store audio packets
			
 
				- */
			
 
				-struct TheoraAudioPacket
			
 
				-{
			
 
				-	float* pcm;
			
 
				-	int numSamples; //! size in number of float samples (stereo has twice the number of samples)
			
 
				-	TheoraAudioPacket* next; // pointer to the next audio packet, to implement a linked list
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				-    This is a Mutex object, used in thread syncronization.
			
 
				- */
			
 
				-class TheoraPlayerExport TheoraAudioPacketQueue
			
 
				-{
			
 
				-protected:
			
 
				-	unsigned int mAudioFrequency, mNumAudioChannels;
			
 
				-	TheoraAudioPacket* mTheoraAudioPacketQueue;
			
 
				-	void _addAudioPacket(float* data, int numSamples);
			
 
				-public:
			
 
				-	TheoraAudioPacketQueue();
			
 
				-	~TheoraAudioPacketQueue();
			
 
				-	
			
 
				-	float getAudioPacketQueueLength();
			
 
				-	void addAudioPacket(float** buffer, int numSamples, float gain);
			
 
				-	void addAudioPacket(float* buffer, int numSamples, float gain);
			
 
				-	TheoraAudioPacket* popAudioPacket();
			
 
				-	void destroyAudioPacket(TheoraAudioPacket* p);
			
 
				-	void destroyAllAudioPackets();
			
 
				-	
			
 
				-	void flushAudioPackets(TheoraAudioInterface* audioInterface);
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraDataSource.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraDataSource.h
@@ -1,89 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraDataSource_h
			
 
				-#define _TheoraDataSource_h
			
 
				-
			
 
				-#include <stdio.h>
			
 
				-#include <string>
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-/**
			
 
				-	This is a simple class that provides abstracted data feeding. You can use the
			
 
				-	TheoraFileDataSource for regular file playback or you can implement your own
			
 
				-	internet streaming solution, or a class that uses encrypted datafiles etc.
			
 
				-	The sky is the limit
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraDataSource
			
 
				-{
			
 
				-public:
			
 
				-
			
 
				-    virtual ~TheoraDataSource();
			
 
				-	/**
			
 
				-		Reads nBytes bytes from data source and returns number of read bytes.
			
 
				-		if function returns less bytes then nBytes, the system assumes EOF is reached.
			
 
				-	*/
			
 
				-	virtual int read(void* output,int nBytes)=0;
			
 
				-    //! returns a string representation of the DataSource, eg 'File: source.ogg'
			
 
				-	virtual std::string repr()=0;
			
 
				-	//! position the source pointer to byte_index from the start of the source
			
 
				-	virtual void seek(unsigned long byte_index)=0;
			
 
				-	//! return the size of the stream in bytes
			
 
				-	virtual unsigned long size()=0;
			
 
				-	//! return the current position of the source pointer
			
 
				-	virtual unsigned long tell()=0;
			
 
				-};
			
 
				-
			
 
				-
			
 
				-/**
			
 
				-	provides standard file IO
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraFileDataSource : public TheoraDataSource
			
 
				-{
			
 
				-	FILE* mFilePtr;
			
 
				-	std::string mFilename;
			
 
				-	unsigned long mSize;
			
 
				-	
			
 
				-	void openFile();
			
 
				-public:
			
 
				-	TheoraFileDataSource(std::string filename);
			
 
				-	~TheoraFileDataSource();
			
 
				-
			
 
				-	int read(void* output,int nBytes);
			
 
				-	void seek(unsigned long byte_index);
			
 
				-	std::string repr() { return mFilename; }
			
 
				-	unsigned long size();
			
 
				-	unsigned long tell();
			
 
				-	
			
 
				-	std::string getFilename() { return mFilename; }
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				-	Pre-loads the entire file and streams from memory.
			
 
				-	Very useful if you're continuously displaying a video and want to avoid disk reads.
			
 
				-	Not very practical for large files.
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraMemoryFileDataSource : public TheoraDataSource
			
 
				-{
			
 
				-	std::string mFilename;
			
 
				-	unsigned long mSize, mReadPointer;
			
 
				-	unsigned char* mData;
			
 
				-public:
			
 
				-	TheoraMemoryFileDataSource(unsigned char* data, long size, const std::string& filename = "memory");
			
 
				-	TheoraMemoryFileDataSource(std::string filename);
			
 
				-	~TheoraMemoryFileDataSource();
			
 
				-
			
 
				-	int read(void* output,int nBytes);
			
 
				-	void seek(unsigned long byte_index);
			
 
				-	std::string repr() { return "MEM:"+mFilename; }
			
 
				-	unsigned long size();
			
 
				-	unsigned long tell();
			
 
				-	std::string getFilename() { return mFilename; }
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraException.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraException.h
@@ -1,46 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef EXCEPTION_H
			
 
				-#define EXCEPTION_H
			
 
				-
			
 
				-#include <string>
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-class TheoraPlayerExport _TheoraGenericException
			
 
				-{
			
 
				-public:
			
 
				-    std::string mErrText,mFile,mType;
			
 
				-	int mLineNumber;
			
 
				-
			
 
				-	_TheoraGenericException(const std::string& errorText, std::string type = "",std::string file = "", int line = 0);
			
 
				-    virtual ~_TheoraGenericException() {}
			
 
				-    
			
 
				-	virtual std::string repr();
			
 
				-    
			
 
				-	void writeOutput();
			
 
				-	
			
 
				-	virtual const std::string& getErrorText() { return mErrText; }
			
 
				-    
			
 
				-	const std::string getType(){ return mType; }
			
 
				-};
			
 
				-
			
 
				-#define TheoraGenericException(msg) _TheoraGenericException(msg, "TheoraGenericException", __FILE__, __LINE__)
			
 
				-
			
 
				-
			
 
				-#define exception_cls(name) class name : public _TheoraGenericException \
			
 
				-{ \
			
 
				-public: \
			
 
				-	name(const std::string& errorText,std::string type = "",std::string file = "",int line = 0) : \
			
 
				-	  _TheoraGenericException(errorText, type, file, line){} \
			
 
				-}
			
 
				-
			
 
				-exception_cls(_KeyException);
			
 
				-
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraExport.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraExport.h
@@ -1,38 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _theoraVideoExport_h
			
 
				-#define _theoraVideoExport_h
			
 
				-
			
 
				-	#ifdef _LIB
			
 
				-		#define TheoraPlayerExport
			
 
				-		#define TheoraPlayerFnExport
			
 
				-	#else
			
 
				-		#ifdef _WIN32
			
 
				-			#ifdef THEORAVIDEO_EXPORTS
			
 
				-				#define TheoraPlayerExport __declspec(dllexport)
			
 
				-				#define TheoraPlayerFnExport __declspec(dllexport)
			
 
				-			#else
			
 
				-				#define TheoraPlayerExport __declspec(dllimport)
			
 
				-				#define TheoraPlayerFnExport __declspec(dllimport)
			
 
				-			#endif
			
 
				-		#else
			
 
				-			#define TheoraPlayerExport __attribute__ ((visibility("default")))
			
 
				-			#define TheoraPlayerFnExport __attribute__ ((visibility("default")))
			
 
				-		#endif
			
 
				-	#endif
			
 
				-	#ifndef DEPRECATED_ATTRIBUTE
			
 
				-		#ifdef _MSC_VER
			
 
				-			#define DEPRECATED_ATTRIBUTE __declspec(deprecated("function is deprecated"))
			
 
				-		#else
			
 
				-			#define DEPRECATED_ATTRIBUTE __attribute__((deprecated))
			
 
				-		#endif
			
 
				-	#endif
			
 
				-
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraFrameQueue.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraFrameQueue.h
@@ -1,95 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-
			
 
				-#ifndef _TheoraFrameQueue_h
			
 
				-#define _TheoraFrameQueue_h
			
 
				-
			
 
				-#include "TheoraAsync.h"
			
 
				-#include <list>
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-class TheoraVideoFrame;
			
 
				-class TheoraVideoClip;
			
 
				-
			
 
				-/**
			
 
				-	This class handles the frame queue. contains frames and handles their alloctation/deallocation
			
 
				-	it is designed to be thread-safe
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraFrameQueue
			
 
				-{
			
 
				-protected:
			
 
				-	std::list<TheoraVideoFrame*> mQueue;
			
 
				-	TheoraVideoClip* mParent;
			
 
				-	TheoraMutex mMutex;
			
 
				-	
			
 
				-	//! implementation function that returns a TheoraVideoFrame instance
			
 
				-	TheoraVideoFrame* createFrameInstance(TheoraVideoClip* clip);
			
 
				-public:
			
 
				-	TheoraFrameQueue(TheoraVideoClip* parent);
			
 
				-	~TheoraFrameQueue();
			
 
				-
			
 
				-	/**
			
 
				-	    \brief Returns the first available frame in the queue or NULL if no frames are available.
			
 
				-
			
 
				-		This function DOES NOT remove the frame from the queue, you have to do it manually
			
 
				-		when you want to mark the frame as used by calling the pop() function.
			
 
				-	*/
			
 
				-	TheoraVideoFrame* getFirstAvailableFrame();
			
 
				-    //! non-mutex version
			
 
				-	TheoraVideoFrame* _getFirstAvailableFrame();
			
 
				-
			
 
				-	//! return the number of used (not ready) frames
			
 
				-	int getUsedCount();
			
 
				-
			
 
				-	//! return the number of ready frames
			
 
				-	int getReadyCount();
			
 
				-    //! non-mutex version
			
 
				-	int _getReadyCount();
			
 
				-
			
 
				-	/**
			
 
				-	    \brief remove the first N available frame from the queue.
			
 
				-
			
 
				-	    Use this every time you display a frame	so you can get the next one when the time comes.
			
 
				-		This function marks the frame on the front of the queue as unused and it's memory then
			
 
				-		get's used again in the decoding process.
			
 
				-		If you don't call this, the frame queue will fill up with precached frames up to the
			
 
				-		specified amount in the TheoraVideoManager class and you won't be able to advance the video.
			
 
				-	*/
			
 
				-	void pop(int n = 1);
			
 
				-    
			
 
				-    //! This is an internal _pop function. use externally only in combination with lock() / unlock() calls
			
 
				-	void _pop(int n);
			
 
				-
			
 
				-	//! frees all decoded frames for reuse (does not destroy memory, just marks them as free)
			
 
				-	void clear();
			
 
				-	//! Called by WorkerThreads when they need to unload frame data, do not call directly!
			
 
				-	TheoraVideoFrame* requestEmptyFrame();
			
 
				-
			
 
				-	/** 
			
 
				-	    \brief set's the size of the frame queue.
			
 
				-		
			
 
				-		Beware, currently stored ready frames will be lost upon this call
			
 
				-	*/
			
 
				-	void setSize(int n);
			
 
				-	//! return the size of the queue
			
 
				-	int getSize();
			
 
				-	
			
 
				-	//! return whether all frames in the queue are ready for display
			
 
				-	bool isFull();
			
 
				-	
			
 
				-	//! lock the queue's mutex manually
			
 
				-	void lock();
			
 
				-	//! unlock the queue's mutex manually
			
 
				-	void unlock();
			
 
				-    
			
 
				-    //! returns the internal frame queue. Warning: Always lock / unlock queue's mutex before accessing frames directly!
			
 
				-    std::list<TheoraVideoFrame*>& _getFrameQueue();
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraPixelTransform.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraPixelTransform.h
@@ -1,18 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraPixelTransform_h
			
 
				-#define _TheoraPixelTransform_h
			
 
				-
			
 
				-struct TheoraPixelTransform
			
 
				-{
			
 
				-	unsigned char *raw, *y, *u, *v, *out;
			
 
				-	unsigned int w, h, rawStride, yStride, uStride, vStride;
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraPlayer.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraPlayer.h
@@ -1,17 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraPlayer_h
			
 
				-#define _TheoraPlayer_h
			
 
				-
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraTimer.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraTimer.h
@@ -1,69 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-
			
 
				-#ifndef _TheoraTimer_h
			
 
				-#define _TheoraTimer_h
			
 
				-
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-/**
			
 
				-    This is a Timer object, it is used to control the playback of a TheoraVideoClip.
			
 
				-
			
 
				-	You can inherit this class and make a timer that eg. plays twice as fast,
			
 
				-	or playbacks an audio track and uses it's time offset for syncronizing Video etc.
			
 
				- */
			
 
				-class TheoraPlayerExport TheoraTimer
			
 
				-{
			
 
				-protected:
			
 
				-	//! Current time in seconds
			
 
				-	float mTime,mSpeed;
			
 
				-	//! Is the timer paused or not
			
 
				-	bool mPaused;
			
 
				-public:
			
 
				-	TheoraTimer();
			
 
				-	virtual ~TheoraTimer();
			
 
				-
			
 
				-	virtual float getTime();
			
 
				-	/**
			
 
				-	    \brief advance the time.
			
 
				-
			
 
				-		If you're using another synronization system, eg. an audio track,
			
 
				-		then you can ignore this call or use it to perform other updates.
			
 
				-
			
 
				-		NOTE: this is called by TheoraVideoManager from the main thread
			
 
				-	 */
			
 
				-	virtual void update(float timeDelta);
			
 
				-
			
 
				-	virtual void pause();
			
 
				-	virtual void play();
			
 
				-	virtual bool isPaused();
			
 
				-	virtual void stop();
			
 
				-	/**
			
 
				-	    \brief set's playback speed
			
 
				-
			
 
				-        1.0 is the default. The speed factor multiplies time advance, thus
			
 
				-        setting the value higher will increase playback speed etc.
			
 
				-    
			
 
				-        NOTE: depending on Timer implementation, it may not support setting the speed
			
 
				-         
			
 
				-	 */
			
 
				-    virtual void setSpeed(float speed);
			
 
				-    //! return the update speed 1.0 is the default
			
 
				-    virtual float getSpeed();
			
 
				-
			
 
				-	/**
			
 
				-	    \brief change the current time.
			
 
				-
			
 
				-		if you're using another syncronization mechanism, make sure to adjust
			
 
				-		the time offset there
			
 
				-	 */
			
 
				-	virtual void seek(float time);
			
 
				-};
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraUtil.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraUtil.h
@@ -1,32 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraUtil_h
			
 
				-#define _TheoraUtil_h
			
 
				-
			
 
				-#include <string>
			
 
				-#include <vector>
			
 
				-
			
 
				-#ifndef THEORAUTIL_NOMACROS
			
 
				-
			
 
				-#define foreach(type,lst) for (std::vector<type>::iterator it=lst.begin();it != lst.end(); ++it)
			
 
				-#define foreach_l(type,lst) for (std::list<type>::iterator it=lst.begin();it != lst.end(); ++it)
			
 
				-#define foreach_r(type,lst) for (std::vector<type>::reverse_iterator it=lst.rbegin();it != lst.rend(); ++it)
			
 
				-#define foreach_in_map(type,lst) for (std::map<std::string,type>::iterator it=lst.begin();it != lst.end(); ++it)
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-#define th_writelog(x) TheoraVideoManager::getSingleton().logMessage(x)
			
 
				-
			
 
				-
			
 
				-std::string str(int i);
			
 
				-std::string strf(float i);
			
 
				-void _psleep(int milliseconds);
			
 
				-int _nextPow2(int x);
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraVideoClip.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraVideoClip.h
@@ -1,282 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-
			
 
				-#ifndef _TheoraVideoClip_h
			
 
				-#define _TheoraVideoClip_h
			
 
				-
			
 
				-#include <string>
			
 
				-#include "TheoraExport.h"
			
 
				-
			
 
				-// forward class declarations
			
 
				-class TheoraMutex;
			
 
				-class TheoraFrameQueue;
			
 
				-class TheoraTimer;
			
 
				-class TheoraAudioInterface;
			
 
				-class TheoraWorkerThread;
			
 
				-class TheoraDataSource;
			
 
				-class TheoraVideoFrame;
			
 
				-
			
 
				-/**
			
 
				-    format of the TheoraVideoFrame pixels. Affects decoding time
			
 
				- */
			
 
				-enum TheoraOutputMode
			
 
				-{
			
 
				-	// A = full alpha (255), order of letters represents the byte order for a pixel
			
 
				-	// A means the image is treated as if it contains an alpha channel, while X formats
			
 
				-	// just mean that RGB frame is transformed to a 4 byte format
			
 
				-	TH_UNDEFINED = 0,
			
 
				-	TH_RGB    =  1,
			
 
				-	TH_RGBA   =  2,
			
 
				-	TH_RGBX   =  3,
			
 
				-	TH_ARGB   =  4,
			
 
				-	TH_XRGB   =  5,
			
 
				-	TH_BGR    =  6,
			
 
				-	TH_BGRA   =  7,
			
 
				-	TH_BGRX   =  8,
			
 
				-	TH_ABGR   =  9,
			
 
				-	TH_XBGR   = 10,
			
 
				-	TH_GREY   = 11,
			
 
				-	TH_GREY3  = 12,
			
 
				-	TH_GREY3A = 13,
			
 
				-	TH_GREY3X = 14,
			
 
				-	TH_AGREY3 = 15,
			
 
				-	TH_XGREY3 = 16,
			
 
				-	TH_YUV    = 17,
			
 
				-	TH_YUVA   = 18,
			
 
				-	TH_YUVX   = 19,
			
 
				-	TH_AYUV   = 20,
			
 
				-	TH_XYUV   = 21
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				-	This object contains all data related to video playback, eg. the open source file,
			
 
				-	the frame queue etc.
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraVideoClip
			
 
				-{
			
 
				-	friend class TheoraWorkerThread;
			
 
				-	friend class TheoraVideoFrame;
			
 
				-	friend class TheoraVideoManager;
			
 
				-protected:
			
 
				-	TheoraFrameQueue* mFrameQueue;
			
 
				-	TheoraAudioInterface* mAudioInterface;
			
 
				-	TheoraDataSource* mStream;
			
 
				-
			
 
				-	TheoraTimer *mTimer, *mDefaultTimer;
			
 
				-
			
 
				-	TheoraWorkerThread* mAssignedWorkerThread;
			
 
				-	
			
 
				-	bool mUseAlpha;
			
 
				-	
			
 
				-	bool mWaitingForCache;
			
 
				-	
			
 
				-	// benchmark vars
			
 
				-	int mNumDroppedFrames, mNumDisplayedFrames, mNumPrecachedFrames;
			
 
				-
			
 
				-	int mThreadAccessCount; //! counter used by TheoraVideoManager to schedule workload
			
 
				-	
			
 
				-	int mSeekFrame; //! stores desired seek position as a frame number. next worker thread will do the seeking and reset this var to -1
			
 
				-	float mDuration, mFrameDuration, mFPS;
			
 
				-	float mPriority; //! User assigned priority. Default value is 1
			
 
				-    std::string mName;
			
 
				-	int mWidth, mHeight, mStride;
			
 
				-	int mNumFrames;
			
 
				-	int audio_track;
			
 
				-
			
 
				-	int mSubFrameWidth, mSubFrameHeight, mSubFrameOffsetX, mSubFrameOffsetY;
			
 
				-	float mAudioGain; //! multiplier for audio samples. between 0 and 1
			
 
				-
			
 
				-	TheoraOutputMode mOutputMode, mRequestedOutputMode;
			
 
				-	bool mFirstFrameDisplayed;
			
 
				-	bool mAutoRestart;
			
 
				-	bool mEndOfFile, mRestarted;
			
 
				-	int mIteration, mPlaybackIteration; //! used to ensure smooth playback of looping videos
			
 
				-
			
 
				-	TheoraMutex* mAudioMutex; //! syncs audio decoding and extraction
			
 
				-	TheoraMutex* mThreadAccessMutex;
			
 
				-	
			
 
				-	/**
			
 
				-	 * Get the priority of a video clip. based on a forumula that includes user
			
 
				-	 * priority factor, whether the video is paused or not, how many precached
			
 
				-	 * frames it has etc.
			
 
				-	 * This function is used in TheoraVideoManager to efficiently distribute job
			
 
				-	 * assignments among worker threads
			
 
				-	 * @return priority number of this video clip
			
 
				-	 */
			
 
				-	int calculatePriority();
			
 
				-	void readTheoraVorbisHeaders();
			
 
				-	virtual void doSeek() = 0; //! called by WorkerThread to seek to mSeekFrame
			
 
				-	virtual bool _readData() = 0;
			
 
				-	bool isBusy();
			
 
				-
			
 
				-	/**
			
 
				-	 * decodes audio from the vorbis stream and stores it in audio packets
			
 
				-	 * This is an internal function of TheoraVideoClip, called regularly if playing an
			
 
				-	 * audio enabled video clip.
			
 
				-	 * @return last decoded timestamp (if found in decoded packet's granule position)
			
 
				-	 */
			
 
				-	virtual float decodeAudio() = 0;
			
 
				-    
			
 
				-    int _getNumReadyFrames();
			
 
				-    void resetFrameQueue();
			
 
				-    int discardOutdatedFrames(float absTime);
			
 
				-    float getAbsPlaybackTime();
			
 
				-	virtual void load(TheoraDataSource* source) = 0;
			
 
				-
			
 
				-	virtual void _restart() = 0; // resets the decoder and stream but leaves the frame queue intact
			
 
				-public:
			
 
				-	TheoraVideoClip(TheoraDataSource* data_source,
			
 
				-		            TheoraOutputMode output_mode,
			
 
				-					int nPrecachedFrames,
			
 
				-					bool usePower2Stride);
			
 
				-	virtual ~TheoraVideoClip();
			
 
				-
			
 
				-	std::string getName();
			
 
				-	//! Returns the string name of the decoder backend (eg. Theora, AVFoundation)
			
 
				-	virtual std::string getDecoderName() = 0;
			
 
				-
			
 
				-	//! benchmark function
			
 
				-	int getNumDisplayedFrames() { return mNumDisplayedFrames; }
			
 
				-	//! benchmark function
			
 
				-	int getNumDroppedFrames() { return mNumDroppedFrames; }
			
 
				-
			
 
				-	//! return width in pixels of the video clip
			
 
				-	int getWidth();
			
 
				-	//! return height in pixels of the video clip
			
 
				-	int getHeight();
			
 
				-    
			
 
				-    //! Width of the actual picture inside a video frame (depending on implementation, this may be equal to mWidth or differ within a codec block size (usually 16))
			
 
				-    int getSubFrameWidth();
			
 
				-    //! Height of the actual picture inside a video frame (depending on implementation, this may be equal to mHeight or differ within a codec block size (usually 16))
			
 
				-	int getSubFrameHeight();
			
 
				-    //! X Offset of the actual picture inside a video frame (depending on implementation, this may be 0 or within a codec block size (usually 16))
			
 
				-	int getSubFrameOffsetX();
			
 
				-    //! Y Offset of the actual picture inside a video frame (depending on implementation, this may be 0 or differ within a codec block size (usually 16))
			
 
				-	int getSubFrameOffsetY();
			
 
				-    /**
			
 
				-	    \brief return stride in pixels
			
 
				-
			
 
				-		If you've specified usePower2Stride when creating the TheoraVideoClip object
			
 
				-		then this value will be the next power of two size compared to width,
			
 
				-		eg: w=376, stride=512.
			
 
				-
			
 
				-		Otherwise, stride will be equal to width
			
 
				-	 */
			
 
				-	int getStride() { return mStride; }
			
 
				-
			
 
				-	//! retur the timer objet associated with this object
			
 
				-	TheoraTimer* getTimer();
			
 
				-	//! replace the timer object with a new one
			
 
				-	void setTimer(TheoraTimer* timer);
			
 
				-
			
 
				-	//! used by TheoraWorkerThread, do not call directly
			
 
				-	virtual bool decodeNextFrame() = 0;
			
 
				-
			
 
				-	//! advance time. TheoraVideoManager calls this
			
 
				-	void update(float timeDelta);
			
 
				-	/**
			
 
				-	    \brief update timer to the display time of the next frame
			
 
				-
			
 
				-		useful if you want to grab frames instead of regular display
			
 
				-		\return time advanced. 0 if no frames are ready
			
 
				-	*/
			
 
				-	float updateToNextFrame();
			
 
				-
			
 
				-	
			
 
				-	TheoraFrameQueue* getFrameQueue();
			
 
				-	
			
 
				-	/**
			
 
				-	    \brief pop the frame from the front of the FrameQueue
			
 
				-
			
 
				-		see TheoraFrameQueue::pop() for more details
			
 
				-	 */
			
 
				-	void popFrame();
			
 
				-
			
 
				-	/**
			
 
				-	    \brief Returns the first available frame in the queue or NULL if no frames are available.
			
 
				-
			
 
				-		see TheoraFrameQueue::getFirstAvailableFrame() for more details
			
 
				-	*/
			
 
				-	TheoraVideoFrame* getNextFrame();
			
 
				-	/**
			
 
				-	    check if there is enough audio data decoded to submit to the audio interface
			
 
				-
			
 
				-		TheoraWorkerThread calls this
			
 
				-	 */
			
 
				-	virtual void decodedAudioCheck() = 0;
			
 
				-
			
 
				-	void setAudioInterface(TheoraAudioInterface* iface);
			
 
				-	TheoraAudioInterface* getAudioInterface();
			
 
				-
			
 
				-	/**
			
 
				-	    \brief resize the frame queues
			
 
				-
			
 
				-		Warning: this call discards ready frames in the frame queue
			
 
				-	 */
			
 
				-	void setNumPrecachedFrames(int n);
			
 
				-	//! returns the size of the frame queue
			
 
				-	int getNumPrecachedFrames();
			
 
				-	//! returns the number of ready frames in the frame queue
			
 
				-	int getNumReadyFrames();
			
 
				-
			
 
				-	//! if you want to adjust the audio gain. range [0,1]
			
 
				-	void setAudioGain(float gain);
			
 
				-	float getAudioGain();
			
 
				-
			
 
				-	//! if you want the video to automatically and smoothly restart when the last frame is reached
			
 
				-	void setAutoRestart(bool value);
			
 
				-	bool getAutoRestart() { return mAutoRestart; }
			
 
				-
			
 
				-
			
 
				-	void set_audio_track(int p_track) { audio_track=p_track; }
			
 
				-
			
 
				-	/**
			
 
				-	    TODO: user priority. Useful only when more than one video is being decoded
			
 
				-	 */
			
 
				-	void setPriority(float priority);
			
 
				-	float getPriority();
			
 
				-
			
 
				-	//! Used by TheoraVideoManager to schedule work
			
 
				-	float getPriorityIndex();
			
 
				-
			
 
				-	//! get the current time index from the timer object
			
 
				-	float getTimePosition();
			
 
				-	//! get the duration of the movie in seconds
			
 
				-	float getDuration();
			
 
				-	//! return the clips' frame rate, warning, fps can be a non integer number!
			
 
				-	float getFPS();
			
 
				-	//! get the number of frames in this movie
			
 
				-	int getNumFrames() { return mNumFrames; }
			
 
				-
			
 
				-	//! return the current output mode for this video object
			
 
				-	TheoraOutputMode getOutputMode();
			
 
				-	/**
			
 
				-	    set a new output mode
			
 
				-
			
 
				-		Warning: this discards the frame queue. ready frames will be lost.
			
 
				-	 */
			
 
				-	void setOutputMode(TheoraOutputMode mode);
			
 
				-
			
 
				-    bool isDone();
			
 
				-	void play();
			
 
				-	void pause();
			
 
				-	void restart();
			
 
				-	bool isPaused();
			
 
				-	void stop();
			
 
				-    void setPlaybackSpeed(float speed);
			
 
				-    float getPlaybackSpeed();
			
 
				-	//! seek to a given time position
			
 
				-	void seek(float time);
			
 
				-	//! seek to a given frame number
			
 
				-	void seekToFrame(int frame);
			
 
				-	//! wait max_time for the clip to cache a given percentage of frames, factor in range [0,1]
			
 
				-	void waitForCache(float desired_cache_factor = 0.5f, float max_wait_time = 1.0f);
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraVideoFrame.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraVideoFrame.h
@@ -1,56 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraVideoFrame_h
			
 
				-#define _TheoraVideoFrame_h
			
 
				-
			
 
				-#include "TheoraExport.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-
			
 
				-struct TheoraPixelTransform;
			
 
				-/**
			
 
				-	
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraVideoFrame
			
 
				-{
			
 
				-protected:
			
 
				-	TheoraVideoClip* mParent;
			
 
				-	unsigned char* mBuffer;
			
 
				-	unsigned long mFrameNumber;
			
 
				-public:
			
 
				-	//! global time in seconds this frame should be displayed on
			
 
				-	float mTimeToDisplay;
			
 
				-	//! whether the frame is ready for display or not
			
 
				-	bool mReady;
			
 
				-	//! indicates the frame is being used by TheoraWorkerThread instance
			
 
				-	bool mInUse;
			
 
				-	//! used to keep track of linear time in looping videos
			
 
				-	int mIteration;
			
 
				-	
			
 
				-	int mBpp;
			
 
				-
			
 
				-	TheoraVideoFrame(TheoraVideoClip* parent);
			
 
				-	virtual ~TheoraVideoFrame();
			
 
				-
			
 
				-	//! internal function, do not use directly
			
 
				-	void _setFrameNumber(unsigned long number) { mFrameNumber = number; }
			
 
				-	//! returns the frame number of this frame in the theora stream
			
 
				-	unsigned long getFrameNumber() { return mFrameNumber; }
			
 
				-
			
 
				-	void clear();
			
 
				-
			
 
				-	int getWidth();
			
 
				-	int getStride();
			
 
				-	int getHeight();
			
 
				-
			
 
				-	unsigned char* getBuffer();
			
 
				-
			
 
				-	//! Called by TheoraVideoClip to decode a source buffer onto itself
			
 
				-	virtual void decode(struct TheoraPixelTransform* t);
			
 
				-};
			
 
				-#endif
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraVideoManager.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraVideoManager.h
@@ -1,110 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-
			
 
				-#ifndef _TheoraVideoManager_h
			
 
				-#define _TheoraVideoManager_h
			
 
				-
			
 
				-#include <vector>
			
 
				-#include <list>
			
 
				-#include <string>
			
 
				-#include "TheoraExport.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#ifdef _WIN32
			
 
				-#pragma warning( disable: 4251 ) // MSVC++
			
 
				-#endif
			
 
				-// forward class declarations
			
 
				-class TheoraWorkerThread;
			
 
				-class TheoraMutex;
			
 
				-class TheoraDataSource;
			
 
				-class TheoraAudioInterfaceFactory;
			
 
				-/**
			
 
				-	This is the main singleton class that handles all playback/sync operations
			
 
				-*/
			
 
				-class TheoraPlayerExport TheoraVideoManager
			
 
				-{
			
 
				-protected:
			
 
				-	friend class TheoraWorkerThread;
			
 
				-	typedef std::vector<TheoraVideoClip*> ClipList;
			
 
				-	typedef std::vector<TheoraWorkerThread*> ThreadList;
			
 
				-
			
 
				-	//! stores pointers to worker threads which are decoding video and audio
			
 
				-	ThreadList mWorkerThreads;
			
 
				-	//! stores pointers to created video clips
			
 
				-	ClipList mClips;
			
 
				-	
			
 
				-	//! stores pointer to clips that were docoded in the past in order to achieve fair scheduling
			
 
				-	std::list<TheoraVideoClip*> mWorkLog;
			
 
				-
			
 
				-	int mDefaultNumPrecachedFrames;
			
 
				-
			
 
				-	TheoraMutex* mWorkMutex;
			
 
				-	TheoraAudioInterfaceFactory* mAudioFactory;
			
 
				-
			
 
				-	void createWorkerThreads(int n);
			
 
				-	void destroyWorkerThreads();
			
 
				-	
			
 
				-	float calcClipWorkTime(TheoraVideoClip* clip);
			
 
				-
			
 
				-	/**
			
 
				-	 * Called by TheoraWorkerThread to request a TheoraVideoClip instance to work on decoding
			
 
				-	 */
			
 
				-	TheoraVideoClip* requestWork(TheoraWorkerThread* caller);
			
 
				-public:
			
 
				-	TheoraVideoManager(int num_worker_threads=1);
			
 
				-	virtual ~TheoraVideoManager();
			
 
				-
			
 
				-	//! get the global reference to the manager instance
			
 
				-	static TheoraVideoManager& getSingleton();
			
 
				-	//! get the global pointer to the manager instance
			
 
				-	static TheoraVideoManager* getSingletonPtr();
			
 
				-
			
 
				-	//! search registered clips by name
			
 
				-	TheoraVideoClip* getVideoClipByName(std::string name);
			
 
				-
			
 
				-	TheoraVideoClip* createVideoClip(std::string filename,TheoraOutputMode output_mode=TH_RGB,int numPrecachedOverride=0,bool usePower2Stride=0, int p_track=0);
			
 
				-	TheoraVideoClip* createVideoClip(TheoraDataSource* data_source,TheoraOutputMode output_mode=TH_RGB,int numPrecachedOverride=0,bool usePower2Stride=0, int p_audio_track=0);
			
 
				-
			
 
				-	void update(float timeDelta);
			
 
				-
			
 
				-	void destroyVideoClip(TheoraVideoClip* clip);
			
 
				-
			
 
				-	void setAudioInterfaceFactory(TheoraAudioInterfaceFactory* factory);
			
 
				-	TheoraAudioInterfaceFactory* getAudioInterfaceFactory();
			
 
				-
			
 
				-	int getNumWorkerThreads();
			
 
				-	void setNumWorkerThreads(int n);
			
 
				-
			
 
				-	void setDefaultNumPrecachedFrames(int n) { mDefaultNumPrecachedFrames=n; }
			
 
				-	int getDefaultNumPrecachedFrames() { return mDefaultNumPrecachedFrames; }
			
 
				-
			
 
				-	//! used by libtheoraplayer functions
			
 
				-	void logMessage(std::string msg);
			
 
				-
			
 
				-	/**
			
 
				-		\brief you can set your own log function to recieve theora's log calls
			
 
				-
			
 
				-		This way you can integrate libtheoraplayer's log messages in your own
			
 
				-		logging system, prefix them, mute them or whatever you want
			
 
				-	 */
			
 
				-	static void setLogFunction(void (*fn)(std::string));
			
 
				-
			
 
				-	//! get nicely formated version string
			
 
				-	std::string getVersionString();
			
 
				-	/**
			
 
				-	    \brief get version numbers
			
 
				-
			
 
				-		if c is negative, it means it's a release candidate -c
			
 
				-	 */
			
 
				-	void getVersion(int* a,int* b,int* c);
			
 
				-
			
 
				-	//! returns the supported decoders (eg. Theora, AVFoundation...)
			
 
				-	std::vector<std::string> getSupportedDecoders();
			
 
				-};
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/include/theoraplayer/TheoraWorkerThread.h
+++ b/drivers/theoraplayer/include/theoraplayer/TheoraWorkerThread.h
@@ -1,32 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifndef _TheoraWorkerThread_h
			
 
				-#define _TheoraWorkerThread_h
			
 
				-
			
 
				-#include "TheoraAsync.h"
			
 
				-
			
 
				-class TheoraVideoClip;
			
 
				-
			
 
				-/**
			
 
				-	This is the worker thread, requests work from TheoraVideoManager
			
 
				-	and decodes assigned TheoraVideoClip objects
			
 
				-*/
			
 
				-class TheoraWorkerThread : public TheoraThread
			
 
				-{
			
 
				-	TheoraVideoClip* mClip;
			
 
				-public:
			
 
				-	TheoraWorkerThread();
			
 
				-	~TheoraWorkerThread();
			
 
				-
			
 
				-	TheoraVideoClip* getAssignedClip() { return mClip; }
			
 
				-
			
 
				-    //! Main Thread Body - do not call directly!
			
 
				-	void execute();
			
 
				-};
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/AVFoundation/TheoraVideoClip_AVFoundation.h
+++ b/drivers/theoraplayer/src/AVFoundation/TheoraVideoClip_AVFoundation.h
@@ -1,47 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#if defined(__AVFOUNDATION) && !defined(_TheoraVideoClip_AVFoundation_h)
			
 
				-#define _TheoraVideoClip_AVFoundation_h
			
 
				-
			
 
				-#include "TheoraAudioPacketQueue.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-
			
 
				-#ifndef AVFOUNDATION_CLASSES_DEFINED
			
 
				-class AVAssetReader;
			
 
				-class AVAssetReaderTrackOutput;
			
 
				-#endif
			
 
				-
			
 
				-class TheoraVideoClip_AVFoundation : public TheoraVideoClip, public TheoraAudioPacketQueue
			
 
				-{
			
 
				-protected:
			
 
				-	bool mLoaded;
			
 
				-	int mFrameNumber;
			
 
				-	AVAssetReader* mReader;
			
 
				-	AVAssetReaderTrackOutput *mOutput, *mAudioOutput;
			
 
				-	unsigned int mReadAudioSamples;
			
 
				-	
			
 
				-	void unload();
			
 
				-	void doSeek();
			
 
				-public:
			
 
				-	TheoraVideoClip_AVFoundation(TheoraDataSource* data_source,
			
 
				-								 TheoraOutputMode output_mode,
			
 
				-								 int nPrecachedFrames,
			
 
				-								 bool usePower2Stride);
			
 
				-	~TheoraVideoClip_AVFoundation();
			
 
				-	
			
 
				-	bool _readData();
			
 
				-	bool decodeNextFrame();
			
 
				-	void _restart();
			
 
				-	void load(TheoraDataSource* source);
			
 
				-	float decodeAudio();
			
 
				-	void decodedAudioCheck();
			
 
				-	std::string getDecoderName() { return "AVFoundation"; }
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/AVFoundation/TheoraVideoClip_AVFoundation.mm
+++ b/drivers/theoraplayer/src/AVFoundation/TheoraVideoClip_AVFoundation.mm
@@ -1,457 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifdef __AVFOUNDATION
			
 
				-#define AVFOUNDATION_CLASSES_DEFINED
			
 
				-#import <AVFoundation/AVFoundation.h>
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraException.h"
			
 
				-#include "TheoraTimer.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraVideoClip_AVFoundation.h"
			
 
				-#include "TheoraPixelTransform.h"
			
 
				-
			
 
				-#ifdef _AVFOUNDATION_BGRX
			
 
				-// a fast function developed to use kernel byte swapping calls to optimize alpha decoding.
			
 
				-// In AVFoundation, BGRX mode conversion is prefered to YUV conversion because apple's YUV
			
 
				-// conversion on iOS seems to run faster than libtheoraplayer's implementation
			
 
				-// This may change in the future with more optimizations to libtheoraplayers's YUV conversion
			
 
				-// code, making this function obsolete.
			
 
				-static void bgrx2rgba(unsigned char* dest, int w, int h, struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	unsigned register int a;
			
 
				-	unsigned int *dst = (unsigned int*) dest, *dstEnd;
			
 
				-	unsigned char* src = t->raw;
			
 
				-	int y, x, ax;
			
 
				-	
			
 
				-	for (y = 0; y < h; ++y, src += t->rawStride)
			
 
				-	{
			
 
				-		for (x = 0, ax = w * 4, dstEnd = dst + w; dst != dstEnd; x += 4, ax += 4, ++dst)
			
 
				-		{
			
 
				-            // use the full alpha range here because the Y channel has already been converted
			
 
				-            // to RGB and that's in [0, 255] range.
			
 
				-			a = src[ax];
			
 
				-            *dst = (OSReadSwapInt32(src, x) >> 8) | (a << 24);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static CVPlanarPixelBufferInfo_YCbCrPlanar getYUVStruct(void* src)
			
 
				-{
			
 
				-	CVPlanarPixelBufferInfo_YCbCrPlanar* bigEndianYuv = (CVPlanarPixelBufferInfo_YCbCrPlanar*) src;
			
 
				-	CVPlanarPixelBufferInfo_YCbCrPlanar yuv;
			
 
				-	yuv.componentInfoY.offset = OSSwapInt32(bigEndianYuv->componentInfoY.offset);
			
 
				-	yuv.componentInfoY.rowBytes = OSSwapInt32(bigEndianYuv->componentInfoY.rowBytes);
			
 
				-	yuv.componentInfoCb.offset = OSSwapInt32(bigEndianYuv->componentInfoCb.offset);
			
 
				-	yuv.componentInfoCb.rowBytes = OSSwapInt32(bigEndianYuv->componentInfoCb.rowBytes);
			
 
				-	yuv.componentInfoCr.offset = OSSwapInt32(bigEndianYuv->componentInfoCr.offset);
			
 
				-	yuv.componentInfoCr.rowBytes = OSSwapInt32(bigEndianYuv->componentInfoCr.rowBytes);
			
 
				-	return yuv;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip_AVFoundation::TheoraVideoClip_AVFoundation(TheoraDataSource* data_source,
			
 
				-											   TheoraOutputMode output_mode,
			
 
				-											   int nPrecachedFrames,
			
 
				-											   bool usePower2Stride):
			
 
				-	TheoraVideoClip(data_source, output_mode, nPrecachedFrames, usePower2Stride),
			
 
				-	TheoraAudioPacketQueue()
			
 
				-{
			
 
				-	mLoaded = 0;
			
 
				-	mReader = NULL;
			
 
				-	mOutput = mAudioOutput = NULL;
			
 
				-	mReadAudioSamples = mAudioFrequency = mNumAudioChannels = 0;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip_AVFoundation::~TheoraVideoClip_AVFoundation()
			
 
				-{
			
 
				-	unload();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_AVFoundation::unload()
			
 
				-{
			
 
				-	if (mOutput != NULL || mAudioOutput != NULL || mReader != NULL)
			
 
				-	{
			
 
				-		NSAutoreleasePool* pool = [[NSAutoreleasePool alloc] init];
			
 
				-
			
 
				-		if (mOutput != NULL)
			
 
				-		{
			
 
				-			[mOutput release];
			
 
				-			mOutput = NULL;
			
 
				-		}
			
 
				-		
			
 
				-		if (mAudioOutput)
			
 
				-		{
			
 
				-			[mAudioOutput release];
			
 
				-			mAudioOutput = NULL;
			
 
				-		}
			
 
				-		
			
 
				-		if (mReader != NULL)
			
 
				-		{
			
 
				-			[mReader release];
			
 
				-			mReader = NULL;
			
 
				-		}
			
 
				-		
			
 
				-		[pool release];
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_AVFoundation::_readData()
			
 
				-{
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_AVFoundation::decodeNextFrame()
			
 
				-{
			
 
				-	if (mReader == NULL || mEndOfFile) return 0;
			
 
				-	AVAssetReaderStatus status = [mReader status];
			
 
				-	if (status == AVAssetReaderStatusFailed)
			
 
				-	{
			
 
				-		// This can happen on iOS when you suspend the app... Only happens on the device, iOS simulator seems to work fine.
			
 
				-		th_writelog("AVAssetReader reading failed, restarting...");
			
 
				-
			
 
				-		mSeekFrame = mTimer->getTime() * mFPS;
			
 
				-		// just in case
			
 
				-		if (mSeekFrame < 0) mSeekFrame = 0;
			
 
				-		if (mSeekFrame > mDuration * mFPS - 1) mSeekFrame = mDuration * mFPS - 1;
			
 
				-		_restart();
			
 
				-		status = [mReader status];
			
 
				-		if (status == AVAssetReaderStatusFailed)
			
 
				-		{
			
 
				-			th_writelog("AVAssetReader restart failed!");
			
 
				-			return 0;
			
 
				-		}
			
 
				-		th_writelog("AVAssetReader restart succeeded!");
			
 
				-	}
			
 
				-
			
 
				-	TheoraVideoFrame* frame = mFrameQueue->requestEmptyFrame();
			
 
				-	if (!frame) return 0;
			
 
				-
			
 
				-	CMSampleBufferRef sampleBuffer = NULL;
			
 
				-	NSAutoreleasePool* pool = NULL;
			
 
				-	CMTime presentationTime;
			
 
				-	
			
 
				-	if (mAudioInterface) decodeAudio();
			
 
				-	
			
 
				-	if (status == AVAssetReaderStatusReading)
			
 
				-	{
			
 
				-		pool = [[NSAutoreleasePool alloc] init];
			
 
				-		
			
 
				-		while ((sampleBuffer = [mOutput copyNextSampleBuffer]))
			
 
				-		{
			
 
				-			presentationTime = CMSampleBufferGetOutputPresentationTimeStamp(sampleBuffer);
			
 
				-			frame->mTimeToDisplay = (float) CMTimeGetSeconds(presentationTime);
			
 
				-			frame->mIteration = mIteration;
			
 
				-			frame->_setFrameNumber(mFrameNumber);
			
 
				-			++mFrameNumber;
			
 
				-			if (frame->mTimeToDisplay < mTimer->getTime() && !mRestarted && mFrameNumber % 16 != 0)
			
 
				-			{
			
 
				-				// %16 operation is here to prevent a playback halt during video playback if the decoder can't keep up with demand.
			
 
				-#ifdef _DEBUG
			
 
				-				th_writelog(mName + ": pre-dropped frame " + str(mFrameNumber - 1));
			
 
				-#endif
			
 
				-				++mNumDisplayedFrames;
			
 
				-				++mNumDroppedFrames;
			
 
				-				CMSampleBufferInvalidate(sampleBuffer);
			
 
				-				CFRelease(sampleBuffer);
			
 
				-				sampleBuffer = NULL;
			
 
				-				continue; // drop frame
			
 
				-			}
			
 
				-
			
 
				-			CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
			
 
				-			CVPixelBufferLockBaseAddress(imageBuffer, 0);
			
 
				-			void *baseAddress = CVPixelBufferGetBaseAddress(imageBuffer);
			
 
				-			
			
 
				-			mStride = CVPixelBufferGetBytesPerRow(imageBuffer);
			
 
				-			size_t width = CVPixelBufferGetWidth(imageBuffer);
			
 
				-			size_t height = CVPixelBufferGetHeight(imageBuffer);
			
 
				-
			
 
				-			TheoraPixelTransform t;
			
 
				-			memset(&t, 0, sizeof(TheoraPixelTransform));
			
 
				-#ifdef _AVFOUNDATION_BGRX
			
 
				-			if (mOutputMode == TH_BGRX || mOutputMode == TH_RGBA)
			
 
				-			{
			
 
				-				t.raw = (unsigned char*) baseAddress;
			
 
				-				t.rawStride = mStride;
			
 
				-			}
			
 
				-			else
			
 
				-#endif
			
 
				-			{
			
 
				-				CVPlanarPixelBufferInfo_YCbCrPlanar yuv = getYUVStruct(baseAddress);
			
 
				-				
			
 
				-				t.y = (unsigned char*) baseAddress + yuv.componentInfoY.offset;  t.yStride = yuv.componentInfoY.rowBytes;
			
 
				-				t.u = (unsigned char*) baseAddress + yuv.componentInfoCb.offset; t.uStride = yuv.componentInfoCb.rowBytes;
			
 
				-				t.v = (unsigned char*) baseAddress + yuv.componentInfoCr.offset; t.vStride = yuv.componentInfoCr.rowBytes;
			
 
				-			}
			
 
				-#ifdef _AVFOUNDATION_BGRX
			
 
				-			if (mOutputMode == TH_RGBA)
			
 
				-			{
			
 
				-				for (int i = 0; i < 1000; ++i)
			
 
				-					bgrx2rgba(frame->getBuffer(), mWidth / 2, mHeight, &t);
			
 
				-				frame->mReady = true;
			
 
				-			}
			
 
				-			else
			
 
				-#endif
			
 
				-			frame->decode(&t);
			
 
				-
			
 
				-			CVPixelBufferUnlockBaseAddress(imageBuffer, 0);
			
 
				-			CMSampleBufferInvalidate(sampleBuffer);
			
 
				-			CFRelease(sampleBuffer);
			
 
				-
			
 
				-			break; // TODO - should this really be a while loop instead of an if block?
			
 
				-		}
			
 
				-	}
			
 
				-	if (pool) [pool release];
			
 
				-
			
 
				-	if (!frame->mReady) // in case the frame wasn't used
			
 
				-	{
			
 
				-		frame->mInUse = 0;
			
 
				-	}
			
 
				-
			
 
				-	if (sampleBuffer == NULL && mReader.status == AVAssetReaderStatusCompleted) // other cases could be app suspended
			
 
				-	{
			
 
				-		if (mAutoRestart)
			
 
				-        {
			
 
				-            ++mIteration;
			
 
				-			_restart();
			
 
				-        }
			
 
				-		else
			
 
				-		{
			
 
				-			unload();
			
 
				-			mEndOfFile = true;
			
 
				-		}
			
 
				-		return 0;
			
 
				-	}
			
 
				-	
			
 
				-	
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_AVFoundation::_restart()
			
 
				-{
			
 
				-	mEndOfFile = false;
			
 
				-	unload();
			
 
				-	load(mStream);
			
 
				-	mRestarted = true;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_AVFoundation::load(TheoraDataSource* source)
			
 
				-{
			
 
				-	mStream = source;
			
 
				-	mFrameNumber = 0;
			
 
				-	mEndOfFile = false;
			
 
				-	TheoraFileDataSource* fileDataSource = dynamic_cast<TheoraFileDataSource*>(source);
			
 
				-	std::string filename;
			
 
				-	if (fileDataSource != NULL) filename = fileDataSource->getFilename();
			
 
				-	else
			
 
				-	{
			
 
				-		TheoraMemoryFileDataSource* memoryDataSource = dynamic_cast<TheoraMemoryFileDataSource*>(source);
			
 
				-		if (memoryDataSource != NULL) filename = memoryDataSource->getFilename();
			
 
				-		else throw TheoraGenericException("Unable to load MP4 file");
			
 
				-	}
			
 
				-	
			
 
				-	NSAutoreleasePool* pool = [[NSAutoreleasePool alloc] init];
			
 
				-	NSString* path = [NSString stringWithUTF8String:filename.c_str()];
			
 
				-	NSError* err;
			
 
				-	NSURL *url = [NSURL fileURLWithPath:path];
			
 
				-	AVAsset* asset = [[AVURLAsset alloc] initWithURL:url options:nil];
			
 
				-	mReader = [[AVAssetReader alloc] initWithAsset:asset error:&err];
			
 
				-	NSArray* tracks = [asset tracksWithMediaType:AVMediaTypeVideo];
			
 
				-	if ([tracks count] == 0)
			
 
				-		throw TheoraGenericException("Unable to open video file: " + filename);
			
 
				-	AVAssetTrack *videoTrack = [tracks objectAtIndex:0];
			
 
				-
			
 
				-	NSArray* audioTracks = [asset tracksWithMediaType:AVMediaTypeAudio];
			
 
				-	if (audio_track >= audioTracks.count)
			
 
				-		audio_track = 0;
			
 
				-	AVAssetTrack *audioTrack = audioTracks.count > 0 ? [audioTracks objectAtIndex:audio_track] : NULL;
			
 
				-	printf("*********** using audio track %i\n", audio_track);
			
 
				-	
			
 
				-#ifdef _AVFOUNDATION_BGRX
			
 
				-	bool yuv_output = (mOutputMode != TH_BGRX && mOutputMode != TH_RGBA);
			
 
				-#else
			
 
				-	bool yuv_output = true;
			
 
				-#endif
			
 
				-	
			
 
				-	NSDictionary *videoOptions = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:(yuv_output) ? kCVPixelFormatType_420YpCbCr8Planar : kCVPixelFormatType_32BGRA], kCVPixelBufferPixelFormatTypeKey, nil];
			
 
				-
			
 
				-	mOutput = [[AVAssetReaderTrackOutput alloc] initWithTrack:videoTrack outputSettings:videoOptions];
			
 
				-	[mReader addOutput:mOutput];
			
 
				-	if ([mOutput respondsToSelector:@selector(setAlwaysCopiesSampleData:)]) // Not supported on iOS versions older than 5.0
			
 
				-		mOutput.alwaysCopiesSampleData = NO;
			
 
				-
			
 
				-	mFPS = videoTrack.nominalFrameRate;
			
 
				-	mWidth = mSubFrameWidth = mStride = videoTrack.naturalSize.width;
			
 
				-	mHeight = mSubFrameHeight = videoTrack.naturalSize.height;
			
 
				-	mFrameDuration = 1.0f / mFPS;
			
 
				-	mDuration = (float) CMTimeGetSeconds(asset.duration);
			
 
				-	if (mFrameQueue == NULL)
			
 
				-	{
			
 
				-		mFrameQueue = new TheoraFrameQueue(this);
			
 
				-		mFrameQueue->setSize(mNumPrecachedFrames);
			
 
				-	}
			
 
				-
			
 
				-	if (mSeekFrame != -1)
			
 
				-	{
			
 
				-		mFrameNumber = mSeekFrame;
			
 
				-		[mReader setTimeRange: CMTimeRangeMake(CMTimeMakeWithSeconds(mSeekFrame / mFPS, 1), kCMTimePositiveInfinity)];
			
 
				-	}
			
 
				-	if (audioTrack)
			
 
				-	{
			
 
				-		TheoraAudioInterfaceFactory* audio_factory = TheoraVideoManager::getSingleton().getAudioInterfaceFactory();
			
 
				-		if (audio_factory)
			
 
				-		{
			
 
				-			NSDictionary *audioOptions = [NSDictionary dictionaryWithObjectsAndKeys:
			
 
				-										  [NSNumber numberWithInt:kAudioFormatLinearPCM], AVFormatIDKey,
			
 
				-										  [NSNumber numberWithBool:NO], AVLinearPCMIsNonInterleaved,
			
 
				-										  [NSNumber numberWithBool:NO], AVLinearPCMIsBigEndianKey,
			
 
				-										  [NSNumber numberWithBool:YES], AVLinearPCMIsFloatKey,
			
 
				-										  [NSNumber numberWithInt:32], AVLinearPCMBitDepthKey,
			
 
				-										  nil];
			
 
				-
			
 
				-			mAudioOutput = [[AVAssetReaderTrackOutput alloc] initWithTrack:audioTrack outputSettings:audioOptions];
			
 
				-			[mReader addOutput:mAudioOutput];
			
 
				-			if ([mAudioOutput respondsToSelector:@selector(setAlwaysCopiesSampleData:)]) // Not supported on iOS versions older than 5.0
			
 
				-				mAudioOutput.alwaysCopiesSampleData = NO;
			
 
				-			
			
 
				-			NSArray* desclst = audioTrack.formatDescriptions;
			
 
				-			CMAudioFormatDescriptionRef desc = (CMAudioFormatDescriptionRef) [desclst objectAtIndex:0];
			
 
				-			const AudioStreamBasicDescription* audioDesc = CMAudioFormatDescriptionGetStreamBasicDescription(desc);
			
 
				-			mAudioFrequency = (unsigned int) audioDesc->mSampleRate;
			
 
				-			mNumAudioChannels = audioDesc->mChannelsPerFrame;
			
 
				-			
			
 
				-			if (mSeekFrame != -1)
			
 
				-			{
			
 
				-				mReadAudioSamples = mFrameNumber * (mAudioFrequency * mNumAudioChannels) / mFPS;
			
 
				-			}
			
 
				-			else mReadAudioSamples = 0;
			
 
				-
			
 
				-			if (mAudioInterface == NULL)
			
 
				-				setAudioInterface(audio_factory->createInstance(this, mNumAudioChannels, mAudioFrequency));
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-#ifdef _DEBUG
			
 
				-	else if (!mLoaded)
			
 
				-	{
			
 
				-		th_writelog("-----\nwidth: " + str(mWidth) + ", height: " + str(mHeight) + ", fps: " + str((int) getFPS()));
			
 
				-		th_writelog("duration: " + strf(mDuration) + " seconds\n-----");
			
 
				-	}
			
 
				-#endif
			
 
				-	[mReader startReading];
			
 
				-	[pool release];
			
 
				-	mLoaded = true;
			
 
				-}
			
 
				- 
			
 
				-void TheoraVideoClip_AVFoundation::decodedAudioCheck()
			
 
				-{
			
 
				-	if (!mAudioInterface || mTimer->isPaused()) return;
			
 
				-	
			
 
				-	mAudioMutex->lock();
			
 
				-	flushAudioPackets(mAudioInterface);
			
 
				-	mAudioMutex->unlock();
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip_AVFoundation::decodeAudio()
			
 
				-{
			
 
				-	if (mRestarted) return -1;
			
 
				-
			
 
				-	if (mReader == NULL || mEndOfFile) return 0;
			
 
				-	AVAssetReaderStatus status = [mReader status];
			
 
				-
			
 
				-	if (mAudioOutput)
			
 
				-	{
			
 
				-		CMSampleBufferRef sampleBuffer = NULL;
			
 
				-		NSAutoreleasePool* pool = NULL;
			
 
				-		bool mutexLocked = 0;
			
 
				-
			
 
				-		float factor = 1.0f / (mAudioFrequency * mNumAudioChannels);
			
 
				-		float videoTime = (float) mFrameNumber / mFPS;
			
 
				-		float min = mFrameQueue->getSize() / mFPS + 1.0f;
			
 
				-		
			
 
				-		if (status == AVAssetReaderStatusReading)
			
 
				-		{
			
 
				-			pool = [[NSAutoreleasePool alloc] init];
			
 
				-
			
 
				-			// always buffer up of audio ahead of the frames
			
 
				-			while (mReadAudioSamples * factor - videoTime < min)
			
 
				-			{
			
 
				-				if ((sampleBuffer = [mAudioOutput copyNextSampleBuffer]))
			
 
				-				{
			
 
				-					AudioBufferList audioBufferList;
			
 
				-					
			
 
				-					CMBlockBufferRef blockBuffer = NULL;
			
 
				-					CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer(sampleBuffer, NULL, &audioBufferList, sizeof(audioBufferList), NULL, NULL, 0, &blockBuffer);
			
 
				-					
			
 
				-					for (int y = 0; y < audioBufferList.mNumberBuffers; ++y)
			
 
				-					{
			
 
				-						AudioBuffer audioBuffer = audioBufferList.mBuffers[y];
			
 
				-						float *frame = (float*) audioBuffer.mData;
			
 
				-
			
 
				-						if (!mutexLocked)
			
 
				-						{
			
 
				-							mAudioMutex->lock();
			
 
				-							mutexLocked = 1;
			
 
				-						}
			
 
				-						addAudioPacket(frame, audioBuffer.mDataByteSize / (mNumAudioChannels * sizeof(float)), mAudioGain);
			
 
				-						
			
 
				-						mReadAudioSamples += audioBuffer.mDataByteSize / (sizeof(float));
			
 
				-					}
			
 
				-
			
 
				-					CFRelease(blockBuffer);
			
 
				-					CMSampleBufferInvalidate(sampleBuffer);
			
 
				-					CFRelease(sampleBuffer);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					[mAudioOutput release];
			
 
				-					mAudioOutput = nil;
			
 
				-					break;
			
 
				-				}
			
 
				-			}
			
 
				-			[pool release];
			
 
				-		}
			
 
				-		if (mutexLocked) mAudioMutex->unlock();
			
 
				-	}
			
 
				-	
			
 
				-	return -1;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_AVFoundation::doSeek()
			
 
				-{
			
 
				-#if _DEBUG
			
 
				-	th_writelog(mName + " [seek]: seeking to frame " + str(mSeekFrame));
			
 
				-#endif
			
 
				-	int frame;
			
 
				-	float time = mSeekFrame / getFPS();
			
 
				-	mTimer->seek(time);
			
 
				-	bool paused = mTimer->isPaused();
			
 
				-	if (!paused) mTimer->pause(); // pause until seeking is done
			
 
				-	
			
 
				-	mEndOfFile = false;
			
 
				-	mRestarted = false;
			
 
				-	
			
 
				-    resetFrameQueue();
			
 
				-	unload();
			
 
				-	load(mStream);
			
 
				-
			
 
				-	if (mAudioInterface)
			
 
				-	{
			
 
				-		mAudioMutex->lock();
			
 
				-		destroyAllAudioPackets();
			
 
				-		mAudioMutex->unlock();
			
 
				-	}
			
 
				-
			
 
				-	if (!paused) mTimer->play();
			
 
				-	mSeekFrame = -1;
			
 
				-}
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/FFmpeg/TheoraVideoClip_FFmpeg.cpp
+++ b/drivers/theoraplayer/src/FFmpeg/TheoraVideoClip_FFmpeg.cpp
@@ -1,439 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifdef __FFMPEG
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraException.h"
			
 
				-#include "TheoraTimer.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraVideoClip_FFmpeg.h"
			
 
				-#include "TheoraPixelTransform.h"
			
 
				-
			
 
				-#define READ_BUFFER_SIZE 4096
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-#define __STDC_CONSTANT_MACROS
			
 
				-#ifdef _STDINT_H
			
 
				-#undef _STDINT_H
			
 
				-#endif
			
 
				-# include <stdint.h>
			
 
				-#endif
			
 
				-
			
 
				-#define _FFMPEG_DEBUG
			
 
				-
			
 
				-extern "C"
			
 
				-{
			
 
				-#include <libavcodec/avcodec.h>
			
 
				-#include <libavformat/avformat.h>
			
 
				-#include "libavutil/avassert.h"
			
 
				-}
			
 
				-
			
 
				-static bool ffmpegInitialised = 0;
			
 
				-
			
 
				-static int readFunction(void* data, uint8_t* buf, int buf_size)
			
 
				-{
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog("reading " + str(buf_size) + " bytes");
			
 
				-#endif
			
 
				-
			
 
				-	TheoraDataSource* src = (TheoraDataSource*) data;
			
 
				-	return src->read(buf, buf_size);
			
 
				-}
			
 
				-
			
 
				-static int64_t seekFunction(void* data, int64_t offset, int whence)
			
 
				-{
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog("seeking: offset = " + str((long) offset) + ", whence = " + str(whence));
			
 
				-#endif
			
 
				-
			
 
				-	TheoraDataSource* src = (TheoraDataSource*) data;
			
 
				-    if (whence == AVSEEK_SIZE)
			
 
				-		return src->size();
			
 
				-	else if (whence == SEEK_SET)
			
 
				-		src->seek((long) offset);
			
 
				-	else if (whence == SEEK_END)
			
 
				-		src->seek(src->size() - (long) offset);
			
 
				-    return src->tell();
			
 
				-}
			
 
				-
			
 
				-static void avlog_theoraplayer(void* p, int level, const char* fmt, va_list vargs)
			
 
				-{
			
 
				-	th_writelog(fmt);
			
 
				-	static char logstr[2048];
			
 
				-	vsprintf(logstr, fmt, vargs);
			
 
				-	th_writelog("ffmpeg: " + std::string(logstr));
			
 
				-}
			
 
				-
			
 
				-
			
 
				-std::string text;
			
 
				-
			
 
				-static void _log(const char* s)
			
 
				-{
			
 
				-	text += s;
			
 
				-//	th_writelog(text);
			
 
				-//	text = "";
			
 
				-}
			
 
				-
			
 
				-static void _log(const char c)
			
 
				-{
			
 
				-	char s[2] = {c, 0};
			
 
				-	_log(s);
			
 
				-}
			
 
				-
			
 
				-static const AVCodec *next_codec_for_id(enum AVCodecID id, const AVCodec *prev,
			
 
				-                                        int encoder)
			
 
				-{
			
 
				-    while ((prev = av_codec_next(prev))) {
			
 
				-        if (prev->id == id &&
			
 
				-            (encoder ? av_codec_is_encoder(prev) : av_codec_is_decoder(prev)))
			
 
				-            return prev;
			
 
				-    }
			
 
				-    return NULL;
			
 
				-}
			
 
				-
			
 
				-static int compare_codec_desc(const void *a, const void *b)
			
 
				-{
			
 
				-    const AVCodecDescriptor **da = (const AVCodecDescriptor **) a;
			
 
				-    const AVCodecDescriptor **db = (const AVCodecDescriptor **) b;
			
 
				-	
			
 
				-    return (*da)->type != (*db)->type ? (*da)->type - (*db)->type :
			
 
				-	strcmp((*da)->name, (*db)->name);
			
 
				-}
			
 
				-
			
 
				-static unsigned get_codecs_sorted(const AVCodecDescriptor ***rcodecs)
			
 
				-{
			
 
				-    const AVCodecDescriptor *desc = NULL;
			
 
				-    const AVCodecDescriptor **codecs;
			
 
				-    unsigned nb_codecs = 0, i = 0;
			
 
				-	
			
 
				-    while ((desc = avcodec_descriptor_next(desc)))
			
 
				-        ++nb_codecs;
			
 
				-    if (!(codecs = (const AVCodecDescriptor**) av_calloc(nb_codecs, sizeof(*codecs)))) {
			
 
				-        av_log(NULL, AV_LOG_ERROR, "Out of memory\n");
			
 
				-        exit(1);
			
 
				-    }
			
 
				-    desc = NULL;
			
 
				-    while ((desc = avcodec_descriptor_next(desc)))
			
 
				-        codecs[i++] = desc;
			
 
				-    av_assert0(i == nb_codecs);
			
 
				-    qsort(codecs, nb_codecs, sizeof(*codecs), compare_codec_desc);
			
 
				-    *rcodecs = codecs;
			
 
				-    return nb_codecs;
			
 
				-}
			
 
				-
			
 
				-static char get_media_type_char(enum AVMediaType type)
			
 
				-{
			
 
				-    switch (type) {
			
 
				-        case AVMEDIA_TYPE_VIDEO:    return 'V';
			
 
				-        case AVMEDIA_TYPE_AUDIO:    return 'A';
			
 
				-        case AVMEDIA_TYPE_DATA:     return 'D';
			
 
				-        case AVMEDIA_TYPE_SUBTITLE: return 'S';
			
 
				-        case AVMEDIA_TYPE_ATTACHMENT:return 'T';
			
 
				-        default:                    return '?';
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-static void print_codecs_for_id(enum AVCodecID id, int encoder)
			
 
				-{
			
 
				-    const AVCodec *codec = NULL;
			
 
				-	
			
 
				-    _log(encoder ? "encoders" : "decoders");
			
 
				-	
			
 
				-    while ((codec = next_codec_for_id(id, codec, encoder)))
			
 
				-        _log(codec->name);
			
 
				-	
			
 
				-    _log(")");
			
 
				-}
			
 
				-
			
 
				-int show_codecs(void *optctx, const char *opt, const char *arg)
			
 
				-{
			
 
				-    const AVCodecDescriptor **codecs;
			
 
				-    unsigned i, nb_codecs = get_codecs_sorted(&codecs);
			
 
				-	
			
 
				-	char tmp[1024];
			
 
				-    th_writelog("Codecs:\n"
			
 
				-           " D..... = Decoding supported\n"
			
 
				-           " .E.... = Encoding supported\n"
			
 
				-           " ..V... = Video codec\n"
			
 
				-           " ..A... = Audio codec\n"
			
 
				-           " ..S... = Subtitle codec\n"
			
 
				-           " ...I.. = Intra frame-only codec\n"
			
 
				-           " ....L. = Lossy compression\n"
			
 
				-           " .....S = Lossless compression\n"
			
 
				-           " -------\n");
			
 
				-    for (i = 0; i < nb_codecs; ++i) {
			
 
				-        const AVCodecDescriptor *desc = codecs[i];
			
 
				-        const AVCodec *codec = NULL;
			
 
				-		
			
 
				-        _log(" ");
			
 
				-        _log(avcodec_find_decoder(desc->id) ? "D" : ".");
			
 
				-        _log(avcodec_find_encoder(desc->id) ? "E" : ".");
			
 
				-		
			
 
				-        _log(get_media_type_char(desc->type));
			
 
				-        _log((desc->props & AV_CODEC_PROP_INTRA_ONLY) ? "I" : ".");
			
 
				-        _log((desc->props & AV_CODEC_PROP_LOSSY)      ? "L" : ".");
			
 
				-        _log((desc->props & AV_CODEC_PROP_LOSSLESS)   ? "S" : ".");
			
 
				-		
			
 
				-		
			
 
				-        sprintf(tmp, " %-20s %s", desc->name, desc->long_name ? desc->long_name : "");
			
 
				-		
			
 
				-		_log(tmp);
			
 
				-        /* print decoders/encoders when there's more than one or their
			
 
				-         * names are different from codec name */
			
 
				-        while ((codec = next_codec_for_id(desc->id, codec, 0))) {
			
 
				-            if (strcmp(codec->name, desc->name)) {
			
 
				-                print_codecs_for_id(desc->id, 0);
			
 
				-                break;
			
 
				-            }
			
 
				-        }
			
 
				-        codec = NULL;
			
 
				-        while ((codec = next_codec_for_id(desc->id, codec, 1))) {
			
 
				-            if (strcmp(codec->name, desc->name)) {
			
 
				-                print_codecs_for_id(desc->id, 1);
			
 
				-                break;
			
 
				-            }
			
 
				-        }
			
 
				-		_log("\n");
			
 
				-    }
			
 
				-    av_free(codecs);
			
 
				-	
			
 
				-	av_log(0, 0, "%s", text.c_str());
			
 
				-    return 0;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip_FFmpeg::TheoraVideoClip_FFmpeg(TheoraDataSource* data_source,
			
 
				-														 TheoraOutputMode output_mode,
			
 
				-														 int nPrecachedFrames,
			
 
				-														 bool usePower2Stride):
			
 
				-								 						 TheoraVideoClip(data_source, output_mode, nPrecachedFrames, usePower2Stride),
			
 
				-														 TheoraAudioPacketQueue()
			
 
				-{
			
 
				-	mFormatContext = NULL;
			
 
				-	mCodecContext = NULL;
			
 
				-	mCodec = NULL;
			
 
				-	mFrame = NULL;
			
 
				-	mVideoStreamIndex = -1;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip_FFmpeg::~TheoraVideoClip_FFmpeg()
			
 
				-{
			
 
				-	unload();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_FFmpeg::load(TheoraDataSource* source)
			
 
				-{
			
 
				-	mVideoStreamIndex = -1;
			
 
				-	mFrameNumber = 0;
			
 
				-	AVDictionary* optionsDict = NULL;
			
 
				-	
			
 
				-	if (!ffmpegInitialised)
			
 
				-	{
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-		th_writelog("Initializing ffmpeg");
			
 
				-#endif
			
 
				-		th_writelog("avcodec version: " + str(avcodec_version()));
			
 
				-		av_register_all();
			
 
				-		av_log_set_level(AV_LOG_DEBUG);
			
 
				-		av_log_set_callback(avlog_theoraplayer);
			
 
				-		ffmpegInitialised = 1;
			
 
				-		//show_codecs(0, 0, 0);
			
 
				-	}
			
 
				-	
			
 
				-	mInputBuffer = (unsigned char*) av_malloc(READ_BUFFER_SIZE);
			
 
				-	mAvioContext = avio_alloc_context(mInputBuffer, READ_BUFFER_SIZE, 0, source, &readFunction, NULL, &seekFunction);
			
 
				-	
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": avio context created");
			
 
				-#endif
			
 
				-
			
 
				-	mFormatContext = avformat_alloc_context();
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": avformat context created");
			
 
				-#endif
			
 
				-	mFormatContext->pb = mAvioContext;
			
 
				-	
			
 
				-	int err;
			
 
				-	if ((err = avformat_open_input(&mFormatContext, "", NULL, NULL)) != 0)
			
 
				-	{
			
 
				-		th_writelog(mName + ": avformat input opening failed!");
			
 
				-		th_writelog(mName + ": error_code: " + str(err));
			
 
				-		return;
			
 
				-	}
			
 
				-	
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": avformat input opened");
			
 
				-#endif
			
 
				-	
			
 
				-	// Retrieve stream information
			
 
				-	if (avformat_find_stream_info(mFormatContext, NULL) < 0)
			
 
				-		return; // Couldn't find stream information
			
 
				-	
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": got stream info");
			
 
				-#endif
			
 
				-	
			
 
				-	// Dump information about file onto standard error
			
 
				-	//	av_dump_format(mFormatContext, 0, "", 0);
			
 
				-	
			
 
				-	// Find the first video stream
			
 
				-	for (int i = 0; i < mFormatContext->nb_streams; ++i)
			
 
				-	{
			
 
				-		if(mFormatContext->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO)
			
 
				-		{
			
 
				-			mVideoStreamIndex = i;
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-	if (mVideoStreamIndex == -1)
			
 
				-		return; // Didn't find a video stream
			
 
				-
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": Found video stream at index " + str(mVideoStreamIndex));
			
 
				-#endif
			
 
				-
			
 
				-	// Get a pointer to the codec context for the video stream
			
 
				-	mCodecContext = mFormatContext->streams[mVideoStreamIndex]->codec;
			
 
				-	
			
 
				-	// Find the decoder for the video stream
			
 
				-	mCodec = avcodec_find_decoder(mCodecContext->codec_id);
			
 
				-	if (mCodec == NULL)
			
 
				-	{
			
 
				-		th_writelog("Unsupported codec!");
			
 
				-		return; // Codec not found
			
 
				-	}
			
 
				-	// Open codec
			
 
				-	if(avcodec_open2(mCodecContext, mCodec, &optionsDict) < 0)
			
 
				-		return; // Could not open codec
			
 
				-	
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": Codec opened");
			
 
				-#endif
			
 
				-
			
 
				-	
			
 
				-	mFrame = avcodec_alloc_frame();
			
 
				-	
			
 
				-#ifdef _FFMPEG_DEBUG
			
 
				-	th_writelog(mName + ": Frame allocated");
			
 
				-#endif
			
 
				-		
			
 
				-	//AVRational rational = mCodecContext->time_base;
			
 
				-
			
 
				-	mFPS = 25; //TODOOOOOO!!!
			
 
				-	
			
 
				-	mWidth = mStride = mCodecContext->width;
			
 
				-	mHeight = mCodecContext->height;
			
 
				-	mFrameDuration = 1.0f / mFPS;
			
 
				-	mDuration = mFormatContext->duration / AV_TIME_BASE;
			
 
				-	
			
 
				-	if (mFrameQueue == NULL) // todo - why is this set in the backend class? it should be set in the base class, check other backends as well
			
 
				-	{
			
 
				-		mFrameQueue = new TheoraFrameQueue(this);
			
 
				-		mFrameQueue->setSize(mNumPrecachedFrames);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_FFmpeg::unload()
			
 
				-{
			
 
				-	if (mInputBuffer)
			
 
				-	{
			
 
				-//		av_free(mInputBuffer);
			
 
				-		mInputBuffer = NULL;
			
 
				-	}
			
 
				-	if (mAvioContext)
			
 
				-	{
			
 
				-		av_free(mAvioContext);
			
 
				-		mAvioContext = NULL;
			
 
				-	}
			
 
				-	if (mFrame)
			
 
				-	{
			
 
				-		av_free(mFrame);
			
 
				-		mFrame = NULL;
			
 
				-	}
			
 
				-	if (mCodecContext)
			
 
				-	{
			
 
				-		avcodec_close(mCodecContext);
			
 
				-		mCodecContext = NULL;
			
 
				-	}
			
 
				-	if (mFormatContext)
			
 
				-	{
			
 
				-		avformat_close_input(&mFormatContext);
			
 
				-		mFormatContext = NULL;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_FFmpeg::_readData()
			
 
				-{
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_FFmpeg::decodeNextFrame()
			
 
				-{
			
 
				-	TheoraVideoFrame* frame = mFrameQueue->requestEmptyFrame();
			
 
				-	if (!frame) return 0;
			
 
				-
			
 
				-	AVPacket packet;
			
 
				-	int frameFinished;
			
 
				-	
			
 
				-	while (av_read_frame(mFormatContext, &packet) >= 0)
			
 
				-	{
			
 
				-		if (packet.stream_index == mVideoStreamIndex)
			
 
				-		{
			
 
				-			avcodec_decode_video2(mCodecContext, mFrame, &frameFinished, &packet);
			
 
				-			
			
 
				-			if (frameFinished)
			
 
				-			{
			
 
				-				TheoraPixelTransform t;
			
 
				-				memset(&t, 0, sizeof(TheoraPixelTransform));
			
 
				-
			
 
				-				t.y = mFrame->data[0]; t.yStride = mFrame->linesize[0];
			
 
				-				t.u = mFrame->data[1]; t.uStride = mFrame->linesize[1];
			
 
				-				t.v = mFrame->data[2]; t.vStride = mFrame->linesize[2];
			
 
				-				
			
 
				-				frame->decode(&t);
			
 
				-				frame->mTimeToDisplay = mFrameNumber / mFPS;
			
 
				-				frame->mIteration = mIteration;
			
 
				-				frame->_setFrameNumber(mFrameNumber++);
			
 
				-
			
 
				-				av_free_packet(&packet);
			
 
				-				break;
			
 
				-			}
			
 
				-		}
			
 
				-		av_free_packet(&packet);
			
 
				-	}
			
 
				-	return 1;
			
 
				-}
			
 
				- 
			
 
				-void TheoraVideoClip_FFmpeg::decodedAudioCheck()
			
 
				-{
			
 
				-	if (!mAudioInterface || mTimer->isPaused()) return;
			
 
				-	
			
 
				-	mAudioMutex->lock();
			
 
				-	flushAudioPackets(mAudioInterface);
			
 
				-	mAudioMutex->unlock();
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip_FFmpeg::decodeAudio()
			
 
				-{
			
 
				-	return -1;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_FFmpeg::doSeek()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_FFmpeg::_restart()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/FFmpeg/TheoraVideoClip_FFmpeg.h
+++ b/drivers/theoraplayer/src/FFmpeg/TheoraVideoClip_FFmpeg.h
@@ -1,53 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#if defined(__FFMPEG) && !defined(_TheoraVideoClip_FFmpeg_h)
			
 
				-#define _TheoraVideoClip_FFmpeg_h
			
 
				-
			
 
				-#include "TheoraAudioPacketQueue.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-
			
 
				-struct AVFormatContext;
			
 
				-struct AVCodecContext;
			
 
				-struct AVCodec;
			
 
				-struct AVFrame;
			
 
				-struct AVIOContext;
			
 
				-
			
 
				-class TheoraVideoClip_FFmpeg : public TheoraVideoClip, public TheoraAudioPacketQueue
			
 
				-{
			
 
				-protected:
			
 
				-	bool mLoaded;
			
 
				-	
			
 
				-	AVFormatContext* mFormatContext;
			
 
				-	AVCodecContext* mCodecContext;
			
 
				-	AVIOContext* mAvioContext;
			
 
				-	AVCodec* mCodec;
			
 
				-	AVFrame* mFrame;
			
 
				-	unsigned char* mInputBuffer;
			
 
				-	int mVideoStreamIndex;
			
 
				-	int mFrameNumber;
			
 
				-	
			
 
				-	void unload();
			
 
				-	void doSeek();
			
 
				-public:
			
 
				-	TheoraVideoClip_FFmpeg(TheoraDataSource* data_source,
			
 
				-								 TheoraOutputMode output_mode,
			
 
				-								 int nPrecachedFrames,
			
 
				-								 bool usePower2Stride);
			
 
				-	~TheoraVideoClip_FFmpeg();
			
 
				-	
			
 
				-	bool _readData();
			
 
				-	bool decodeNextFrame();
			
 
				-	void _restart();
			
 
				-	void load(TheoraDataSource* source);
			
 
				-	float decodeAudio();
			
 
				-	void decodedAudioCheck();
			
 
				-	std::string getDecoderName() { return "FFmpeg"; }
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/Theora/TheoraVideoClip_Theora.cpp
+++ b/drivers/theoraplayer/src/Theora/TheoraVideoClip_Theora.cpp
@@ -1,703 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifdef __THEORA
			
 
				-#include <memory.h>
			
 
				-#include <algorithm>
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-#include "TheoraTimer.h"
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraException.h"
			
 
				-#include "TheoraVideoClip_Theora.h"
			
 
				-#include "TheoraPixelTransform.h"
			
 
				-
			
 
				-TheoraVideoClip_Theora::TheoraVideoClip_Theora(TheoraDataSource* data_source,
			
 
				-										TheoraOutputMode output_mode,
			
 
				-										int nPrecachedFrames,
			
 
				-										bool usePower2Stride):
			
 
				-	TheoraVideoClip(data_source, output_mode, nPrecachedFrames, usePower2Stride),
			
 
				-	TheoraAudioPacketQueue()
			
 
				-{
			
 
				-	mInfo.TheoraDecoder = NULL;
			
 
				-	mInfo.TheoraSetup = NULL;
			
 
				-	mVorbisStreams = mTheoraStreams = 0;
			
 
				-	mReadAudioSamples = 0;
			
 
				-	mLastDecodedFrameNumber = 0;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip_Theora::~TheoraVideoClip_Theora()
			
 
				-{
			
 
				-	if (mInfo.TheoraDecoder)
			
 
				-	{
			
 
				-		th_decode_free(mInfo.TheoraDecoder);
			
 
				-		th_setup_free(mInfo.TheoraSetup);
			
 
				-
			
 
				-		if (mAudioInterface)
			
 
				-		{
			
 
				-			vorbis_dsp_clear(&mInfo.VorbisDSPState);
			
 
				-			vorbis_block_clear(&mInfo.VorbisBlock);
			
 
				-		}
			
 
				-
			
 
				-		ogg_stream_clear(&mInfo.TheoraStreamState);
			
 
				-		th_comment_clear(&mInfo.TheoraComment);
			
 
				-		th_info_clear(&mInfo.TheoraInfo);
			
 
				-		
			
 
				-		ogg_stream_clear(&mInfo.VorbisStreamState);
			
 
				-		vorbis_comment_clear(&mInfo.VorbisComment);
			
 
				-		vorbis_info_clear(&mInfo.VorbisInfo);
			
 
				-		
			
 
				-		ogg_sync_clear(&mInfo.OggSyncState);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_Theora::_readData()
			
 
				-{
			
 
				-	int audio_eos = 0, serno;
			
 
				-	float audio_time = 0;
			
 
				-	float time = mTimer->getTime();
			
 
				-	if (mRestarted) time = 0;
			
 
				-	
			
 
				-	for (;;)
			
 
				-	{
			
 
				-		char *buffer = ogg_sync_buffer(&mInfo.OggSyncState, 4096);
			
 
				-		int bytes_read = mStream->read(buffer, 4096);
			
 
				-		ogg_sync_wrote(&mInfo.OggSyncState, bytes_read);
			
 
				-		
			
 
				-		if (bytes_read < 4096)
			
 
				-		{
			
 
				-			if (bytes_read == 0)
			
 
				-			{
			
 
				-				if (!mAutoRestart) mEndOfFile = true;
			
 
				-				return 0;
			
 
				-			}
			
 
				-		}
			
 
				-		// when we fill the stream with enough pages, it'll start spitting out packets
			
 
				-		// which contain keyframes, delta frames or audio data
			
 
				-		while (ogg_sync_pageout(&mInfo.OggSyncState, &mInfo.OggPage) > 0)
			
 
				-		{
			
 
				-			serno = ogg_page_serialno(&mInfo.OggPage);
			
 
				-			if (serno == mInfo.TheoraStreamState.serialno) ogg_stream_pagein(&mInfo.TheoraStreamState, &mInfo.OggPage);
			
 
				-			if (mAudioInterface && serno == mInfo.VorbisStreamState.serialno)
			
 
				-			{
			
 
				-				ogg_int64_t g = ogg_page_granulepos(&mInfo.OggPage);
			
 
				-				audio_time = (float) vorbis_granule_time(&mInfo.VorbisDSPState, g);
			
 
				-				audio_eos = ogg_page_eos(&mInfo.OggPage);
			
 
				-				ogg_stream_pagein(&mInfo.VorbisStreamState, &mInfo.OggPage);
			
 
				-			}
			
 
				-		}
			
 
				-		if (!(mAudioInterface && !audio_eos && audio_time < time + 1.0f))
			
 
				-			break;
			
 
				-	}
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip_Theora::decodeNextFrame()
			
 
				-{
			
 
				-	if (mEndOfFile) return 0;
			
 
				-	
			
 
				-	TheoraVideoFrame* frame = mFrameQueue->requestEmptyFrame();
			
 
				-	if (!frame) return 0; // max number of precached frames reached
			
 
				-	bool should_restart = 0;
			
 
				-	ogg_packet opTheora;
			
 
				-	ogg_int64_t granulePos;
			
 
				-	th_ycbcr_buffer buff;
			
 
				-    int ret, nAttempts;
			
 
				-	for (;;)
			
 
				-	{
			
 
				-        // ogg_stream_packetout can return -1 and the official docs suggest to do subsequent calls until it succeeds
			
 
				-        // because the data is out of sync. still will limit the number of attempts just in case
			
 
				-        for (ret = -1, nAttempts = 0; ret < 0 && nAttempts < 100; nAttempts++)
			
 
				-        {
			
 
				-            ret = ogg_stream_packetout(&mInfo.TheoraStreamState, &opTheora);
			
 
				-        }
			
 
				-		
			
 
				-		if (ret > 0)
			
 
				-		{
			
 
				-			int status = th_decode_packetin(mInfo.TheoraDecoder, &opTheora, &granulePos);
			
 
				-            if (status != 0 && status != TH_DUPFRAME) continue; // 0 means success
			
 
				-
			
 
				-			float time = (float) th_granule_time(mInfo.TheoraDecoder, granulePos);
			
 
				-			unsigned long frame_number = (unsigned long) th_granule_frame(mInfo.TheoraDecoder, granulePos);
			
 
				-			
			
 
				-			if (time < mTimer->getTime() && !mRestarted && frame_number % 16 != 0)
			
 
				-			{
			
 
				-				// %16 operation is here to prevent a playback halt during video playback if the decoder can't keep up with demand.
			
 
				-#ifdef _DEBUG
			
 
				-				th_writelog(mName + ": pre-dropped frame " + str((int) frame_number));
			
 
				-#endif
			
 
				-				++mNumDroppedFrames;
			
 
				-				continue; // drop frame
			
 
				-			}
			
 
				-			frame->mTimeToDisplay = time - mFrameDuration;
			
 
				-			frame->mIteration     = mIteration;
			
 
				-			frame->_setFrameNumber(frame_number);
			
 
				-			mLastDecodedFrameNumber = frame_number;
			
 
				-			th_decode_ycbcr_out(mInfo.TheoraDecoder, buff);
			
 
				-			TheoraPixelTransform t;
			
 
				-			memset(&t, 0, sizeof(TheoraPixelTransform));
			
 
				-			
			
 
				-			t.y = buff[0].data; t.yStride = buff[0].stride;
			
 
				-			t.u = buff[1].data; t.uStride = buff[1].stride;
			
 
				-			t.v = buff[2].data; t.vStride = buff[2].stride;
			
 
				-			frame->decode(&t);
			
 
				-			break;
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			if (!_readData())
			
 
				-			{
			
 
				-				frame->mInUse = 0;
			
 
				-				should_restart = mAutoRestart;
			
 
				-				break;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-	if (mAudioInterface != NULL)
			
 
				-	{
			
 
				-		mAudioMutex->lock();
			
 
				-		decodeAudio();
			
 
				-		mAudioMutex->unlock();
			
 
				-	}
			
 
				-	if (should_restart)
			
 
				-    {
			
 
				-        ++mIteration;
			
 
				-		_restart();
			
 
				-	}
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_Theora::_restart()
			
 
				-{
			
 
				-	bool paused = mTimer->isPaused();
			
 
				-	if (!paused) mTimer->pause();
			
 
				-	long granule=0;
			
 
				-	th_decode_ctl(mInfo.TheoraDecoder,TH_DECCTL_SET_GRANPOS,&granule,sizeof(granule));
			
 
				-	th_decode_free(mInfo.TheoraDecoder);
			
 
				-	mInfo.TheoraDecoder=th_decode_alloc(&mInfo.TheoraInfo,mInfo.TheoraSetup);
			
 
				-	ogg_stream_reset(&mInfo.TheoraStreamState);
			
 
				-	if (mAudioInterface)
			
 
				-	{
			
 
				-		// empty the DSP buffer
			
 
				-		//float **pcm;
			
 
				-		//int len = vorbis_synthesis_pcmout(&mInfo.VorbisDSPState,&pcm);
			
 
				-		//if (len) vorbis_synthesis_read(&mInfo.VorbisDSPState,len);
			
 
				-		ogg_packet opVorbis;
			
 
				-		mReadAudioSamples = 0;
			
 
				-		while (ogg_stream_packetout(&mInfo.VorbisStreamState,&opVorbis) > 0)
			
 
				-		{
			
 
				-			if (vorbis_synthesis(&mInfo.VorbisBlock,&opVorbis) == 0)
			
 
				-				vorbis_synthesis_blockin(&mInfo.VorbisDSPState,&mInfo.VorbisBlock);
			
 
				-		}
			
 
				-		ogg_stream_reset(&mInfo.VorbisStreamState);
			
 
				-	}
			
 
				-	
			
 
				-	ogg_sync_reset(&mInfo.OggSyncState);
			
 
				-	mStream->seek(0);
			
 
				-	ogg_int64_t granulePos = 0;
			
 
				-	th_decode_ctl(mInfo.TheoraDecoder, TH_DECCTL_SET_GRANPOS, &granulePos, sizeof(granule));
			
 
				-	
			
 
				-	mEndOfFile = false;
			
 
				-	
			
 
				-	mRestarted = 1;
			
 
				-	
			
 
				-	if (!paused) mTimer->play();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_Theora::load(TheoraDataSource* source)
			
 
				-{
			
 
				-#ifdef _DEBUG
			
 
				-	th_writelog("-----");
			
 
				-#endif
			
 
				-	mStream = source;
			
 
				-	readTheoraVorbisHeaders();
			
 
				-	
			
 
				-	mInfo.TheoraDecoder = th_decode_alloc(&mInfo.TheoraInfo,mInfo.TheoraSetup);
			
 
				-	
			
 
				-	mWidth = mInfo.TheoraInfo.frame_width;
			
 
				-	mHeight = mInfo.TheoraInfo.frame_height;
			
 
				-    mSubFrameWidth	 = mInfo.TheoraInfo.pic_width;
			
 
				-    mSubFrameHeight	 = mInfo.TheoraInfo.pic_height;
			
 
				-    mSubFrameOffsetX = mInfo.TheoraInfo.pic_x;
			
 
				-    mSubFrameOffsetY = mInfo.TheoraInfo.pic_y;
			
 
				-    mStride = (mStride == 1) ? mStride = _nextPow2(getWidth()) : getWidth();
			
 
				-	mFPS = mInfo.TheoraInfo.fps_numerator / (float) mInfo.TheoraInfo.fps_denominator;
			
 
				-	
			
 
				-#ifdef _DEBUG
			
 
				-	th_writelog("width: " + str(mWidth) + ", height: " + str(mHeight) + ", fps: " + str((int) getFPS()));
			
 
				-#endif
			
 
				-	mFrameQueue = new TheoraFrameQueue(this);
			
 
				-	mFrameQueue->setSize(mNumPrecachedFrames);
			
 
				-	// find out the duration of the file by seeking to the end
			
 
				-	// having ogg decode pages, extract the granule pos from
			
 
				-	// the last theora page and seek back to beginning of the file
			
 
				-	long streamSize = mStream->size(), seekPos;
			
 
				-	for (int i = 1; i <= 50; ++i)
			
 
				-	{
			
 
				-		ogg_sync_reset(&mInfo.OggSyncState);
			
 
				-		seekPos = streamSize - 4096 * i;
			
 
				-		if (seekPos < 0) seekPos = 0;
			
 
				-		mStream->seek(seekPos);
			
 
				-		
			
 
				-		char *buffer = ogg_sync_buffer(&mInfo.OggSyncState, 4096 * i);
			
 
				-		int bytes_read = mStream->read(buffer, 4096 * i);
			
 
				-		ogg_sync_wrote(&mInfo.OggSyncState, bytes_read);
			
 
				-		ogg_sync_pageseek(&mInfo.OggSyncState, &mInfo.OggPage);
			
 
				-		
			
 
				-		for (;;)
			
 
				-		{
			
 
				-			int ret = ogg_sync_pageout(&mInfo.OggSyncState, &mInfo.OggPage);
			
 
				-			if (ret == 0) break;
			
 
				-			// if page is not a theora page, skip it
			
 
				-			if (ogg_page_serialno(&mInfo.OggPage) != mInfo.TheoraStreamState.serialno) continue;
			
 
				-			
			
 
				-			ogg_int64_t granule = ogg_page_granulepos(&mInfo.OggPage);
			
 
				-			if (granule >= 0)
			
 
				-			{
			
 
				-				mNumFrames = (int) th_granule_frame(mInfo.TheoraDecoder, granule) + 1;
			
 
				-			}
			
 
				-			else if (mNumFrames > 0)
			
 
				-				++mNumFrames; // append delta frames at the end to get the exact numbe
			
 
				-		}
			
 
				-		if (mNumFrames > 0 || streamSize - 4096 * i < 0) break;
			
 
				-		
			
 
				-	}
			
 
				-	if (mNumFrames < 0)
			
 
				-		th_writelog("unable to determine file duration!");
			
 
				-	else
			
 
				-	{
			
 
				-		mDuration = mNumFrames / mFPS;
			
 
				-#ifdef _DEBUG
			
 
				-		th_writelog("duration: " + strf(mDuration) + " seconds");
			
 
				-#endif
			
 
				-	}
			
 
				-	// restore to beginning of stream.
			
 
				-	ogg_sync_reset(&mInfo.OggSyncState);
			
 
				-	mStream->seek(0);
			
 
				-	
			
 
				-	if (mVorbisStreams) // if there is no audio interface factory defined, even though the video
			
 
				-		// clip might have audio, it will be ignored
			
 
				-	{
			
 
				-		vorbis_synthesis_init(&mInfo.VorbisDSPState, &mInfo.VorbisInfo);
			
 
				-		vorbis_block_init(&mInfo.VorbisDSPState, &mInfo.VorbisBlock);
			
 
				-		mNumAudioChannels = mInfo.VorbisInfo.channels;
			
 
				-		mAudioFrequency = (int) mInfo.VorbisInfo.rate;
			
 
				-
			
 
				-		// create an audio interface instance if available
			
 
				-		TheoraAudioInterfaceFactory* audio_factory = TheoraVideoManager::getSingleton().getAudioInterfaceFactory();
			
 
				-		printf("**** audio factory is %p\n", audio_factory);
			
 
				-		if (audio_factory) setAudioInterface(audio_factory->createInstance(this, mNumAudioChannels, mAudioFrequency));
			
 
				-	}
			
 
				-	
			
 
				-	mFrameDuration = 1.0f / getFPS();
			
 
				-#ifdef _DEBUG
			
 
				-	th_writelog("-----");
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_Theora::readTheoraVorbisHeaders()
			
 
				-{
			
 
				-	ogg_packet tempOggPacket;
			
 
				-	bool done = false;
			
 
				-	bool decode_audio=TheoraVideoManager::getSingleton().getAudioInterfaceFactory() != NULL;
			
 
				-	//init Vorbis/Theora Layer
			
 
				-	//Ensure all structures get cleared out.
			
 
				-	memset(&mInfo.OggSyncState, 0, sizeof(ogg_sync_state));
			
 
				-	memset(&mInfo.OggPage, 0, sizeof(ogg_page));
			
 
				-	memset(&mInfo.VorbisStreamState, 0, sizeof(ogg_stream_state));
			
 
				-	memset(&mInfo.TheoraStreamState, 0, sizeof(ogg_stream_state));
			
 
				-	memset(&mInfo.TheoraInfo, 0, sizeof(th_info));
			
 
				-	memset(&mInfo.TheoraComment, 0, sizeof(th_comment));
			
 
				-	memset(&mInfo.VorbisInfo, 0, sizeof(vorbis_info));
			
 
				-	memset(&mInfo.VorbisDSPState, 0, sizeof(vorbis_dsp_state));
			
 
				-	memset(&mInfo.VorbisBlock, 0, sizeof(vorbis_block));
			
 
				-	memset(&mInfo.VorbisComment, 0, sizeof(vorbis_comment));
			
 
				-	
			
 
				-	ogg_sync_init(&mInfo.OggSyncState);
			
 
				-	th_comment_init(&mInfo.TheoraComment);
			
 
				-	th_info_init(&mInfo.TheoraInfo);
			
 
				-	vorbis_info_init(&mInfo.VorbisInfo);
			
 
				-	vorbis_comment_init(&mInfo.VorbisComment);
			
 
				-	
			
 
				-	while (!done)
			
 
				-	{
			
 
				-		char *buffer = ogg_sync_buffer(&mInfo.OggSyncState, 4096);
			
 
				-		int bytes_read = mStream->read(buffer, 4096);
			
 
				-		ogg_sync_wrote(&mInfo.OggSyncState, bytes_read);
			
 
				-		
			
 
				-		if (bytes_read == 0)
			
 
				-			break;
			
 
				-		
			
 
				-		while (ogg_sync_pageout(&mInfo.OggSyncState, &mInfo.OggPage) > 0)
			
 
				-		{
			
 
				-			ogg_stream_state OggStateTest;
			
 
				-			
			
 
				-			//is this an initial header? If not, stop
			
 
				-			if (!ogg_page_bos(&mInfo.OggPage))
			
 
				-			{
			
 
				-				//This is done blindly, because stream only accept themselves
			
 
				-				if (mTheoraStreams) ogg_stream_pagein(&mInfo.TheoraStreamState, &mInfo.OggPage);
			
 
				-				if (mVorbisStreams) ogg_stream_pagein(&mInfo.VorbisStreamState, &mInfo.OggPage);
			
 
				-				
			
 
				-				done=true;
			
 
				-				break;
			
 
				-			}
			
 
				-			
			
 
				-			ogg_stream_init(&OggStateTest, ogg_page_serialno(&mInfo.OggPage));
			
 
				-			ogg_stream_pagein(&OggStateTest, &mInfo.OggPage);
			
 
				-			ogg_stream_packetout(&OggStateTest, &tempOggPacket);
			
 
				-			
			
 
				-			//identify the codec
			
 
				-			int ret;
			
 
				-			if (!mTheoraStreams)
			
 
				-			{
			
 
				-				ret = th_decode_headerin(&mInfo.TheoraInfo, &mInfo.TheoraComment, &mInfo.TheoraSetup, &tempOggPacket);
			
 
				-				
			
 
				-				if (ret > 0)
			
 
				-				{
			
 
				-					//This is the Theora Header
			
 
				-					memcpy(&mInfo.TheoraStreamState, &OggStateTest, sizeof(OggStateTest));
			
 
				-					mTheoraStreams = 1;
			
 
				-					continue;
			
 
				-				}
			
 
				-			}
			
 
				-			if (decode_audio && !mVorbisStreams &&
			
 
				-				vorbis_synthesis_headerin(&mInfo.VorbisInfo, &mInfo.VorbisComment, &tempOggPacket) >=0)
			
 
				-			{
			
 
				-				//This is vorbis header
			
 
				-				memcpy(&mInfo.VorbisStreamState, &OggStateTest, sizeof(OggStateTest));
			
 
				-				mVorbisStreams = 1;
			
 
				-				continue;
			
 
				-			}
			
 
				-			//Hmm. I guess it's not a header we support, so erase it
			
 
				-			ogg_stream_clear(&OggStateTest);
			
 
				-		}
			
 
				-	}
			
 
				-	
			
 
				-	while ((mTheoraStreams && (mTheoraStreams < 3)) ||
			
 
				-		   (mVorbisStreams && (mVorbisStreams < 3)))
			
 
				-	{
			
 
				-		//Check 2nd'dary headers... Theora First
			
 
				-		int iSuccess;
			
 
				-		while (mTheoraStreams && mTheoraStreams < 3 &&
			
 
				-			  (iSuccess = ogg_stream_packetout(&mInfo.TheoraStreamState, &tempOggPacket)))
			
 
				-		{
			
 
				-			if (iSuccess < 0)
			
 
				-				throw TheoraGenericException("Error parsing Theora stream headers.");
			
 
				-			if (!th_decode_headerin(&mInfo.TheoraInfo, &mInfo.TheoraComment, &mInfo.TheoraSetup, &tempOggPacket))
			
 
				-				throw TheoraGenericException("invalid theora stream");
			
 
				-			
			
 
				-			++mTheoraStreams;
			
 
				-		} //end while looking for more theora headers
			
 
				-		
			
 
				-		//look 2nd vorbis header packets
			
 
				-		while (mVorbisStreams < 3 && (iSuccess = ogg_stream_packetout(&mInfo.VorbisStreamState, &tempOggPacket)))
			
 
				-		{
			
 
				-			if (iSuccess < 0)
			
 
				-				throw TheoraGenericException("Error parsing vorbis stream headers");
			
 
				-			
			
 
				-			if (vorbis_synthesis_headerin(&mInfo.VorbisInfo, &mInfo.VorbisComment,&tempOggPacket))
			
 
				-				throw TheoraGenericException("invalid stream");
			
 
				-			
			
 
				-			++mVorbisStreams;
			
 
				-		} //end while looking for more vorbis headers
			
 
				-		
			
 
				-		//Not finished with Headers, get some more file data
			
 
				-		if (ogg_sync_pageout(&mInfo.OggSyncState, &mInfo.OggPage) > 0)
			
 
				-		{
			
 
				-			if (mTheoraStreams) ogg_stream_pagein(&mInfo.TheoraStreamState, &mInfo.OggPage);
			
 
				-			if (mVorbisStreams) ogg_stream_pagein(&mInfo.VorbisStreamState, &mInfo.OggPage);
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			char *buffer = ogg_sync_buffer(&mInfo.OggSyncState, 4096);
			
 
				-			int bytes_read = mStream->read(buffer, 4096);
			
 
				-			ogg_sync_wrote(&mInfo.OggSyncState, bytes_read);
			
 
				-			
			
 
				-			if (bytes_read == 0)
			
 
				-				throw TheoraGenericException("End of file found prematurely");
			
 
				-		}
			
 
				-	} //end while looking for all headers
			
 
				-	//	writelog("Vorbis Headers: " + str(mVorbisHeaders) + " Theora Headers : " + str(mTheoraHeaders));
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_Theora::decodedAudioCheck()
			
 
				-{
			
 
				-	if (!mAudioInterface || mTimer->isPaused()) return;
			
 
				-
			
 
				-	mAudioMutex->lock();
			
 
				-	flushAudioPackets(mAudioInterface);
			
 
				-	mAudioMutex->unlock();
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip_Theora::decodeAudio()
			
 
				-{
			
 
				-	if (mRestarted) return -1;
			
 
				-	
			
 
				-	ogg_packet opVorbis;
			
 
				-	float **pcm;
			
 
				-	int len = 0;
			
 
				-	float timestamp = -1;
			
 
				-	bool read_past_timestamp = 0;
			
 
				-	
			
 
				-	float factor = 1.0f / mAudioFrequency;
			
 
				-	float videoTime = (float) mLastDecodedFrameNumber / mFPS;
			
 
				-	float min = mFrameQueue->getSize() / mFPS + 1.0f;
			
 
				-
			
 
				-	for (;;)
			
 
				-	{
			
 
				-		len = vorbis_synthesis_pcmout(&mInfo.VorbisDSPState, &pcm);
			
 
				-		if (len == 0)
			
 
				-		{
			
 
				-			if (ogg_stream_packetout(&mInfo.VorbisStreamState, &opVorbis) > 0)
			
 
				-			{
			
 
				-				if (vorbis_synthesis(&mInfo.VorbisBlock, &opVorbis) == 0)
			
 
				-				{
			
 
				-					if (timestamp < 0 && opVorbis.granulepos >= 0)
			
 
				-					{
			
 
				-						timestamp = (float) vorbis_granule_time(&mInfo.VorbisDSPState, opVorbis.granulepos);
			
 
				-					}
			
 
				-					else if (timestamp >= 0) read_past_timestamp = 1;
			
 
				-					vorbis_synthesis_blockin(&mInfo.VorbisDSPState, &mInfo.VorbisBlock);
			
 
				-				}
			
 
				-				continue;
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				float audioTime = mReadAudioSamples * factor;
			
 
				-				// always buffer up of audio ahead of the frames
			
 
				-				if (audioTime - videoTime < min)
			
 
				-				{
			
 
				-					if (!_readData()) break;
			
 
				-				}
			
 
				-				else
			
 
				-					break;
			
 
				-			}
			
 
				-		}
			
 
				-		addAudioPacket(pcm, len, mAudioGain);
			
 
				-		mReadAudioSamples += len;
			
 
				-		if (read_past_timestamp) timestamp += (float) len / mInfo.VorbisInfo.rate;
			
 
				-		vorbis_synthesis_read(&mInfo.VorbisDSPState, len); // tell vorbis we read a number of samples
			
 
				-	}
			
 
				-	return timestamp;
			
 
				-}
			
 
				-
			
 
				-long TheoraVideoClip_Theora::seekPage(long targetFrame, bool return_keyframe)
			
 
				-{
			
 
				-	int i,seek_min = 0, seek_max = (int) mStream->size();
			
 
				-	long frame;
			
 
				-	ogg_int64_t granule = 0;
			
 
				-	
			
 
				-	if (targetFrame == 0) mStream->seek(0);
			
 
				-	for (i = (targetFrame == 0) ? 100 : 0; i < 100; ++i)
			
 
				-	{
			
 
				-		ogg_sync_reset(&mInfo.OggSyncState);
			
 
				-		mStream->seek((seek_min + seek_max) / 2); // do a binary search
			
 
				-		memset(&mInfo.OggPage, 0, sizeof(ogg_page));
			
 
				-		ogg_sync_pageseek(&mInfo.OggSyncState, &mInfo.OggPage);
			
 
				-		
			
 
				-		for (;i < 1000;)
			
 
				-		{
			
 
				-			int ret = ogg_sync_pageout(&mInfo.OggSyncState, &mInfo.OggPage);
			
 
				-			if (ret == 1)
			
 
				-			{
			
 
				-				int serno = ogg_page_serialno(&mInfo.OggPage);
			
 
				-				if (serno == mInfo.TheoraStreamState.serialno)
			
 
				-				{
			
 
				-					granule = ogg_page_granulepos(&mInfo.OggPage);
			
 
				-					if (granule >= 0)
			
 
				-					{
			
 
				-						frame = (long) th_granule_frame(mInfo.TheoraDecoder, granule);
			
 
				-						if (frame < targetFrame && targetFrame - frame < 10)
			
 
				-						{
			
 
				-							// we're close enough, let's break this.
			
 
				-							i = 1000;
			
 
				-							break;
			
 
				-						}
			
 
				-						// we're not close enough, let's shorten the borders of the binary search
			
 
				-						if (targetFrame - 1 > frame) seek_min = (seek_min + seek_max) / 2;
			
 
				-						else				         seek_max = (seek_min + seek_max) / 2;
			
 
				-						break;
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				char *buffer = ogg_sync_buffer(&mInfo.OggSyncState, 4096);
			
 
				-				int bytes_read = mStream->read(buffer, 4096);
			
 
				-				if (bytes_read == 0) break;
			
 
				-				ogg_sync_wrote(&mInfo.OggSyncState, bytes_read);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	if (return_keyframe) return (long) (granule >> mInfo.TheoraInfo.keyframe_granule_shift);
			
 
				-	
			
 
				-	ogg_sync_reset(&mInfo.OggSyncState);
			
 
				-	memset(&mInfo.OggPage, 0, sizeof(ogg_page));
			
 
				-	ogg_sync_pageseek(&mInfo.OggSyncState, &mInfo.OggPage);
			
 
				-	if (targetFrame == 0) return -1;
			
 
				-	mStream->seek((seek_min + seek_max) / 2); // do a binary search
			
 
				-	return -1;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip_Theora::doSeek()
			
 
				-{
			
 
				-#if _DEBUG
			
 
				-	th_writelog(mName + " [seek]: seeking to frame " + str(mSeekFrame));
			
 
				-#endif
			
 
				-	int frame;
			
 
				-	float time = mSeekFrame / getFPS();
			
 
				-	mTimer->seek(time);
			
 
				-	bool paused = mTimer->isPaused();
			
 
				-	if (!paused) mTimer->pause(); // pause until seeking is done
			
 
				-	
			
 
				-	mEndOfFile = false;
			
 
				-	mRestarted = false;
			
 
				-	
			
 
				-	resetFrameQueue();
			
 
				-	// reset the video decoder.
			
 
				-	ogg_stream_reset(&mInfo.TheoraStreamState);
			
 
				-	th_decode_free(mInfo.TheoraDecoder);
			
 
				-	mInfo.TheoraDecoder = th_decode_alloc(&mInfo.TheoraInfo, mInfo.TheoraSetup);
			
 
				-	
			
 
				-	if (mAudioInterface)
			
 
				-	{
			
 
				-		mAudioMutex->lock();
			
 
				-		ogg_stream_reset(&mInfo.VorbisStreamState);
			
 
				-		vorbis_synthesis_restart(&mInfo.VorbisDSPState);
			
 
				-		destroyAllAudioPackets();
			
 
				-	}
			
 
				-	// first seek to desired frame, then figure out the location of the
			
 
				-	// previous keyframe and seek to it.
			
 
				-	// then by setting the correct time, the decoder will skip N frames untill
			
 
				-	// we get the frame we want.
			
 
				-	frame = (int) seekPage(mSeekFrame, 1); // find the keyframe nearest to the target frame
			
 
				-#ifdef _DEBUG
			
 
				-	//		th_writelog(mName + " [seek]: nearest keyframe for frame " + str(mSeekFrame) + " is frame: " + str(frame));
			
 
				-#endif
			
 
				-	seekPage(std::max(0, frame - 1), 0);
			
 
				-	
			
 
				-	ogg_packet opTheora;
			
 
				-	ogg_int64_t granulePos;
			
 
				-	bool granule_set = 0;
			
 
				-	if (frame <= 1)
			
 
				-	{
			
 
				-		if (mInfo.TheoraInfo.version_major == 3 && mInfo.TheoraInfo.version_minor == 2 && mInfo.TheoraInfo.version_subminor == 0)
			
 
				-			granulePos = 0;
			
 
				-		else
			
 
				-			granulePos = 1; // because of difference in granule interpretation in theora streams 3.2.0 and newer ones
			
 
				-		th_decode_ctl(mInfo.TheoraDecoder, TH_DECCTL_SET_GRANPOS, &granulePos, sizeof(granulePos));
			
 
				-		granule_set = 1;
			
 
				-	}
			
 
				-	
			
 
				-	// now that we've found the keyframe that preceeds our desired frame, lets keep on decoding frames until we
			
 
				-	// reach our target frame.
			
 
				-	
			
 
				-    int status, ret;
			
 
				-	for (;mSeekFrame != 0;)
			
 
				-	{
			
 
				-		ret = ogg_stream_packetout(&mInfo.TheoraStreamState, &opTheora);
			
 
				-		if (ret > 0)
			
 
				-		{
			
 
				-			if (!granule_set)
			
 
				-			{
			
 
				-				// theora decoder requires to set the granule pos after seek to be able to determine the current frame
			
 
				-				if (opTheora.granulepos >= 0)
			
 
				-				{
			
 
				-					th_decode_ctl(mInfo.TheoraDecoder, TH_DECCTL_SET_GRANPOS, &opTheora.granulepos, sizeof(opTheora.granulepos));
			
 
				-					granule_set = 1;
			
 
				-				}
			
 
				-				else continue; // ignore prev delta frames until we hit a keyframe
			
 
				-			}
			
 
				-			status = th_decode_packetin(mInfo.TheoraDecoder, &opTheora, &granulePos);
			
 
				-            if (status != 0 && status != TH_DUPFRAME) continue;
			
 
				-			frame = (int) th_granule_frame(mInfo.TheoraDecoder, granulePos);
			
 
				-			if (frame >= mSeekFrame - 1) break;
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			if (!_readData())
			
 
				-			{
			
 
				-				th_writelog(mName + " [seek]: fineseeking failed, _readData failed!");
			
 
				-				if (mAudioInterface) mAudioMutex->unlock();
			
 
				-				return;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-#ifdef _DEBUG
			
 
				-	//	th_writelog(mName + " [seek]: fineseeked to frame " + str(frame + 1) + ", requested: " + str(mSeekFrame));
			
 
				-#endif
			
 
				-	if (mAudioInterface)
			
 
				-	{
			
 
				-		// read audio data until we reach a timestamp. this usually takes only one iteration, but just in case let's
			
 
				-		// wrap it in a loop
			
 
				-		float timestamp;
			
 
				-		for (;;)
			
 
				-		{
			
 
				-			timestamp = decodeAudio();
			
 
				-			if (timestamp >= 0) break;
			
 
				-			else _readData();
			
 
				-		}
			
 
				-		float rate = (float) mAudioFrequency * mNumAudioChannels;
			
 
				-		float queued_time = getAudioPacketQueueLength();
			
 
				-		// at this point there are only 2 possibilities: either we have too much packets and we have to delete
			
 
				-		// the first N ones, or we don't have enough, so let's fill the gap with silence.
			
 
				- 		if (time > timestamp - queued_time)
			
 
				-		{
			
 
				-			while (mTheoraAudioPacketQueue != NULL)
			
 
				-			{
			
 
				-				if (time > timestamp - queued_time + mTheoraAudioPacketQueue->numSamples / rate)
			
 
				-				{
			
 
				-					queued_time -= mTheoraAudioPacketQueue->numSamples / rate;
			
 
				-					destroyAudioPacket(popAudioPacket());
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					int n_trim = (int) ((timestamp - queued_time + mTheoraAudioPacketQueue->numSamples / rate - time) * rate);
			
 
				-					if (mTheoraAudioPacketQueue->numSamples - n_trim <= 0)
			
 
				-						destroyAudioPacket(popAudioPacket()); // if there's no data to be left, just destroy it
			
 
				-					else
			
 
				-					{
			
 
				-						for (int i = n_trim, j = 0; i < mTheoraAudioPacketQueue->numSamples; ++i, ++j)
			
 
				-							mTheoraAudioPacketQueue->pcm[j] = mTheoraAudioPacketQueue->pcm[i];
			
 
				-						mTheoraAudioPacketQueue->numSamples -= n_trim;
			
 
				-					}
			
 
				-					break;
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			// expand the first packet with silence.
			
 
				-			if (mTheoraAudioPacketQueue) // just in case!
			
 
				-			{
			
 
				-				int i, j, nmissing = (int) ((timestamp - queued_time - time) * rate);
			
 
				-				if (nmissing > 0)
			
 
				-				{
			
 
				-					float* samples = new float[nmissing + mTheoraAudioPacketQueue->numSamples];
			
 
				-					for (i = 0; i < nmissing; ++i) samples[i] = 0;
			
 
				-					for (j = 0; i < nmissing + mTheoraAudioPacketQueue->numSamples; ++i, ++j)
			
 
				-						samples[i] = mTheoraAudioPacketQueue->pcm[j];
			
 
				-					delete [] mTheoraAudioPacketQueue->pcm;
			
 
				-					mTheoraAudioPacketQueue->pcm = samples;
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-		mLastDecodedFrameNumber = mSeekFrame;
			
 
				-		mReadAudioSamples = (unsigned int) (timestamp * mAudioFrequency);
			
 
				-		
			
 
				-		mAudioMutex->unlock();
			
 
				-	}
			
 
				-	if (!paused) mTimer->play();
			
 
				-	mSeekFrame = -1;
			
 
				-}
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/Theora/TheoraVideoClip_Theora.h
+++ b/drivers/theoraplayer/src/Theora/TheoraVideoClip_Theora.h
@@ -1,64 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#if defined(__THEORA) && !defined(_TheoraVideoClip_Theora_h)
			
 
				-#define _TheoraVideoClip_Theora_h
			
 
				-
			
 
				-#include <ogg/ogg.h>
			
 
				-#include <vorbis/vorbisfile.h>
			
 
				-#include <theora/theoradec.h>
			
 
				-#include "TheoraAudioPacketQueue.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-
			
 
				-struct TheoraInfoStruct
			
 
				-{
			
 
				-	// ogg/vorbis/theora variables
			
 
				-	ogg_sync_state   OggSyncState;
			
 
				-	ogg_page         OggPage;
			
 
				-	ogg_stream_state VorbisStreamState;
			
 
				-	ogg_stream_state TheoraStreamState;
			
 
				-	//Theora State
			
 
				-	th_info        TheoraInfo;
			
 
				-	th_comment     TheoraComment;
			
 
				-	th_setup_info* TheoraSetup;
			
 
				-	th_dec_ctx*    TheoraDecoder;
			
 
				-	//Vorbis State
			
 
				-	vorbis_info      VorbisInfo;
			
 
				-	vorbis_dsp_state VorbisDSPState;
			
 
				-	vorbis_block     VorbisBlock;
			
 
				-	vorbis_comment   VorbisComment;
			
 
				-};
			
 
				-
			
 
				-class TheoraVideoClip_Theora : public TheoraVideoClip, public TheoraAudioPacketQueue
			
 
				-{
			
 
				-protected:
			
 
				-	TheoraInfoStruct mInfo; // a pointer is used to avoid having to include theora & vorbis headers
			
 
				-	int mTheoraStreams, mVorbisStreams;	// Keeps track of Theora and Vorbis Streams
			
 
				-
			
 
				-	long seekPage(long targetFrame, bool return_keyframe);
			
 
				-	void doSeek();
			
 
				-	void readTheoraVorbisHeaders();
			
 
				-	unsigned int mReadAudioSamples;
			
 
				-	unsigned long mLastDecodedFrameNumber;
			
 
				-public:
			
 
				-	TheoraVideoClip_Theora(TheoraDataSource* data_source,
			
 
				-						   TheoraOutputMode output_mode,
			
 
				-						   int nPrecachedFrames,
			
 
				-						   bool usePower2Stride);
			
 
				-	~TheoraVideoClip_Theora();
			
 
				-
			
 
				-	bool _readData();
			
 
				-	bool decodeNextFrame();
			
 
				-	void _restart();
			
 
				-	void load(TheoraDataSource* source);
			
 
				-	float decodeAudio();
			
 
				-	void decodedAudioCheck();
			
 
				-	std::string getDecoderName() { return "Theora"; }
			
 
				-};
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/TheoraAsync.cpp
+++ b/drivers/theoraplayer/src/TheoraAsync.cpp
@@ -1,253 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-
			
 
				-#include <stdio.h>
			
 
				-#include <stdlib.h>
			
 
				-
			
 
				-#ifdef _WIN32
			
 
				-#include <windows.h>
			
 
				-#else
			
 
				-#include <unistd.h>
			
 
				-#include <pthread.h>
			
 
				-#endif
			
 
				-
			
 
				-#include "TheoraAsync.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-
			
 
				-#ifdef _WINRT
			
 
				-#include <wrl.h>
			
 
				-#endif
			
 
				-
			
 
				-///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				-// Mutex
			
 
				-///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				-
			
 
				-TheoraMutex::TheoraMutex()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT // WinXP does not have CreateTheoraMutexEx()
			
 
				-	mHandle = CreateMutex(0, 0, 0);
			
 
				-#else
			
 
				-	mHandle = CreateMutexEx(NULL, NULL, 0, SYNCHRONIZE);
			
 
				-#endif
			
 
				-#else
			
 
				-	mHandle = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t));
			
 
				-	pthread_mutex_init((pthread_mutex_t*)mHandle, 0);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-TheoraMutex::~TheoraMutex()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-	CloseHandle(mHandle);
			
 
				-#else
			
 
				-	pthread_mutex_destroy((pthread_mutex_t*)mHandle);
			
 
				-	free((pthread_mutex_t*)mHandle);
			
 
				-	mHandle = NULL;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-void TheoraMutex::lock()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-	WaitForSingleObjectEx(mHandle, INFINITE, FALSE);
			
 
				-#else
			
 
				-	pthread_mutex_lock((pthread_mutex_t*)mHandle);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-void TheoraMutex::unlock()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-	ReleaseMutex(mHandle);
			
 
				-#else
			
 
				-	pthread_mutex_unlock((pthread_mutex_t*)mHandle);
			
 
				-#endif
			
 
				-}
			
 
				-	
			
 
				-///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				-// Thread
			
 
				-///////////////////////////////////////////////////////////////////////////////////////////////////
			
 
				-
			
 
				-#ifdef _WINRT
			
 
				-using namespace Windows::Foundation;
			
 
				-using namespace Windows::System::Threading;
			
 
				-#endif
			
 
				-
			
 
				-#ifdef _WIN32
			
 
				-unsigned long WINAPI theoraAsyncCall(void* param)
			
 
				-#else
			
 
				-void* theoraAsyncCall(void* param)
			
 
				-#endif
			
 
				-{
			
 
				-	TheoraThread* t = (TheoraThread*)param;
			
 
				-	t->execute();
			
 
				-#ifdef _WIN32
			
 
				-	return 0;
			
 
				-#else
			
 
				-	pthread_exit(NULL);
			
 
				-	return NULL;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-#ifdef _WINRT
			
 
				-struct TheoraAsyncActionWrapper
			
 
				-{
			
 
				-public:
			
 
				-	IAsyncAction^ mAsyncAction;
			
 
				-	TheoraAsyncActionWrapper(IAsyncAction^ asyncAction)
			
 
				-	{
			
 
				-		mAsyncAction = asyncAction;
			
 
				-	}
			
 
				-};
			
 
				-#endif
			
 
				-	
			
 
				-TheoraThread::TheoraThread() : mRunning(false), mId(0)
			
 
				-{
			
 
				-#ifndef _WIN32
			
 
				-	mId = (pthread_t*)malloc(sizeof(pthread_t));
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-TheoraThread::~TheoraThread()
			
 
				-{
			
 
				-	if (mRunning)
			
 
				-	{
			
 
				-		stop();
			
 
				-	}
			
 
				-	if (mId != NULL)
			
 
				-	{
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-		CloseHandle(mId);
			
 
				-#else
			
 
				-		delete mId;
			
 
				-#endif
			
 
				-#else
			
 
				-		free((pthread_t*)mId);
			
 
				-#endif
			
 
				-		mId = NULL;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void TheoraThread::start()
			
 
				-{
			
 
				-	mRunning = true;
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-	mId = CreateThread(0, 0, &theoraAsyncCall, this, 0, 0);
			
 
				-#else
			
 
				-	mId = new TheoraAsyncActionWrapper(ThreadPool::RunAsync(
			
 
				-		ref new WorkItemHandler([&](IAsyncAction^ work_item)
			
 
				-		{
			
 
				-			execute();
			
 
				-		}),
			
 
				-		WorkItemPriority::Normal, WorkItemOptions::TimeSliced));
			
 
				-#endif
			
 
				-#else
			
 
				-	pthread_create((pthread_t*)mId, NULL, &theoraAsyncCall, this);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-bool TheoraThread::isRunning()
			
 
				-{
			
 
				-	bool ret;
			
 
				-	mRunningMutex.lock();
			
 
				-	ret = mRunning;
			
 
				-	mRunningMutex.unlock();
			
 
				-	
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-void TheoraThread::join()
			
 
				-{
			
 
				-	mRunningMutex.lock();
			
 
				-	mRunning = false;
			
 
				-	mRunningMutex.unlock();
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-	WaitForSingleObject(mId, INFINITE);
			
 
				-	if (mId != NULL)
			
 
				-	{
			
 
				-		CloseHandle(mId);
			
 
				-		mId = NULL;
			
 
				-	}
			
 
				-#else
			
 
				-	IAsyncAction^ action = ((TheoraAsyncActionWrapper*)mId)->mAsyncAction;
			
 
				-	int i = 0;
			
 
				-	while (action->Status != AsyncStatus::Completed &&
			
 
				-		action->Status != AsyncStatus::Canceled &&
			
 
				-		action->Status != AsyncStatus::Error &&
			
 
				-		i < 100)
			
 
				-	{
			
 
				-		_psleep(50);
			
 
				-		++i;
			
 
				-	}
			
 
				-	if (i >= 100)
			
 
				-	{
			
 
				-		i = 0;
			
 
				-		action->Cancel();
			
 
				-		while (action->Status != AsyncStatus::Completed &&
			
 
				-			action->Status != AsyncStatus::Canceled &&
			
 
				-			action->Status != AsyncStatus::Error &&
			
 
				-			i < 100)
			
 
				-		{
			
 
				-			_psleep(50);
			
 
				-			++i;
			
 
				-		}
			
 
				-	}
			
 
				-#endif
			
 
				-#else
			
 
				-	pthread_join(*((pthread_t*)mId), 0);
			
 
				-#endif
			
 
				-}
			
 
				-	
			
 
				-void TheoraThread::resume()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-	ResumeThread(mId);
			
 
				-#else
			
 
				-	// not available in WinRT
			
 
				-#endif
			
 
				-#endif
			
 
				-}
			
 
				-	
			
 
				-void TheoraThread::pause()
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-	SuspendThread(mId);
			
 
				-#else
			
 
				-	// not available in WinRT
			
 
				-#endif
			
 
				-#endif
			
 
				-}
			
 
				-	
			
 
				-void TheoraThread::stop()
			
 
				-{
			
 
				-	if (mRunning)
			
 
				-	{
			
 
				-		mRunningMutex.lock();
			
 
				-		mRunning = false;
			
 
				-		mRunningMutex.unlock();
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-		TerminateThread(mId, 0);
			
 
				-#else
			
 
				-		((TheoraAsyncActionWrapper*)mId)->mAsyncAction->Cancel();
			
 
				-#endif
			
 
				-#elif defined(_ANDROID)
			
 
				-		pthread_kill(*((pthread_t*)mId), 0);
			
 
				-#else
			
 
				-		pthread_cancel(*((pthread_t*)mId));
			
 
				-#endif
			
 
				-	}
			
 
				-}
			
 
				-	
			
--- a/drivers/theoraplayer/src/TheoraAudioInterface.cpp
+++ b/drivers/theoraplayer/src/TheoraAudioInterface.cpp
@@ -1,21 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-
			
 
				-TheoraAudioInterface::TheoraAudioInterface(TheoraVideoClip* owner, int nChannels, int freq)
			
 
				-{
			
 
				-	mFreq = freq;
			
 
				-	mNumChannels = nChannels;
			
 
				-	mClip = owner;
			
 
				-}
			
 
				-
			
 
				-TheoraAudioInterface::~TheoraAudioInterface()
			
 
				-{
			
 
				-	
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraAudioPacketQueue.cpp
+++ b/drivers/theoraplayer/src/TheoraAudioPacketQueue.cpp
@@ -1,126 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include <stdlib.h>
			
 
				-#include "TheoraAudioPacketQueue.h"
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-
			
 
				-TheoraAudioPacketQueue::TheoraAudioPacketQueue()
			
 
				-{
			
 
				-	mTheoraAudioPacketQueue = NULL;
			
 
				-}
			
 
				-
			
 
				-TheoraAudioPacketQueue::~TheoraAudioPacketQueue()
			
 
				-{
			
 
				-	destroyAllAudioPackets();
			
 
				-}
			
 
				-
			
 
				-float TheoraAudioPacketQueue::getAudioPacketQueueLength()
			
 
				-{
			
 
				-	float len = 0;
			
 
				-	for (TheoraAudioPacket* p = mTheoraAudioPacketQueue; p != NULL; p = p->next)
			
 
				-		len += p->numSamples;
			
 
				-	
			
 
				-	return len / (mAudioFrequency * mNumAudioChannels);
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::_addAudioPacket(float* data, int numSamples)
			
 
				-{
			
 
				-	TheoraAudioPacket* packet = new TheoraAudioPacket;
			
 
				-	packet->pcm = data;
			
 
				-	packet->numSamples = numSamples;
			
 
				-	packet->next = NULL;
			
 
				-
			
 
				-
			
 
				-	if (mTheoraAudioPacketQueue == NULL) mTheoraAudioPacketQueue = packet;
			
 
				-	else
			
 
				-	{
			
 
				-		TheoraAudioPacket* last = mTheoraAudioPacketQueue;
			
 
				-		for (TheoraAudioPacket* p = last; p != NULL; p = p->next)
			
 
				-			last = p;
			
 
				-		last->next = packet;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::addAudioPacket(float** buffer, int numSamples, float gain)
			
 
				-{
			
 
				-	float* data = new float[numSamples * mNumAudioChannels];
			
 
				-	float* dataptr = data;
			
 
				-	int i;
			
 
				-	unsigned int j;
			
 
				-	
			
 
				-	if (gain < 1.0f)
			
 
				-	{
			
 
				-		// apply gain, let's attenuate the samples
			
 
				-		for (i = 0; i < numSamples; ++i)
			
 
				-			for (j = 0; j < mNumAudioChannels; j++, ++dataptr)
			
 
				-				*dataptr = buffer[i][j] * gain;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		// do a simple copy, faster then the above method, when gain is 1.0f
			
 
				-		for (i = 0; i < numSamples; ++i)
			
 
				-			for (j = 0; j < mNumAudioChannels; j++, ++dataptr)
			
 
				-				*dataptr = buffer[j][i];
			
 
				-	}
			
 
				-		
			
 
				-	_addAudioPacket(data, numSamples * mNumAudioChannels);
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::addAudioPacket(float* buffer, int numSamples, float gain)
			
 
				-{
			
 
				-	float* data = new float[numSamples * mNumAudioChannels];
			
 
				-	float* dataptr = data;
			
 
				-	int i, numFloats = numSamples * mNumAudioChannels;
			
 
				-	
			
 
				-	if (gain < 1.0f)
			
 
				-	{
			
 
				-		// apply gain, let's attenuate the samples
			
 
				-		for (i = 0; i < numFloats; ++i, dataptr++)
			
 
				-			*dataptr = buffer[i] * gain;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		// do a simple copy, faster then the above method, when gain is 1.0f
			
 
				-		for (i = 0; i < numFloats; ++i, dataptr++)
			
 
				-			*dataptr = buffer[i];
			
 
				-	}
			
 
				-	
			
 
				-	_addAudioPacket(data, numFloats);
			
 
				-}
			
 
				-
			
 
				-TheoraAudioPacket* TheoraAudioPacketQueue::popAudioPacket()
			
 
				-{
			
 
				-	if (mTheoraAudioPacketQueue == NULL) return NULL;
			
 
				-	TheoraAudioPacket* p = mTheoraAudioPacketQueue;
			
 
				-	mTheoraAudioPacketQueue = mTheoraAudioPacketQueue->next;
			
 
				-	return p;
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::destroyAudioPacket(TheoraAudioPacket* p)
			
 
				-{
			
 
				-	if (p == NULL) return;
			
 
				-	delete [] p->pcm;
			
 
				-	delete p;
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::destroyAllAudioPackets()
			
 
				-{
			
 
				-	for (TheoraAudioPacket* p = popAudioPacket(); p != NULL; p = popAudioPacket())
			
 
				-		destroyAudioPacket(p);
			
 
				-}
			
 
				-
			
 
				-void TheoraAudioPacketQueue::flushAudioPackets(TheoraAudioInterface* audioInterface)
			
 
				-{
			
 
				-	
			
 
				-	for (TheoraAudioPacket* p = popAudioPacket(); p != NULL; p = popAudioPacket())
			
 
				-	{
			
 
				-		audioInterface->insertData(p->pcm, p->numSamples);
			
 
				-		destroyAudioPacket(p);
			
 
				-	}
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraDataSource.cpp
+++ b/drivers/theoraplayer/src/TheoraDataSource.cpp
@@ -1,128 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include <stdio.h>
			
 
				-#include <memory.h>
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraException.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-
			
 
				-TheoraDataSource::~TheoraDataSource()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-TheoraFileDataSource::TheoraFileDataSource(std::string filename)
			
 
				-{
			
 
				-	mFilename = filename;
			
 
				-	mFilePtr = NULL;
			
 
				-}
			
 
				-
			
 
				-TheoraFileDataSource::~TheoraFileDataSource()
			
 
				-{
			
 
				-	if (mFilePtr)
			
 
				-	{
			
 
				-		fclose(mFilePtr);
			
 
				-		mFilePtr = NULL;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void TheoraFileDataSource::openFile()
			
 
				-{
			
 
				-	if (mFilePtr == NULL)
			
 
				-	{
			
 
				-		mFilePtr=fopen(mFilename.c_str(), "rb");
			
 
				-		if (!mFilePtr)
			
 
				-        {
			
 
				-            std::string msg = "Can't open video file: " + mFilename;
			
 
				-            th_writelog(msg);
			
 
				-            throw TheoraGenericException(msg);
			
 
				-        }
			
 
				-		fseek(mFilePtr, 0, SEEK_END);
			
 
				-		mSize = ftell(mFilePtr);
			
 
				-		fseek(mFilePtr, 0, SEEK_SET);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int TheoraFileDataSource::read(void* output, int nBytes)
			
 
				-{
			
 
				-	if (mFilePtr == NULL) openFile();
			
 
				-	size_t n = fread(output, 1, nBytes, mFilePtr);
			
 
				-	return (int) n;
			
 
				-}
			
 
				-
			
 
				-void TheoraFileDataSource::seek(unsigned long byte_index)
			
 
				-{
			
 
				-	if (mFilePtr == NULL) openFile();
			
 
				-	fseek(mFilePtr, byte_index, SEEK_SET);
			
 
				-}
			
 
				-
			
 
				-unsigned long TheoraFileDataSource::size()
			
 
				-{
			
 
				-	if (mFilePtr == NULL) openFile();
			
 
				-	return mSize;
			
 
				-}
			
 
				-
			
 
				-unsigned long TheoraFileDataSource::tell()
			
 
				-{
			
 
				-	if (mFilePtr == NULL) return 0;
			
 
				-	return ftell(mFilePtr);
			
 
				-}
			
 
				-
			
 
				-TheoraMemoryFileDataSource::TheoraMemoryFileDataSource(std::string filename) :
			
 
				-	mReadPointer(0),
			
 
				-	mData(0)
			
 
				-{
			
 
				-	mFilename=filename;
			
 
				-	FILE* f=fopen(filename.c_str(),"rb");
			
 
				-	if (!f) throw TheoraGenericException("Can't open video file: "+filename);
			
 
				-	fseek(f,0,SEEK_END);
			
 
				-	mSize=ftell(f);
			
 
				-	fseek(f,0,SEEK_SET);
			
 
				-	mData=new unsigned char[mSize];
			
 
				-	fread(mData,1,mSize,f);
			
 
				-	fclose(f);
			
 
				-}
			
 
				-
			
 
				-TheoraMemoryFileDataSource::TheoraMemoryFileDataSource(unsigned char* data, long size, const std::string& filename)
			
 
				-{
			
 
				-	mFilename = filename;
			
 
				-	mData = data;
			
 
				-	mSize = size;
			
 
				-	mReadPointer = 0;
			
 
				-}
			
 
				-
			
 
				-TheoraMemoryFileDataSource::~TheoraMemoryFileDataSource()
			
 
				-{
			
 
				-	if (mData) delete [] mData;
			
 
				-}
			
 
				-
			
 
				-int TheoraMemoryFileDataSource::read(void* output, int nBytes)
			
 
				-{
			
 
				-	int n = (int) ((mReadPointer+nBytes <= mSize) ? nBytes : mSize - mReadPointer);
			
 
				-	if (!n) return 0;
			
 
				-	memcpy(output, mData + mReadPointer, n);
			
 
				-	mReadPointer += n;
			
 
				-	return n;
			
 
				-}
			
 
				-
			
 
				-void TheoraMemoryFileDataSource::seek(unsigned long byte_index)
			
 
				-{
			
 
				-	mReadPointer=byte_index;
			
 
				-}
			
 
				-
			
 
				-unsigned long TheoraMemoryFileDataSource::size()
			
 
				-{
			
 
				-	return mSize;
			
 
				-}
			
 
				-
			
 
				-unsigned long TheoraMemoryFileDataSource::tell()
			
 
				-{
			
 
				-	return mReadPointer;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraException.cpp
+++ b/drivers/theoraplayer/src/TheoraException.cpp
@@ -1,37 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraException.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include <stdio.h>
			
 
				-
			
 
				-_TheoraGenericException::_TheoraGenericException(const std::string& errorText, std::string type, std::string file, int line)
			
 
				-{
			
 
				-    mErrText = errorText;
			
 
				-	int src = (int) file.find("src");
			
 
				-	if (src >= 0) file = file.substr(src + 4, 1000);
			
 
				-	mLineNumber = line;
			
 
				-	mFile = file;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-std::string _TheoraGenericException::repr()
			
 
				-{
			
 
				-	std::string text = getType();
			
 
				-	if (text != "") text += ": ";
			
 
				-
			
 
				-	if (mFile != "") text += "[" + mFile + ":" + str(mLineNumber) + "] - ";
			
 
				-
			
 
				-	return text + getErrorText();
			
 
				-}
			
 
				-
			
 
				-void _TheoraGenericException::writeOutput()
			
 
				-{
			
 
				-	th_writelog("----------------\nException Error!\n\n" + repr() + "\n----------------");
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraFrameQueue.cpp
+++ b/drivers/theoraplayer/src/TheoraFrameQueue.cpp
@@ -1,174 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-
			
 
				-
			
 
				-TheoraFrameQueue::TheoraFrameQueue(TheoraVideoClip* parent)
			
 
				-{
			
 
				-	mParent = parent;
			
 
				-}
			
 
				-
			
 
				-TheoraFrameQueue::~TheoraFrameQueue()
			
 
				-{
			
 
				-	foreach_l(TheoraVideoFrame*, mQueue)
			
 
				-    {
			
 
				-		delete (*it);
			
 
				-    }
			
 
				-	mQueue.clear();
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame* TheoraFrameQueue::createFrameInstance(TheoraVideoClip* clip)
			
 
				-{
			
 
				-	TheoraVideoFrame* frame = new TheoraVideoFrame(clip);
			
 
				-	if (frame->getBuffer() == NULL) // This can happen if you run out of memory
			
 
				-	{
			
 
				-		delete frame;
			
 
				-		return NULL;
			
 
				-	}
			
 
				-	return frame;
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::setSize(int n)
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-	if (mQueue.size() > 0)
			
 
				-	{
			
 
				-		foreach_l (TheoraVideoFrame*, mQueue)
			
 
				-        {
			
 
				-			delete (*it);
			
 
				-        }
			
 
				-		mQueue.clear();
			
 
				-	}
			
 
				-	TheoraVideoFrame* frame;
			
 
				-	for (int i = 0;i < n; ++i)
			
 
				-	{
			
 
				-		frame = createFrameInstance(mParent);
			
 
				-		if (frame != NULL) mQueue.push_back(frame);
			
 
				-		else
			
 
				-		{
			
 
				-			TheoraVideoManager::getSingleton().logMessage("TheoraFrameQueue: unable to create " + str(n) + " frames, out of memory. Created " + str((int) mQueue.size()) + " frames.");
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-	mMutex.unlock();
			
 
				-}
			
 
				-
			
 
				-int TheoraFrameQueue::getSize()
			
 
				-{
			
 
				-	return (int) mQueue.size();
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame* TheoraFrameQueue::_getFirstAvailableFrame()
			
 
				-{
			
 
				-	TheoraVideoFrame* frame = mQueue.front();
			
 
				-	if (frame->mReady) return frame;
			
 
				-	else               return NULL;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame* TheoraFrameQueue::getFirstAvailableFrame()
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-	TheoraVideoFrame* frame = _getFirstAvailableFrame();
			
 
				-	mMutex.unlock();
			
 
				-	return frame;
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::clear()
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-	foreach_l (TheoraVideoFrame*, mQueue)
			
 
				-		(*it)->clear();
			
 
				-	mMutex.unlock();
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::_pop(int n)
			
 
				-{
			
 
				-    for (int i = 0; i < n; ++i)
			
 
				-    {
			
 
				-        TheoraVideoFrame* first = mQueue.front();
			
 
				-        first->clear();
			
 
				-        mQueue.pop_front();
			
 
				-        mQueue.push_back(first);
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::pop(int n)
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-    _pop(n);
			
 
				-	mMutex.unlock();
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame* TheoraFrameQueue::requestEmptyFrame()
			
 
				-{
			
 
				-	TheoraVideoFrame* frame = NULL;
			
 
				-	mMutex.lock();
			
 
				-	foreach_l (TheoraVideoFrame*, mQueue)
			
 
				-	{
			
 
				-		if (!(*it)->mInUse)
			
 
				-		{
			
 
				-			(*it)->mInUse = 1;
			
 
				-			(*it)->mReady = 0;
			
 
				-			frame = (*it);
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-	mMutex.unlock();
			
 
				-	return frame;
			
 
				-}
			
 
				-
			
 
				-int TheoraFrameQueue::getUsedCount()
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-	int n=0;
			
 
				-	foreach_l(TheoraVideoFrame*,mQueue)
			
 
				-		if ((*it)->mInUse) ++n;
			
 
				-	mMutex.unlock();
			
 
				-	return n;
			
 
				-}
			
 
				-
			
 
				-int TheoraFrameQueue::_getReadyCount()
			
 
				-{
			
 
				-	int n = 0;
			
 
				-	foreach_l (TheoraVideoFrame*, mQueue)
			
 
				-    if ((*it)->mReady) ++n;
			
 
				-	return n;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int TheoraFrameQueue::getReadyCount()
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-	int n = _getReadyCount();
			
 
				-	mMutex.unlock();
			
 
				-	return n;
			
 
				-}
			
 
				-
			
 
				-bool TheoraFrameQueue::isFull()
			
 
				-{
			
 
				-	return getReadyCount() == mQueue.size();
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::lock()
			
 
				-{
			
 
				-	mMutex.lock();
			
 
				-}
			
 
				-
			
 
				-void TheoraFrameQueue::unlock()
			
 
				-{
			
 
				-	mMutex.unlock();
			
 
				-}
			
 
				-
			
 
				-std::list<TheoraVideoFrame*>& TheoraFrameQueue::_getFrameQueue()
			
 
				-{
			
 
				-    return mQueue;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraTimer.cpp
+++ b/drivers/theoraplayer/src/TheoraTimer.cpp
@@ -1,70 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraTimer.h"
			
 
				-
			
 
				-TheoraTimer::TheoraTimer()
			
 
				-{
			
 
				-	mTime = 0;
			
 
				-	mPaused = 0;
			
 
				-    mSpeed = 1.0f;
			
 
				-}
			
 
				-
			
 
				-TheoraTimer::~TheoraTimer()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::update(float timeDelta)
			
 
				-{
			
 
				-	if (!isPaused())
			
 
				-	{
			
 
				-		mTime += timeDelta * mSpeed;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-float TheoraTimer::getTime()
			
 
				-{
			
 
				-	return mTime;
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::pause()
			
 
				-{
			
 
				-	mPaused = true;
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::play()
			
 
				-{
			
 
				-	mPaused = false;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-bool TheoraTimer::isPaused()
			
 
				-{
			
 
				-	return mPaused;
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::stop()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::seek(float time)
			
 
				-{
			
 
				-	mTime = time;
			
 
				-}
			
 
				-
			
 
				-void TheoraTimer::setSpeed(float speed)
			
 
				-{
			
 
				-    mSpeed = speed;
			
 
				-}
			
 
				-
			
 
				-float TheoraTimer::getSpeed()
			
 
				-{
			
 
				-    return mSpeed;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraUtil.cpp
+++ b/drivers/theoraplayer/src/TheoraUtil.cpp
@@ -1,59 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include <stdio.h>
			
 
				-#include <algorithm>
			
 
				-#include <math.h>
			
 
				-#include <map>
			
 
				-#ifndef _WIN32
			
 
				-#include <unistd.h>
			
 
				-#include <pthread.h>
			
 
				-#endif
			
 
				-
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraException.h"
			
 
				-
			
 
				-#ifdef _WIN32
			
 
				-#include <windows.h>
			
 
				-#pragma warning( disable: 4996 ) // MSVC++
			
 
				-#endif
			
 
				-
			
 
				-std::string str(int i)
			
 
				-{
			
 
				-    char s[32];
			
 
				-    sprintf(s, "%d", i);
			
 
				-    return std::string(s);
			
 
				-}
			
 
				-
			
 
				-std::string strf(float i)
			
 
				-{
			
 
				-    char s[32];
			
 
				-    sprintf(s, "%.3f", i);
			
 
				-    return std::string(s);
			
 
				-}
			
 
				-
			
 
				-void _psleep(int miliseconds)
			
 
				-{
			
 
				-#ifdef _WIN32
			
 
				-#ifndef _WINRT
			
 
				-	Sleep(miliseconds);
			
 
				-#else
			
 
				-	WaitForSingleObjectEx(GetCurrentThread(), miliseconds, 0);
			
 
				-#endif
			
 
				-#else
			
 
				-	usleep(miliseconds * 1000);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int _nextPow2(int x)
			
 
				-{
			
 
				-	int y;
			
 
				-	for (y = 1; y < x; y *= 2);
			
 
				-	return y;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraVideoClip.cpp
+++ b/drivers/theoraplayer/src/TheoraVideoClip.cpp
@@ -1,496 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-#include "TheoraTimer.h"
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraException.h"
			
 
				-
			
 
				-#include "core/os/memory.h"
			
 
				-
			
 
				-TheoraVideoClip::TheoraVideoClip(TheoraDataSource* data_source,
			
 
				-								 TheoraOutputMode output_mode,
			
 
				-								 int nPrecachedFrames,
			
 
				-								 bool usePower2Stride):
			
 
				-	mAudioInterface(NULL),
			
 
				-	mNumDroppedFrames(0),
			
 
				-	mNumDisplayedFrames(0),
			
 
				-	mSeekFrame(-1),
			
 
				-	mDuration(-1),
			
 
				-	mNumFrames(-1),
			
 
				-	mFPS(1),
			
 
				-	mUseAlpha(0),
			
 
				-	mFrameDuration(0),
			
 
				-	mName(data_source->repr()),
			
 
				-	mStride(usePower2Stride),
			
 
				-	mSubFrameWidth(0),
			
 
				-	mSubFrameHeight(0),
			
 
				-	mSubFrameOffsetX(0),
			
 
				-	mSubFrameOffsetY(0),
			
 
				-	mAudioGain(1),
			
 
				-	mRequestedOutputMode(output_mode),
			
 
				-	mAutoRestart(0),
			
 
				-	mEndOfFile(0),
			
 
				-	mRestarted(0),
			
 
				-	mIteration(0),
			
 
				-    mPlaybackIteration(0),
			
 
				-	mStream(0),
			
 
				-	mThreadAccessCount(0),
			
 
				-	mPriority(1),
			
 
				-	mFirstFrameDisplayed(0),
			
 
				-	mWaitingForCache(false),
			
 
				-	mOutputMode(TH_UNDEFINED)
			
 
				-{
			
 
				-
			
 
				-	audio_track=0;
			
 
				-	mAudioMutex = NULL;
			
 
				-	mThreadAccessMutex = new TheoraMutex();
			
 
				-	mTimer = mDefaultTimer = new TheoraTimer();
			
 
				-
			
 
				-	mFrameQueue = NULL;
			
 
				-	mAssignedWorkerThread = NULL;
			
 
				-	mNumPrecachedFrames = nPrecachedFrames;
			
 
				-	setOutputMode(output_mode);
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip::~TheoraVideoClip()
			
 
				-{
			
 
				-	// wait untill a worker thread is done decoding the frame
			
 
				-	mThreadAccessMutex->lock();
			
 
				-
			
 
				-	delete mDefaultTimer;
			
 
				-
			
 
				-	if (mStream) memdelete(mStream);
			
 
				-
			
 
				-	if (mFrameQueue) delete mFrameQueue;
			
 
				-
			
 
				-	if (mAudioInterface)
			
 
				-	{
			
 
				-		mAudioMutex->lock(); // ensure a thread isn't using this mutex
			
 
				-		delete mAudioInterface; // notify audio interface it's time to call it a day
			
 
				-		mAudioMutex ->unlock();
			
 
				-		delete mAudioMutex;
			
 
				-	}
			
 
				-	
			
 
				-	mThreadAccessMutex->unlock();
			
 
				-
			
 
				-	delete mThreadAccessMutex;
			
 
				-}
			
 
				-
			
 
				-TheoraTimer* TheoraVideoClip::getTimer()
			
 
				-{
			
 
				-	return mTimer;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setTimer(TheoraTimer* timer)
			
 
				-{
			
 
				-	if (!timer) mTimer = mDefaultTimer;
			
 
				-	else mTimer = timer;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::resetFrameQueue()
			
 
				-{
			
 
				-	mFrameQueue->clear();
			
 
				-    mPlaybackIteration = mIteration = 0;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::restart()
			
 
				-{
			
 
				-	mEndOfFile = true; //temp, to prevent threads to decode while restarting
			
 
				-	mThreadAccessMutex->lock();
			
 
				-	_restart();
			
 
				-	mTimer->seek(0);
			
 
				-	mFirstFrameDisplayed = false;
			
 
				-    resetFrameQueue();
			
 
				-	mEndOfFile = false;
			
 
				-	mRestarted = false;
			
 
				-	mSeekFrame = -1;
			
 
				-	mThreadAccessMutex->unlock();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::update(float timeDelta)
			
 
				-{
			
 
				-	if (mTimer->isPaused())
			
 
				-	{
			
 
				-		mTimer->update(0); // update timer in case there is some code that needs to execute each frame
			
 
				-		return;
			
 
				-	}
			
 
				-	float time = mTimer->getTime(), speed = mTimer->getSpeed();
			
 
				-    if (time + timeDelta * speed >= mDuration)
			
 
				-    {
			
 
				-        if (mAutoRestart && mRestarted)
			
 
				-        {
			
 
				-            float seekTime = time + timeDelta * speed;
			
 
				-            for (;seekTime >= mDuration;)
			
 
				-            {
			
 
				-                seekTime -= mDuration;
			
 
				-                ++mPlaybackIteration;
			
 
				-            }
			
 
				-
			
 
				-            mTimer->seek(seekTime);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            if (time != mDuration)
			
 
				-            {
			
 
				-                mTimer->update((mDuration - time) / speed);
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				-    else
			
 
				-    {
			
 
				-        mTimer->update(timeDelta);
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::updateToNextFrame()
			
 
				-{
			
 
				-	TheoraVideoFrame* f = mFrameQueue->getFirstAvailableFrame();
			
 
				-	if (!f) return 0;
			
 
				-
			
 
				-	float time = f->mTimeToDisplay - mTimer->getTime();
			
 
				-	update(time);
			
 
				-	return time;
			
 
				-}
			
 
				-
			
 
				-TheoraFrameQueue* TheoraVideoClip::getFrameQueue()
			
 
				-{
			
 
				-	return mFrameQueue;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::popFrame()
			
 
				-{
			
 
				-	++mNumDisplayedFrames;
			
 
				-	
			
 
				-    // after transfering frame data to the texture, free the frame
			
 
				-	// so it can be used again
			
 
				-	if (!mFirstFrameDisplayed)
			
 
				-	{
			
 
				-		mFrameQueue->lock();
			
 
				-		mFrameQueue->_pop(1);
			
 
				-		mFirstFrameDisplayed = true;
			
 
				-		mFrameQueue->unlock();
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		mFrameQueue->pop();
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getWidth()
			
 
				-{
			
 
				-	return mUseAlpha ? mWidth / 2 : mWidth;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getHeight()
			
 
				-{
			
 
				-	return mHeight;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getSubFrameWidth()
			
 
				-{
			
 
				-	return mUseAlpha ? mWidth / 2 : mSubFrameWidth;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getSubFrameHeight()
			
 
				-{
			
 
				-	return mUseAlpha ? mHeight : mSubFrameHeight;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getSubFrameOffsetX()
			
 
				-{
			
 
				-	return mUseAlpha ? 0 : mSubFrameOffsetX;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getSubFrameOffsetY()
			
 
				-{
			
 
				-	return mUseAlpha ? 0 : mSubFrameOffsetY;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getAbsPlaybackTime()
			
 
				-{
			
 
				-    return mTimer->getTime() + mPlaybackIteration * mDuration;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::discardOutdatedFrames(float absTime)
			
 
				-{
			
 
				-    int nReady = mFrameQueue->_getReadyCount();
			
 
				-    // only drop frames if you have more frames to show. otherwise even the late frame will do..
			
 
				-    if (nReady == 1) return 0;
			
 
				-    float time = absTime;
			
 
				-
			
 
				-    int nPop = 0;
			
 
				-    TheoraVideoFrame* frame;
			
 
				-    float timeToDisplay;
			
 
				-    
			
 
				-    std::list<TheoraVideoFrame*>& queue = mFrameQueue->_getFrameQueue();
			
 
				-    foreach_l (TheoraVideoFrame*, queue)
			
 
				-    {
			
 
				-        frame = *it;
			
 
				-        if (!frame->mReady) break;
			
 
				-        timeToDisplay = frame->mTimeToDisplay + frame->mIteration * mDuration;
			
 
				-        if (time > timeToDisplay + mFrameDuration)
			
 
				-        {
			
 
				-            ++nPop;
			
 
				-            if (nReady - nPop == 1) break; // always leave at least one in the queue
			
 
				-        }
			
 
				-        else break;
			
 
				-    }
			
 
				-    
			
 
				-	if (nPop > 0)
			
 
				-    {
			
 
				-#define _DEBUG
			
 
				-#ifdef _DEBUG
			
 
				-        std::string log = getName() + ": dropped frame ";
			
 
				-    
			
 
				-        int i = nPop;
			
 
				-        foreach_l (TheoraVideoFrame*, queue)
			
 
				-        {
			
 
				-            log += str((int) (*it)->getFrameNumber());
			
 
				-            if (i-- > 1)
			
 
				-            {
			
 
				-                log += ", ";
			
 
				-            }
			
 
				-            else break;
			
 
				-        }
			
 
				-        th_writelog(log);
			
 
				-#endif
			
 
				-        mNumDroppedFrames += nPop;
			
 
				-        mFrameQueue->_pop(nPop);
			
 
				-	}
			
 
				-    
			
 
				-    return nPop;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame* TheoraVideoClip::getNextFrame()
			
 
				-{
			
 
				-	TheoraVideoFrame* frame;
			
 
				-    // if we are about to seek, then the current frame queue is invalidated
			
 
				-	// (will be cleared when a worker thread does the actual seek)
			
 
				-    if (mSeekFrame != -1) return NULL;
			
 
				-
			
 
				-    mFrameQueue->lock();
			
 
				-	float time = getAbsPlaybackTime();
			
 
				-    discardOutdatedFrames(time);
			
 
				-    
			
 
				-    frame = mFrameQueue->_getFirstAvailableFrame();
			
 
				-    if (frame != NULL)
			
 
				-    {
			
 
				-        if (frame->mTimeToDisplay + frame->mIteration * mDuration > time && mFirstFrameDisplayed)
			
 
				-        {
			
 
				-            frame = NULL; // frame is ready but it's not yet time to display it, except when we haven't displayed any frames yet
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    mFrameQueue->unlock();
			
 
				-	return frame;
			
 
				-}
			
 
				-
			
 
				-std::string TheoraVideoClip::getName()
			
 
				-{
			
 
				-	return mName;
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip::isBusy()
			
 
				-{
			
 
				-	return mAssignedWorkerThread || mOutputMode != mRequestedOutputMode;
			
 
				-}
			
 
				-
			
 
				-TheoraOutputMode TheoraVideoClip::getOutputMode()
			
 
				-{
			
 
				-	return mOutputMode;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setOutputMode(TheoraOutputMode mode)
			
 
				-{
			
 
				-	if (mode == TH_UNDEFINED) throw TheoraGenericException("Invalid output mode: TH_UNDEFINED for video: " + mName);
			
 
				-	if (mOutputMode == mode) return;
			
 
				-	mRequestedOutputMode = mode;
			
 
				-	mUseAlpha = (mode == TH_RGBA   ||
			
 
				-				 mode == TH_ARGB   ||
			
 
				-				 mode == TH_BGRA   ||
			
 
				-				 mode == TH_ABGR   ||
			
 
				-				 mode == TH_GREY3A ||
			
 
				-				 mode == TH_AGREY3 ||
			
 
				-				 mode == TH_YUVA   ||
			
 
				-				 mode == TH_AYUV);
			
 
				-	if (mAssignedWorkerThread)
			
 
				-	{
			
 
				-		mThreadAccessMutex->lock();
			
 
				-		// discard current frames and recreate them
			
 
				-		mFrameQueue->setSize(mFrameQueue->getSize());
			
 
				-		mThreadAccessMutex->unlock();
			
 
				-
			
 
				-	}
			
 
				-	mOutputMode = mRequestedOutputMode;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getTimePosition()
			
 
				-{
			
 
				-	return mTimer->getTime();
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getNumPrecachedFrames()
			
 
				-{
			
 
				-	return mFrameQueue->getSize();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setNumPrecachedFrames(int n)
			
 
				-{
			
 
				-	if (mFrameQueue->getSize() != n)
			
 
				-		mFrameQueue->setSize(n);
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::_getNumReadyFrames()
			
 
				-{
			
 
				-	if (mSeekFrame != -1) return 0;
			
 
				-	return mFrameQueue->_getReadyCount();
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoClip::getNumReadyFrames()
			
 
				-{
			
 
				-	if (mSeekFrame != -1) return 0; // we are about to seek, consider frame queue empty even though it will be emptied upon seek
			
 
				-	return mFrameQueue->getReadyCount();
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getDuration()
			
 
				-{
			
 
				-	return mDuration;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getFPS()
			
 
				-{
			
 
				-	return mFPS;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::play()
			
 
				-{
			
 
				-	mTimer->play();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::pause()
			
 
				-{
			
 
				-	mTimer->pause();
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip::isPaused()
			
 
				-{
			
 
				-	return mTimer->isPaused();
			
 
				-}
			
 
				-
			
 
				-bool TheoraVideoClip::isDone()
			
 
				-{
			
 
				-	return mEndOfFile && !mFrameQueue->getFirstAvailableFrame();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::stop()
			
 
				-{
			
 
				-	pause();
			
 
				-    resetFrameQueue();
			
 
				-	mFirstFrameDisplayed = false;
			
 
				-	seek(0);
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setPlaybackSpeed(float speed)
			
 
				-{
			
 
				-	mTimer->setSpeed(speed);
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getPlaybackSpeed()
			
 
				-{
			
 
				-	return mTimer->getSpeed();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::seek(float time)
			
 
				-{
			
 
				-	seekToFrame((int) (time * getFPS()));
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::seekToFrame(int frame)
			
 
				-{
			
 
				-	if      (frame < 0)          mSeekFrame = 0;
			
 
				-	else if (frame > mNumFrames) mSeekFrame = mNumFrames;
			
 
				-	else                         mSeekFrame = frame;
			
 
				-
			
 
				-	mFirstFrameDisplayed = false;
			
 
				-	mEndOfFile = false;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::waitForCache(float desired_cache_factor, float max_wait_time)
			
 
				-{
			
 
				-	mWaitingForCache = true;
			
 
				-	bool paused = mTimer->isPaused();
			
 
				-	if (!paused) mTimer->pause();
			
 
				-	int elapsed = 0;
			
 
				-	int desired_num_precached_frames = (int) (desired_cache_factor * getNumPrecachedFrames());
			
 
				-	while (getNumReadyFrames() < desired_num_precached_frames)
			
 
				-	{
			
 
				-		_psleep(10);
			
 
				-		elapsed += 10;
			
 
				-		if (elapsed >= max_wait_time * 1000) break;
			
 
				-	}
			
 
				-	if (!paused) mTimer->play();
			
 
				-	mWaitingForCache = false;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getPriority()
			
 
				-{
			
 
				-	return mPriority;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setPriority(float priority)
			
 
				-{
			
 
				-	mPriority = priority;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getPriorityIndex()
			
 
				-{
			
 
				-	float priority = (float) getNumReadyFrames();
			
 
				-	if (mTimer->isPaused()) priority += getNumPrecachedFrames() / 2;
			
 
				-	
			
 
				-	return priority;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setAudioInterface(TheoraAudioInterface* iface)
			
 
				-{
			
 
				-	mAudioInterface = iface;
			
 
				-	if (iface && !mAudioMutex) mAudioMutex = new TheoraMutex;
			
 
				-	if (!iface && mAudioMutex)
			
 
				-	{
			
 
				-		delete mAudioMutex;
			
 
				-		mAudioMutex = NULL;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-TheoraAudioInterface* TheoraVideoClip::getAudioInterface()
			
 
				-{
			
 
				-	return mAudioInterface;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setAudioGain(float gain)
			
 
				-{
			
 
				-	if (gain > 1) mAudioGain=1;
			
 
				-	if (gain < 0) mAudioGain=0;
			
 
				-	else          mAudioGain=gain;
			
 
				-}
			
 
				-
			
 
				-float TheoraVideoClip::getAudioGain()
			
 
				-{
			
 
				-	return mAudioGain;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoClip::setAutoRestart(bool value)
			
 
				-{
			
 
				-	mAutoRestart = value;
			
 
				-	if (value) mEndOfFile = false;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraVideoFrame.cpp
+++ b/drivers/theoraplayer/src/TheoraVideoFrame.cpp
@@ -1,159 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include <memory.h>
			
 
				-#include "TheoraPixelTransform.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#include "TheoraVideoFrame.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-
			
 
				-//#define YUV_TEST // uncomment this if you want to benchmark YUV decoding functions
			
 
				-
			
 
				-extern "C"
			
 
				-{
			
 
				-void decodeRGB  (struct TheoraPixelTransform* t);
			
 
				-void decodeRGBA (struct TheoraPixelTransform* t);
			
 
				-void decodeRGBX (struct TheoraPixelTransform* t);
			
 
				-void decodeARGB (struct TheoraPixelTransform* t);
			
 
				-void decodeXRGB (struct TheoraPixelTransform* t);
			
 
				-void decodeBGR  (struct TheoraPixelTransform* t);
			
 
				-void decodeBGRA (struct TheoraPixelTransform* t);
			
 
				-void decodeBGRX (struct TheoraPixelTransform* t);
			
 
				-void decodeABGR (struct TheoraPixelTransform* t);
			
 
				-void decodeXBGR (struct TheoraPixelTransform* t);
			
 
				-void decodeGrey (struct TheoraPixelTransform* t);
			
 
				-void decodeGrey3(struct TheoraPixelTransform* t);
			
 
				-void decodeGreyA(struct TheoraPixelTransform* t);
			
 
				-void decodeGreyX(struct TheoraPixelTransform* t);
			
 
				-void decodeAGrey(struct TheoraPixelTransform* t);
			
 
				-void decodeXGrey(struct TheoraPixelTransform* t);
			
 
				-void decodeYUV  (struct TheoraPixelTransform* t);
			
 
				-void decodeYUVA (struct TheoraPixelTransform* t);
			
 
				-void decodeYUVX (struct TheoraPixelTransform* t);
			
 
				-void decodeAYUV (struct TheoraPixelTransform* t);
			
 
				-void decodeXYUV (struct TheoraPixelTransform* t);
			
 
				-}
			
 
				-
			
 
				-static void (*conversion_functions[])(struct TheoraPixelTransform*) = {0,
			
 
				-	decodeRGB,
			
 
				-	decodeRGBA,
			
 
				-	decodeRGBX,
			
 
				-	decodeARGB,
			
 
				-	decodeXRGB,
			
 
				-	decodeBGR,
			
 
				-	decodeBGRA,
			
 
				-	decodeBGRX,
			
 
				-	decodeABGR,
			
 
				-	decodeXBGR,
			
 
				-	decodeGrey,
			
 
				-	decodeGrey3,
			
 
				-	decodeGreyA,
			
 
				-	decodeGreyX,
			
 
				-	decodeAGrey,
			
 
				-	decodeXGrey,
			
 
				-	decodeYUV,
			
 
				-	decodeYUVA,
			
 
				-	decodeYUVX,
			
 
				-	decodeAYUV,
			
 
				-	decodeXYUV
			
 
				-};
			
 
				-
			
 
				-TheoraVideoFrame::TheoraVideoFrame(TheoraVideoClip* parent)
			
 
				-{
			
 
				-	mReady = mInUse = false;
			
 
				-	mParent = parent;
			
 
				-	mIteration = 0;
			
 
				-	// number of bytes based on output mode
			
 
				-	int bytemap[]={0, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 1, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4};
			
 
				-	mBpp = bytemap[mParent->getOutputMode()];
			
 
				-	unsigned int size = mParent->getStride() * mParent->mHeight * mBpp;
			
 
				-	try
			
 
				-	{
			
 
				-		mBuffer = new unsigned char[size];
			
 
				-	}
			
 
				-	catch (std::bad_alloc)
			
 
				-	{
			
 
				-		mBuffer = NULL;
			
 
				-		return;
			
 
				-	}
			
 
				-	memset(mBuffer, 255, size);
			
 
				-}
			
 
				-
			
 
				-TheoraVideoFrame::~TheoraVideoFrame()
			
 
				-{
			
 
				-	if (mBuffer) delete [] mBuffer;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoFrame::getWidth()
			
 
				-{
			
 
				-	return mParent->getWidth();
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoFrame::getStride()
			
 
				-{
			
 
				-	return mParent->mStride;
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoFrame::getHeight()
			
 
				-{
			
 
				-	return mParent->getHeight();
			
 
				-}
			
 
				-
			
 
				-unsigned char* TheoraVideoFrame::getBuffer()
			
 
				-{
			
 
				-	return mBuffer;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoFrame::decode(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	if (t->raw != NULL)
			
 
				-	{
			
 
				-		int bufferStride = mParent->getWidth() * mBpp;
			
 
				-		if (bufferStride == t->rawStride)
			
 
				-		{
			
 
				-			memcpy(mBuffer, t->raw, t->rawStride * mParent->getHeight());
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			unsigned char *buff = mBuffer, *src = t->raw;
			
 
				-			int i, h = mParent->getHeight();
			
 
				-			for (i = 0; i < h; ++i, buff += bufferStride, src += t->rawStride)
			
 
				-			{
			
 
				-				memcpy(buff, src, bufferStride);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		t->out = mBuffer;
			
 
				-		t->w = mParent->getWidth();
			
 
				-		t->h = mParent->getHeight();
			
 
				-        
			
 
				-#ifdef YUV_TEST // when benchmarking yuv conversion functions during development, do a timed average
			
 
				-        #define N 1000
			
 
				-        clock_t time = clock();
			
 
				-        for (int i = 0; i < N; ++i)
			
 
				-        {
			
 
				-            conversion_functions[mParent->getOutputMode()](t);
			
 
				-        }
			
 
				-        float diff = (clock() - time) * 1000.0f / CLOCKS_PER_SEC;
			
 
				-        
			
 
				-		char s[128];
			
 
				-		sprintf(s, "%.2f", diff / N);
			
 
				-        TheoraVideoManager::getSingleton().logMessage("YUV Decoding time: " + std::string(s) + " ms\n");
			
 
				-#else
			
 
				-		conversion_functions[mParent->getOutputMode()](t);
			
 
				-#endif
			
 
				-	}
			
 
				-	mReady = true;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoFrame::clear()
			
 
				-{
			
 
				-	mInUse = mReady = false;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraVideoManager.cpp
+++ b/drivers/theoraplayer/src/TheoraVideoManager.cpp
@@ -1,485 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraWorkerThread.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#include "TheoraFrameQueue.h"
			
 
				-#include "TheoraAudioInterface.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-#include "TheoraDataSource.h"
			
 
				-#include "TheoraException.h"
			
 
				-#ifdef __THEORA
			
 
				-	#include <theora/codec.h>
			
 
				-	#include <vorbis/codec.h>
			
 
				-	#include "TheoraVideoClip_Theora.h"
			
 
				-#endif
			
 
				-#ifdef __AVFOUNDATION
			
 
				-	#include "TheoraVideoClip_AVFoundation.h"
			
 
				-#endif
			
 
				-#ifdef __FFMPEG
			
 
				-	#include "TheoraVideoClip_FFmpeg.h"
			
 
				-#endif
			
 
				-#ifdef _ANDROID //libtheoraplayer addition for cpu feature detection
			
 
				-	#include "cpu-features.h"
			
 
				-#endif
			
 
				-// declaring function prototype here so I don't have to put it in a header file
			
 
				-// it only needs to be used by this plugin and called once
			
 
				-extern "C"
			
 
				-{
			
 
				-	void initYUVConversionModule();
			
 
				-}
			
 
				-
			
 
				-#include "core/os/memory.h"
			
 
				-
			
 
				-//#define _DECODING_BENCHMARK //uncomment to test average decoding time on a given device
			
 
				-
			
 
				-
			
 
				-// --------------------------
			
 
				-//#define _SCHEDULING_DEBUG
			
 
				-#ifdef _SCHEDULING_DEBUG
			
 
				-float gThreadDiagnosticTimer = 0;
			
 
				-#endif
			
 
				-// --------------------------
			
 
				-
			
 
				-#ifdef _DECODING_BENCHMARK
			
 
				-void benchmark(TheoraVideoClip* clip)
			
 
				-{
			
 
				-	int nPrecached = 256;
			
 
				-	int n = nPrecached;
			
 
				-	char msg[1024];
			
 
				-	clock_t t = clock();
			
 
				-	while (n > 0)
			
 
				-	{
			
 
				-		clip->waitForCache(1.0f, 1000000);
			
 
				-		n -= 32;
			
 
				-		clip->getFrameQueue()->clear();
			
 
				-	}
			
 
				-	float diff = ((float) (clock() - t) * 1000.0f) / CLOCKS_PER_SEC;
			
 
				-	sprintf(msg, "BENCHMARK: %s: Decoding %d frames took %.1fms (%.2fms average per frame)\n",clip->getName().c_str(), nPrecached, diff, diff / nPrecached);
			
 
				-	TheoraVideoManager::getSingleton().logMessage(msg);
			
 
				-	clip->seek(0);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-struct TheoraWorkCandidate
			
 
				-{
			
 
				-	TheoraVideoClip* clip;
			
 
				-	float priority, queuedTime, workTime, entitledTime;
			
 
				-};
			
 
				-
			
 
				-TheoraVideoManager* g_ManagerSingleton = NULL;
			
 
				-
			
 
				-void theora_writelog(std::string output)
			
 
				-{
			
 
				-	printf("%s\n", output.c_str());
			
 
				-}
			
 
				-
			
 
				-void (*g_LogFuction)(std::string) = theora_writelog;
			
 
				-
			
 
				-void TheoraVideoManager::setLogFunction(void (*fn)(std::string))
			
 
				-{
			
 
				-	g_LogFuction = fn;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoManager* TheoraVideoManager::getSingletonPtr()
			
 
				-{
			
 
				-    return g_ManagerSingleton;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoManager& TheoraVideoManager::getSingleton()
			
 
				-{  
			
 
				-    return *g_ManagerSingleton;  
			
 
				-}
			
 
				-
			
 
				-TheoraVideoManager::TheoraVideoManager(int num_worker_threads) : 
			
 
				-	mDefaultNumPrecachedFrames(8)
			
 
				-{
			
 
				-	if (num_worker_threads < 1) throw TheoraGenericException("Unable to create TheoraVideoManager, at least one worker thread is reqired");
			
 
				-
			
 
				-	g_ManagerSingleton = this;
			
 
				-
			
 
				-	std::string msg = "Initializing Theora Playback Library (" + getVersionString() + ")\n";
			
 
				-#ifdef __THEORA
			
 
				-	msg += "  - libtheora version: " + std::string(th_version_string()) + "\n" +
			
 
				-	       "  - libvorbis version: " +  std::string(vorbis_version_string()) + "\n";
			
 
				-#endif
			
 
				-#ifdef _ANDROID
			
 
				-	uint64_t features = android_getCpuFeaturesExt();
			
 
				-	char s[128];
			
 
				-	sprintf(s, "  - Android: CPU Features: %u\n", (unsigned int) features);
			
 
				-	msg += s;
			
 
				-	if ((features & ANDROID_CPU_ARM_FEATURE_NEON) == 0)
			
 
				-		msg += "  - Android: NEON features NOT SUPPORTED by CPU\n";
			
 
				-	else
			
 
				-		msg += "  - Android: Detected NEON CPU features\n";
			
 
				-#endif
			
 
				-
			
 
				-#ifdef __AVFOUNDATION
			
 
				-	msg += "  - using Apple AVFoundation classes.\n";
			
 
				-#endif
			
 
				-#ifdef __FFMPEG
			
 
				-	msg += "  - using FFmpeg library.\n";
			
 
				-#endif
			
 
				-	
			
 
				-	logMessage(msg + "------------------------------------");
			
 
				-	mAudioFactory = NULL;
			
 
				-	mWorkMutex = new TheoraMutex();
			
 
				-
			
 
				-	// for CPU based yuv2rgb decoding
			
 
				-	initYUVConversionModule();
			
 
				-
			
 
				-	createWorkerThreads(num_worker_threads);
			
 
				-}
			
 
				-
			
 
				-TheoraVideoManager::~TheoraVideoManager()
			
 
				-{
			
 
				-	destroyWorkerThreads();
			
 
				-
			
 
				-	mWorkMutex->lock();
			
 
				-	ClipList::iterator ci;
			
 
				-	for (ci = mClips.begin(); ci != mClips.end(); ++ci)
			
 
				-		delete (*ci);
			
 
				-	mClips.clear();
			
 
				-	mWorkMutex->unlock();
			
 
				-	delete mWorkMutex;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::logMessage(std::string msg)
			
 
				-{
			
 
				-	g_LogFuction(msg);
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip* TheoraVideoManager::getVideoClipByName(std::string name)
			
 
				-{
			
 
				-	TheoraVideoClip* clip = NULL;
			
 
				-	mWorkMutex->lock();
			
 
				-
			
 
				-	foreach(TheoraVideoClip*, mClips)
			
 
				-	{
			
 
				-		if ((*it)->getName() == name)
			
 
				-		{
			
 
				-			clip = *it;
			
 
				-			break;
			
 
				-		}
			
 
				-	}
			
 
				-	mWorkMutex->unlock();
			
 
				-
			
 
				-	return clip;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::setAudioInterfaceFactory(TheoraAudioInterfaceFactory* factory)
			
 
				-{
			
 
				-	mAudioFactory = factory;
			
 
				-}
			
 
				-
			
 
				-TheoraAudioInterfaceFactory* TheoraVideoManager::getAudioInterfaceFactory()
			
 
				-{
			
 
				-	return mAudioFactory;
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip* TheoraVideoManager::createVideoClip(std::string filename,
			
 
				-													 TheoraOutputMode output_mode,
			
 
				-													 int numPrecachedOverride,
			
 
				-													 bool usePower2Stride,
			
 
				-													 int p_track)
			
 
				-{
			
 
				-	TheoraDataSource* src=memnew(TheoraFileDataSource(filename));
			
 
				-	return createVideoClip(src,output_mode,numPrecachedOverride,usePower2Stride, p_track);
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip* TheoraVideoManager::createVideoClip(TheoraDataSource* data_source,
			
 
				-													 TheoraOutputMode output_mode,
			
 
				-													 int numPrecachedOverride,
			
 
				-													 bool usePower2Stride,
			
 
				-													 int p_audio_track)
			
 
				-{
			
 
				-	mWorkMutex->lock();
			
 
				-
			
 
				-	TheoraVideoClip* clip = NULL;
			
 
				-	int nPrecached = numPrecachedOverride ? numPrecachedOverride : mDefaultNumPrecachedFrames;
			
 
				-	logMessage("Creating video from data source: " + data_source->repr() + " [" + str(nPrecached) + " precached frames].");
			
 
				-	
			
 
				-#ifdef __AVFOUNDATION
			
 
				-	TheoraFileDataSource* fileDataSource = dynamic_cast<TheoraFileDataSource*>(data_source);
			
 
				-	std::string filename;
			
 
				-	if (fileDataSource == NULL)
			
 
				-	{
			
 
				-		TheoraMemoryFileDataSource* memoryDataSource = dynamic_cast<TheoraMemoryFileDataSource*>(data_source);
			
 
				-		if (memoryDataSource != NULL) filename = memoryDataSource->getFilename();
			
 
				-		// if the user has his own data source, it's going to be a problem for AVAssetReader since it only supports reading from files...
			
 
				-	}
			
 
				-	else filename = fileDataSource->getFilename();
			
 
				-
			
 
				-	if (filename.size() > 4 && filename.substr(filename.size() - 4, filename.size()) == ".mp4")
			
 
				-	{
			
 
				-		clip = new TheoraVideoClip_AVFoundation(data_source, output_mode, nPrecached, usePower2Stride);
			
 
				-	}
			
 
				-#endif
			
 
				-#if defined(__AVFOUNDATION) && defined(__THEORA)
			
 
				-	else
			
 
				-#endif
			
 
				-#ifdef __THEORA
			
 
				-		clip = new TheoraVideoClip_Theora(data_source, output_mode, nPrecached, usePower2Stride);
			
 
				-#endif
			
 
				-#ifdef __FFMPEG
			
 
				-		clip = new TheoraVideoClip_FFmpeg(data_source, output_mode, nPrecached, usePower2Stride);
			
 
				-#endif
			
 
				-
			
 
				-	clip->set_audio_track(p_audio_track);
			
 
				-	clip->load(data_source);
			
 
				-	clip->decodeNextFrame(); // ensure the first frame is always preloaded and have the main thread do it to prevent potential thread starvatio
			
 
				-
			
 
				-	mClips.push_back(clip);
			
 
				-	mWorkMutex->unlock();
			
 
				-	
			
 
				-#ifdef _DECODING_BENCHMARK
			
 
				-	benchmark(clip);
			
 
				-#endif
			
 
				-	return clip;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::destroyVideoClip(TheoraVideoClip* clip)
			
 
				-{
			
 
				-	if (clip)
			
 
				-	{
			
 
				-		th_writelog("Destroying video clip: " + clip->getName());
			
 
				-		mWorkMutex->lock();
			
 
				-		bool reported = 0;
			
 
				-		while (clip->mAssignedWorkerThread)
			
 
				-		{
			
 
				-			if (!reported)
			
 
				-			{
			
 
				-				th_writelog(" - Waiting for WorkerThread to finish decoding in order to destroy");
			
 
				-				reported = 1;
			
 
				-			}
			
 
				-			_psleep(1);
			
 
				-		}
			
 
				-		if (reported) th_writelog(" - WorkerThread done, destroying...");
			
 
				-		
			
 
				-		// erase the clip from the clip list
			
 
				-		foreach (TheoraVideoClip*, mClips)
			
 
				-		{
			
 
				-			if ((*it) == clip)
			
 
				-			{
			
 
				-				mClips.erase(it);
			
 
				-				break;
			
 
				-			}
			
 
				-		}
			
 
				-		// remove all it's references from the work log
			
 
				-		mWorkLog.remove(clip);
			
 
				-
			
 
				-		// delete the actual clip
			
 
				-		delete clip;
			
 
				-#ifdef _DEBUG
			
 
				-		th_writelog("Destroyed video.");
			
 
				-#endif
			
 
				-		mWorkMutex->unlock();
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-TheoraVideoClip* TheoraVideoManager::requestWork(TheoraWorkerThread* caller)
			
 
				-{
			
 
				-	if (!mWorkMutex) return NULL;
			
 
				-	mWorkMutex->lock();
			
 
				-
			
 
				-	TheoraVideoClip* selectedClip = NULL;
			
 
				-	float maxQueuedTime = 0, totalAccessCount = 0, prioritySum = 0, diff, maxDiff = -1;
			
 
				-	int nReadyFrames;
			
 
				-	std::vector<TheoraWorkCandidate> candidates;
			
 
				-	TheoraVideoClip* clip;
			
 
				-	TheoraWorkCandidate candidate;
			
 
				-
			
 
				-	// first pass is for playing videos, but if no such videos are available for decoding
			
 
				-	// paused videos are selected in the second pass.
			
 
				-    // Note that paused videos that are waiting for cache are considered equal to playing
			
 
				-    // videos in the scheduling context
			
 
				-
			
 
				-	for (int i = 0; i < 2 && candidates.size() == 0; ++i)
			
 
				-	{
			
 
				-		foreach (TheoraVideoClip*, mClips)
			
 
				-		{
			
 
				-			clip = *it;
			
 
				-			if (clip->isBusy() || (i == 0 && clip->isPaused() && !clip->mWaitingForCache)) continue;
			
 
				-			nReadyFrames = clip->getNumReadyFrames();
			
 
				-			if (nReadyFrames == clip->getFrameQueue()->getSize()) continue;
			
 
				-
			
 
				-			candidate.clip = clip;
			
 
				-			candidate.priority = clip->getPriority();
			
 
				-			candidate.queuedTime = (float) nReadyFrames / (clip->getFPS() * clip->getPlaybackSpeed());
			
 
				-			candidate.workTime = (float) clip->mThreadAccessCount;
			
 
				-			
			
 
				-			totalAccessCount += candidate.workTime;
			
 
				-			if (maxQueuedTime < candidate.queuedTime) maxQueuedTime = candidate.queuedTime;
			
 
				-
			
 
				-			candidates.push_back(candidate);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	// prevent division by zero
			
 
				-	if (totalAccessCount == 0) totalAccessCount = 1;
			
 
				-	if (maxQueuedTime == 0) maxQueuedTime = 1;
			
 
				-
			
 
				-	// normalize candidate values
			
 
				-	foreach (TheoraWorkCandidate, candidates)
			
 
				-	{
			
 
				-		it->workTime /= totalAccessCount;
			
 
				-		// adjust user priorities to favor clips that have fewer frames queued
			
 
				-		it->priority *= 1.0f - (it->queuedTime / maxQueuedTime) * 0.5f;
			
 
				-		prioritySum += it->priority;
			
 
				-	}
			
 
				-	foreach (TheoraWorkCandidate, candidates)
			
 
				-	{
			
 
				-		it->entitledTime = it->priority / prioritySum;
			
 
				-	}
			
 
				-
			
 
				-	// now, based on how much access time has been given to each clip in the work log
			
 
				-	// and how much time should be given to each clip based on calculated priorities,
			
 
				-	// we choose a best suited clip for this worker thread to decode next
			
 
				-	foreach (TheoraWorkCandidate, candidates)
			
 
				-	{
			
 
				-		diff = it->entitledTime - it->workTime;
			
 
				-
			
 
				-		if (maxDiff < diff)
			
 
				-		{
			
 
				-			maxDiff = diff;
			
 
				-			selectedClip = it->clip;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	if (selectedClip)
			
 
				-	{
			
 
				-		selectedClip->mAssignedWorkerThread = caller;
			
 
				-		
			
 
				-		int nClips = (int) mClips.size();
			
 
				-		unsigned int maxWorkLogSize = (nClips - 1) * 50;
			
 
				-
			
 
				-		if (nClips > 1)
			
 
				-		{
			
 
				-			mWorkLog.push_front(selectedClip);
			
 
				-			++selectedClip->mThreadAccessCount;
			
 
				-		}
			
 
				-		
			
 
				-		TheoraVideoClip* c;
			
 
				-		while (mWorkLog.size() > maxWorkLogSize)
			
 
				-		{
			
 
				-			c = mWorkLog.back();
			
 
				-			mWorkLog.pop_back();
			
 
				-			c->mThreadAccessCount--;
			
 
				-		}
			
 
				-#ifdef _SCHEDULING_DEBUG
			
 
				-		if (mClips.size() > 1)
			
 
				-		{
			
 
				-			int accessCount = mWorkLog.size();
			
 
				-			if (gThreadDiagnosticTimer > 2.0f)
			
 
				-			{
			
 
				-				gThreadDiagnosticTimer = 0;
			
 
				-				std::string logstr = "-----\nTheora Playback Library debug CPU time analysis (" + str(accessCount) + "):\n";
			
 
				-				int percent;
			
 
				-				foreach (TheoraVideoClip*, mClips)
			
 
				-				{
			
 
				-					percent = ((float) (*it)->mThreadAccessCount / mWorkLog.size()) * 100.0f;
			
 
				-					logstr += (*it)->getName() + " (" + str((*it)->getPriority()) + "): " + str((*it)->mThreadAccessCount) + ", " + str(percent) + "%\n";
			
 
				-				}
			
 
				-				logstr += "-----";
			
 
				-				th_writelog(logstr);
			
 
				-			}
			
 
				-		}
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-	mWorkMutex->unlock();
			
 
				-	return selectedClip;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::update(float timeDelta)
			
 
				-{
			
 
				-	mWorkMutex->lock();
			
 
				-	foreach (TheoraVideoClip*, mClips)
			
 
				-	{
			
 
				-		(*it)->update(timeDelta);
			
 
				-		(*it)->decodedAudioCheck();
			
 
				-	}
			
 
				-	mWorkMutex->unlock();
			
 
				-#ifdef _SCHEDULING_DEBUG
			
 
				-	gThreadDiagnosticTimer += timeDelta;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-int TheoraVideoManager::getNumWorkerThreads()
			
 
				-{
			
 
				-	return (int) mWorkerThreads.size();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::createWorkerThreads(int n)
			
 
				-{
			
 
				-	TheoraWorkerThread* t;
			
 
				-	for (int i=0;i<n;++i)
			
 
				-	{
			
 
				-		t=new TheoraWorkerThread();
			
 
				-		t->start();
			
 
				-		mWorkerThreads.push_back(t);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::destroyWorkerThreads()
			
 
				-{
			
 
				-	foreach(TheoraWorkerThread*,mWorkerThreads)
			
 
				-	{
			
 
				-		(*it)->join();
			
 
				-		delete (*it);
			
 
				-	}
			
 
				-	mWorkerThreads.clear();
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::setNumWorkerThreads(int n)
			
 
				-{
			
 
				-	if (n == getNumWorkerThreads()) return;
			
 
				-	if (n < 1) throw TheoraGenericException("Unable to change the number of worker threads in TheoraVideoManager, at least one worker thread is reqired");
			
 
				-
			
 
				-	th_writelog("changing number of worker threats to: "+str(n));
			
 
				-
			
 
				-	destroyWorkerThreads();
			
 
				-	createWorkerThreads(n);
			
 
				-}
			
 
				-
			
 
				-std::string TheoraVideoManager::getVersionString()
			
 
				-{
			
 
				-	int a, b, c;
			
 
				-	getVersion(&a, &b, &c);
			
 
				-	std::string out = str(a) + "." + str(b);
			
 
				-	if (c != 0)
			
 
				-	{
			
 
				-		if (c < 0) out += " RC" + str(-c);
			
 
				-		else       out += "." + str(c);
			
 
				-	}
			
 
				-	return out;
			
 
				-}
			
 
				-
			
 
				-void TheoraVideoManager::getVersion(int* a, int* b, int* c) // TODO, return a struct instead of the current solution.
			
 
				-{
			
 
				-	*a = 1;
			
 
				-	*b = 1;
			
 
				-	*c = 0;
			
 
				-}
			
 
				-
			
 
				-std::vector<std::string> TheoraVideoManager::getSupportedDecoders()
			
 
				-{
			
 
				-	std::vector<std::string> lst;
			
 
				-#ifdef __THEORA
			
 
				-	lst.push_back("Theora");
			
 
				-#endif
			
 
				-#ifdef __AVFOUNDATION
			
 
				-	lst.push_back("AVFoundation");
			
 
				-#endif
			
 
				-#ifdef __FFMPEG
			
 
				-	lst.push_back("FFmpeg");
			
 
				-#endif
			
 
				-	
			
 
				-	return lst;
			
 
				-}
			
--- a/drivers/theoraplayer/src/TheoraWorkerThread.cpp
+++ b/drivers/theoraplayer/src/TheoraWorkerThread.cpp
@@ -1,49 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifdef _WIN32
			
 
				-#pragma warning( disable: 4251 ) // MSVC++
			
 
				-#endif
			
 
				-#include "TheoraWorkerThread.h"
			
 
				-#include "TheoraVideoManager.h"
			
 
				-#include "TheoraVideoClip.h"
			
 
				-#include "TheoraUtil.h"
			
 
				-
			
 
				-TheoraWorkerThread::TheoraWorkerThread() : TheoraThread()
			
 
				-{
			
 
				-	mClip = NULL;
			
 
				-}
			
 
				-
			
 
				-TheoraWorkerThread::~TheoraWorkerThread()
			
 
				-{
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void TheoraWorkerThread::execute()
			
 
				-{
			
 
				-	while (isRunning())
			
 
				-	{
			
 
				-		mClip = TheoraVideoManager::getSingleton().requestWork(this);
			
 
				-		if (!mClip)
			
 
				-		{
			
 
				-			_psleep(100);
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		mClip->mThreadAccessMutex->lock();
			
 
				-		// if user requested seeking, do that then.
			
 
				-		if (mClip->mSeekFrame >= 0) mClip->doSeek();
			
 
				-
			
 
				-		if (!mClip->decodeNextFrame())
			
 
				-			_psleep(1); // this happens when the video frame queue is full.
			
 
				-
			
 
				-		mClip->mAssignedWorkerThread = NULL;
			
 
				-		mClip->mThreadAccessMutex->unlock();
			
 
				-		mClip = NULL;
			
 
				-	}
			
 
				-}
			
--- a/drivers/theoraplayer/src/YUV/C/yuv420_grey_c.c
+++ b/drivers/theoraplayer/src/YUV/C/yuv420_grey_c.c
@@ -1,56 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "yuv_util.h"
			
 
				-
			
 
				-static void _decodeGrey3(struct TheoraPixelTransform* t, int stride, int nBytes)
			
 
				-{
			
 
				-	unsigned char *ySrc = t->y, *yLineEnd, *out = t->out;
			
 
				-	unsigned int y;
			
 
				-	for (y = 0; y < t->h; ++y, ySrc += t->yStride - t->w, out += stride-t->w * nBytes)
			
 
				-		for (yLineEnd = ySrc + t->w; ySrc != yLineEnd; ++ySrc, out += nBytes)
			
 
				-			out[0] = out[1] = out[2] = *ySrc;
			
 
				-}
			
 
				-
			
 
				-void decodeGrey(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	unsigned char *ySrc = t->y, *yLineEnd, *out = t->out;
			
 
				-	unsigned int y;
			
 
				-	for (y = 0; y < t->h; ++y, ySrc += t->yStride - t->w)
			
 
				-		for (yLineEnd = ySrc + t->w; ySrc != yLineEnd; ++ySrc, ++out)
			
 
				-			*out = *ySrc;
			
 
				-
			
 
				-}
			
 
				-
			
 
				-void decodeGrey3(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeGrey3(t, t->w * 3, 3);
			
 
				-}
			
 
				-
			
 
				-void decodeGreyA(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeGrey3(t, t->w * 4, 4);
			
 
				-	_decodeAlpha(incOut(t, 3), t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeGreyX(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeGrey3(t, t->w * 4, 4);
			
 
				-}
			
 
				-
			
 
				-void decodeAGrey(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeGrey3(incOut(t, 1), t->w * 4, 4);
			
 
				-	_decodeAlpha(t, t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeXGrey(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeGrey3(incOut(t, 1), t->w * 4, 4);
			
 
				-}
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/C/yuv420_rgb_c.c
+++ b/drivers/theoraplayer/src/YUV/C/yuv420_rgb_c.c
@@ -1,358 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#ifdef _YUV_C
			
 
				-#include "yuv_util.h"
			
 
				-
			
 
				-int YTable [256];
			
 
				-int BUTable[256];
			
 
				-int GUTable[256];
			
 
				-int GVTable[256];
			
 
				-int RVTable[256];
			
 
				-
			
 
				-#define CLIP_RGB_COLOR(dst, x) \
			
 
				-	tmp = (x) >> 13;\
			
 
				-	if ((tmp & ~0xFF) == 0) dst = tmp;\
			
 
				-	else                    dst = (-tmp) >> 31;
			
 
				-
			
 
				-#define _decodeRGB(t, stride, nBytes, maxWidth, i1, i2, i3, j1, j2, j3)\
			
 
				-	register int tmp;\
			
 
				-	int nBytes2 = nBytes * 2, cv, cu, rgbY1, rgbY2, rgbY3, rgbY4, rV, gUV, bU, width = maxWidth == 0 ? t->w : maxWidth;\
			
 
				-	unsigned int y;\
			
 
				-	unsigned char *ySrcEven, *ySrcOdd, *yLineEnd, *uSrc, *vSrc, *out1, *out2;\
			
 
				-	\
			
 
				-	for (y = 0; y < t->h; y += 2)\
			
 
				-	{\
			
 
				-		ySrcEven = t->y + y * t->yStride;\
			
 
				-		ySrcOdd  = t->y + (y + 1) * t->yStride;\
			
 
				-		uSrc = t->u + y * t->uStride / 2;\
			
 
				-		vSrc = t->v + y * t->vStride / 2;\
			
 
				-		out1 = t->out + y * stride;\
			
 
				-		out2 = t->out + (y + 1) * stride;\
			
 
				-		\
			
 
				-		for (yLineEnd = ySrcEven + width; ySrcEven != yLineEnd;)\
			
 
				-		{\
			
 
				-			cu = *uSrc; ++uSrc;\
			
 
				-			cv = *vSrc; ++vSrc;\
			
 
				-			rV   = RVTable[cv];\
			
 
				-			gUV  = GUTable[cu] + GVTable[cv];\
			
 
				-			bU   = BUTable[cu];\
			
 
				-			\
			
 
				-			rgbY1 = YTable[*ySrcEven]; ++ySrcEven;\
			
 
				-			rgbY2 = YTable[*ySrcOdd];  ++ySrcOdd;\
			
 
				-			rgbY3 = YTable[*ySrcEven]; ++ySrcEven;\
			
 
				-			rgbY4 = YTable[*ySrcOdd];  ++ySrcOdd;\
			
 
				-			\
			
 
				-			CLIP_RGB_COLOR(out1[i1], rgbY1 + rV );\
			
 
				-			CLIP_RGB_COLOR(out1[i2], rgbY1 - gUV);\
			
 
				-			CLIP_RGB_COLOR(out1[i3], rgbY1 + bU );\
			
 
				-			\
			
 
				-			CLIP_RGB_COLOR(out2[i1], rgbY2 + rV );\
			
 
				-			CLIP_RGB_COLOR(out2[i2], rgbY2 - gUV);\
			
 
				-			CLIP_RGB_COLOR(out2[i3], rgbY2 + bU );\
			
 
				-			\
			
 
				-			CLIP_RGB_COLOR(out1[j1], rgbY3 + rV );\
			
 
				-			CLIP_RGB_COLOR(out1[j2], rgbY3 - gUV);\
			
 
				-			CLIP_RGB_COLOR(out1[j3], rgbY3 + bU );\
			
 
				-			\
			
 
				-			CLIP_RGB_COLOR(out2[j1], rgbY4 + rV );\
			
 
				-			CLIP_RGB_COLOR(out2[j2], rgbY4 - gUV);\
			
 
				-			CLIP_RGB_COLOR(out2[j3], rgbY4 + bU );\
			
 
				-			\
			
 
				-			out1 += nBytes2;  out2 += nBytes2;\
			
 
				-		}\
			
 
				-	}
			
 
				-
			
 
				-// The 'trick' with this function is that it skips decoding YUV pixels if the alpha value is 0, thus improving the decoding speed of a frame
			
 
				-#define _decodeRGBA(t, stride, nBytes, maxWidth, i1, i2, i3, j1, j2, j3, aindex1, aindex2)\
			
 
				-\
			
 
				-	register int tmp;\
			
 
				-	int nBytes2 = nBytes * 2, cv, cu, rgbY1, rgbY2, rgbY3, rgbY4, a1, a2, a3, a4, rV, gUV, bU, width = maxWidth == 0 ? t->w : maxWidth;\
			
 
				-	int alphaStride = t->w;\
			
 
				-	unsigned int y;\
			
 
				-	unsigned char *ySrcEven, *ySrcOdd, *yLineEnd, *uSrc, *vSrc, *out1, *out2;\
			
 
				-	\
			
 
				-	for (y = 0; y < t->h; y += 2)\
			
 
				-	{\
			
 
				-		ySrcEven = t->y + y * t->yStride;\
			
 
				-		ySrcOdd  = t->y + (y + 1) * t->yStride;\
			
 
				-		uSrc = t->u + y * t->uStride / 2;\
			
 
				-		vSrc = t->v + y * t->vStride / 2;\
			
 
				-		out1 = t->out + y * stride;\
			
 
				-		out2 = t->out + (y + 1) * stride;\
			
 
				-		\
			
 
				-		for (yLineEnd = ySrcEven + width; ySrcEven != yLineEnd;)\
			
 
				-		{\
			
 
				-			cu = *uSrc; ++uSrc;\
			
 
				-			cv = *vSrc; ++vSrc;\
			
 
				-			rV   = RVTable[cv];\
			
 
				-			gUV  = GUTable[cu] + GVTable[cv];\
			
 
				-			bU   = BUTable[cu];\
			
 
				-			\
			
 
				-			rgbY1 = YTable[*ySrcEven]; a1 = ySrcEven[alphaStride]; ++ySrcEven;\
			
 
				-			rgbY2 = YTable[*ySrcOdd];  a2 = ySrcOdd [alphaStride];  ++ySrcOdd;\
			
 
				-			rgbY3 = YTable[*ySrcEven]; a3 = ySrcEven[alphaStride]; ++ySrcEven;\
			
 
				-			rgbY4 = YTable[*ySrcOdd];  a4 = ySrcOdd [alphaStride];  ++ySrcOdd;\
			
 
				-			\
			
 
				-			if (a1 > 16)\
			
 
				-			{\
			
 
				-				CLIP_RGB_COLOR(out1[i1], rgbY1 + rV );\
			
 
				-				CLIP_RGB_COLOR(out1[i2], rgbY1 - gUV);\
			
 
				-				CLIP_RGB_COLOR(out1[i3], rgbY1 + bU );\
			
 
				-				out1[aindex1] = a1 >= 235 ? 255 : (unsigned char) (((a1 - 16) * 255) / 219);\
			
 
				-			}\
			
 
				-			else *((unsigned int*) out1) = 0;\
			
 
				-			\
			
 
				-			if (a2 > 16)\
			
 
				-			{\
			
 
				-				CLIP_RGB_COLOR(out2[i1], rgbY2 + rV );\
			
 
				-				CLIP_RGB_COLOR(out2[i2], rgbY2 - gUV);\
			
 
				-				CLIP_RGB_COLOR(out2[i3], rgbY2 + bU );\
			
 
				-				out2[aindex1] = a2 >= 235 ? 255 : (unsigned char) (((a2 - 16) * 255) / 219);\
			
 
				-			}\
			
 
				-			else *((unsigned int*) out2) = 0;\
			
 
				-			\
			
 
				-			if (a3 > 16)\
			
 
				-			{\
			
 
				-				CLIP_RGB_COLOR(out1[j1], rgbY3 + rV );\
			
 
				-				CLIP_RGB_COLOR(out1[j2], rgbY3 - gUV);\
			
 
				-				CLIP_RGB_COLOR(out1[j3], rgbY3 + bU );\
			
 
				-				out1[aindex2] = a3 >= 235 ? 255 : (unsigned char) (((a3 - 16) * 255) / 219);\
			
 
				-			}\
			
 
				-			else *((unsigned int*) &out1[4]) = 0;\
			
 
				-			\
			
 
				-			if (a4 > 16)\
			
 
				-			{\
			
 
				-				CLIP_RGB_COLOR(out2[j1], rgbY4 + rV );\
			
 
				-				CLIP_RGB_COLOR(out2[j2], rgbY4 - gUV);\
			
 
				-				CLIP_RGB_COLOR(out2[j3], rgbY4 + bU );\
			
 
				-				out2[aindex2] = a4 >= 235 ? 255 : (unsigned char) (((a4 - 16) * 255) / 219);\
			
 
				-			}\
			
 
				-			else *((unsigned int*) &out2[4]) = 0;\
			
 
				-			\
			
 
				-			out1 += nBytes2;  out2 += nBytes2;\
			
 
				-		}\
			
 
				-	}\
			
 
				-
			
 
				-void decodeRGB(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 3, 3, 0, 0, 1, 2, 3, 4, 5);
			
 
				-}
			
 
				-
			
 
				-void decodeRGBA(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGBA(t, t->w * 4, 4, 0, 0, 1, 2, 4, 5, 6, 3, 7);
			
 
				-// This is the old 2-phase version, leaving it here in case more debugging is needed
			
 
				-//	_decodeRGB(t, t->w * 4, 4, 0, 0, 1, 2, 4, 5, 6);
			
 
				-//	_decodeAlpha(incOut(t, 3), t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeRGBX(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 4, 4, 0, 0, 1, 2, 4, 5, 6);
			
 
				-}
			
 
				-
			
 
				-void decodeARGB(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGBA(t, t->w * 4, 4, 0, 1, 2, 3, 5, 6, 7, 0, 4);
			
 
				-// This is the old 2-phase version, leaving it here in case more debugging is needed
			
 
				-//	_decodeRGB(t, t->w * 4, 4, 0, 1, 2, 3, 5, 6, 7);
			
 
				-//	_decodeAlpha(t, t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeXRGB(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 4, 4, 0, 1, 2, 3, 5, 6, 7);
			
 
				-}
			
 
				-
			
 
				-void decodeBGR(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 3, 3, 0, 2, 1, 0, 5, 4, 3);
			
 
				-}
			
 
				-
			
 
				-void decodeBGRA(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGBA(t, t->w * 4, 4, 0, 2, 1, 0, 6, 5, 4, 3, 7);
			
 
				-// This is the old 2-phase version, leaving it here in case more debugging is needed
			
 
				-//	_decodeRGB(t, t->w * 4, 4, 0, 2, 1, 0, 6, 5, 4);
			
 
				-//	_decodeAlpha(incOut(t, 3), t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeBGRX(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 4, 4, 0, 2, 1, 0, 6, 5, 4);
			
 
				-}
			
 
				-
			
 
				-void decodeABGR(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGBA(t, t->w * 4, 4, 0, 3, 2, 1, 7, 6, 5, 0, 4);
			
 
				-// This is the old 2-phase version, leaving it here in case more debugging is needed
			
 
				-//	_decodeRGB(t, t->w * 4, 4, 0, 3, 2, 1, 7, 6, 5);
			
 
				-//	_decodeAlpha(t, t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeXBGR(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeRGB(t, t->w * 4, 4, 0, 3, 2, 1, 7, 6, 5);
			
 
				-}
			
 
				-
			
 
				-void initYUVConversionModule()
			
 
				-{
			
 
				-	//used to bring the table into the high side (scale up) so we
			
 
				-	//can maintain high precision and not use floats (FIXED POINT)
			
 
				-	
			
 
				-	// this is the pseudocode for yuv->rgb conversion
			
 
				-	//        r = 1.164*(*ySrc - 16) + 1.596*(cv - 128);
			
 
				-	//        b = 1.164*(*ySrc - 16)                   + 2.018*(cu - 128);
			
 
				-	//        g = 1.164*(*ySrc - 16) - 0.813*(cv - 128) - 0.391*(cu - 128);
			
 
				-	
			
 
				-    double scale = 1L << 13, temp;
			
 
				-	
			
 
				-	int i;
			
 
				-	for (i = 0; i < 256; ++i)
			
 
				-	{
			
 
				-		temp = i - 128;
			
 
				-		
			
 
				-		YTable[i]  = (int)((1.164 * scale + 0.5) * (i - 16));	//Calc Y component
			
 
				-		RVTable[i] = (int)((1.596 * scale + 0.5) * temp);		//Calc R component
			
 
				-		GUTable[i] = (int)((0.391 * scale + 0.5) * temp);		//Calc G u & v components
			
 
				-		GVTable[i] = (int)((0.813 * scale + 0.5) * temp);
			
 
				-		BUTable[i] = (int)((2.018 * scale + 0.5) * temp);		//Calc B component
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Below are the function versions of the above macros, use those for debugging, but leave the macros for maximum CPU execution speed
			
 
				- *
			
 
				- *
			
 
				- *
			
 
				- *
			
 
				-
			
 
				-void _decodeRGB(struct TheoraPixelTransform* t, int stride, int nBytes, int maxWidth, int i1, int i2, int i3, int j1, int j2, int j3)
			
 
				-{
			
 
				-	register int tmp;
			
 
				-	int nBytes2 = nBytes * 2, cv, cu, rgbY1, rgbY2, rgbY3, rgbY4, rV, gUV, bU, width = maxWidth == 0 ? t->w : maxWidth;
			
 
				-	unsigned int y;
			
 
				-	unsigned char *ySrcEven, *ySrcOdd, *yLineEnd, *uSrc, *vSrc, *out1, *out2;
			
 
				-	
			
 
				-	for (y = 0; y < t->h; y += 2)
			
 
				-	{
			
 
				-		ySrcEven = t->y + y * t->yStride;
			
 
				-		ySrcOdd  = t->y + (y + 1) * t->yStride;
			
 
				-		uSrc = t->u + y * t->uStride / 2;
			
 
				-		vSrc = t->v + y * t->vStride / 2;
			
 
				-		out1 = t->out + y * stride;
			
 
				-		out2 = t->out + (y + 1) * stride;
			
 
				-		
			
 
				-		for (yLineEnd = ySrcEven + width; ySrcEven != yLineEnd;)
			
 
				-		{
			
 
				-			cu = *uSrc; ++uSrc;
			
 
				-			cv = *vSrc; ++vSrc;
			
 
				-			rV   = RVTable[cv];
			
 
				-			gUV  = GUTable[cu] + GVTable[cv];
			
 
				-			bU   = BUTable[cu];
			
 
				-			
			
 
				-			rgbY1 = YTable[*ySrcEven]; ++ySrcEven;
			
 
				-			rgbY2 = YTable[*ySrcOdd];  ++ySrcOdd;
			
 
				-			rgbY3 = YTable[*ySrcEven]; ++ySrcEven;
			
 
				-			rgbY4 = YTable[*ySrcOdd];  ++ySrcOdd;
			
 
				-			
			
 
				-			CLIP_RGB_COLOR(out1[i1], rgbY1 + rV );
			
 
				-			CLIP_RGB_COLOR(out1[i2], rgbY1 - gUV);
			
 
				-			CLIP_RGB_COLOR(out1[i3], rgbY1 + bU );
			
 
				-			
			
 
				-			CLIP_RGB_COLOR(out2[i1], rgbY2 + rV );
			
 
				-			CLIP_RGB_COLOR(out2[i2], rgbY2 - gUV);
			
 
				-			CLIP_RGB_COLOR(out2[i3], rgbY2 + bU );
			
 
				-			
			
 
				-			CLIP_RGB_COLOR(out1[j1], rgbY3 + rV );
			
 
				-			CLIP_RGB_COLOR(out1[j2], rgbY3 - gUV);
			
 
				-			CLIP_RGB_COLOR(out1[j3], rgbY3 + bU );
			
 
				-			
			
 
				-			CLIP_RGB_COLOR(out2[j1], rgbY4 + rV );
			
 
				-			CLIP_RGB_COLOR(out2[j2], rgbY4 - gUV);
			
 
				-			CLIP_RGB_COLOR(out2[j3], rgbY4 + bU );
			
 
				-			
			
 
				-			out1 += nBytes2;  out2 += nBytes2;
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				- 
			
 
				-void _decodeRGBA(struct TheoraPixelTransform* t, int stride, int nBytes, int maxWidth, int i1, int i2, int i3, int j1, int j2, int j3, int aindex1, int aindex2)
			
 
				-{
			
 
				-	register int tmp;
			
 
				-	int nBytes2 = nBytes * 2, cv, cu, rgbY1, rgbY2, rgbY3, rgbY4, a1, a2, a3, a4, rV, gUV, bU, width = maxWidth == 0 ? t->w : maxWidth;
			
 
				-	int alphaStride = t->w;
			
 
				-	unsigned int y;
			
 
				-	unsigned char *ySrcEven, *ySrcOdd, *yLineEnd, *uSrc, *vSrc, *out1, *out2;
			
 
				-	
			
 
				-	for (y = 0; y < t->h; y += 2)
			
 
				-	{
			
 
				-		ySrcEven = t->y + y * t->yStride;
			
 
				-		ySrcOdd  = t->y + (y + 1) * t->yStride;
			
 
				-		uSrc = t->u + y * t->uStride / 2;
			
 
				-		vSrc = t->v + y * t->vStride / 2;
			
 
				-		out1 = t->out + y * stride;
			
 
				-		out2 = t->out + (y + 1) * stride;
			
 
				-		
			
 
				-		for (yLineEnd = ySrcEven + width; ySrcEven != yLineEnd;)
			
 
				-		{
			
 
				-			cu = *uSrc; ++uSrc;
			
 
				-			cv = *vSrc; ++vSrc;
			
 
				-			rV   = RVTable[cv];
			
 
				-			gUV  = GUTable[cu] + GVTable[cv];
			
 
				-			bU   = BUTable[cu];
			
 
				-			
			
 
				-			rgbY1 = YTable[*ySrcEven]; a1 = ySrcEven[alphaStride]; ++ySrcEven;
			
 
				-			rgbY2 = YTable[*ySrcOdd];  a2 = ySrcOdd [alphaStride];  ++ySrcOdd;
			
 
				-			rgbY3 = YTable[*ySrcEven]; a3 = ySrcEven[alphaStride]; ++ySrcEven;
			
 
				-			rgbY4 = YTable[*ySrcOdd];  a4 = ySrcOdd [alphaStride];  ++ySrcOdd;
			
 
				-			
			
 
				-			if (a1 >= 32)
			
 
				-			{
			
 
				-				CLIP_RGB_COLOR(out1[i1], rgbY1 + rV );
			
 
				-				CLIP_RGB_COLOR(out1[i2], rgbY1 - gUV);
			
 
				-				CLIP_RGB_COLOR(out1[i3], rgbY1 + bU );
			
 
				-				out1[aindex1] = a1 > 224 ? 255 : a1;
			
 
				-			}
			
 
				-			else *((unsigned int*) out1) = 0;
			
 
				-			
			
 
				-			if (a2 >= 32)
			
 
				-			{
			
 
				-				CLIP_RGB_COLOR(out2[i1], rgbY2 + rV );
			
 
				-				CLIP_RGB_COLOR(out2[i2], rgbY2 - gUV);
			
 
				-				CLIP_RGB_COLOR(out2[i3], rgbY2 + bU );
			
 
				-				out2[aindex1] = a2 > 224 ? 255 : a2;
			
 
				-			}
			
 
				-			else *((unsigned int*) out2) = 0;
			
 
				-
			
 
				-			
			
 
				-			if (a3 >= 32)
			
 
				-			{
			
 
				-				CLIP_RGB_COLOR(out1[j1], rgbY3 + rV );
			
 
				-				CLIP_RGB_COLOR(out1[j2], rgbY3 - gUV);
			
 
				-				CLIP_RGB_COLOR(out1[j3], rgbY3 + bU );
			
 
				-				out1[aindex2] = a3 > 224 ? 255 : a3;
			
 
				-			}
			
 
				-			else *((unsigned int*) &out1[4]) = 0;
			
 
				-
			
 
				-			if (a4 >= 32)
			
 
				-			{
			
 
				-				CLIP_RGB_COLOR(out2[j1], rgbY4 + rV );
			
 
				-				CLIP_RGB_COLOR(out2[j2], rgbY4 - gUV);
			
 
				-				CLIP_RGB_COLOR(out2[j3], rgbY4 + bU );
			
 
				-				out2[aindex2] = a4 > 224 ? 255 : a4;
			
 
				-			}
			
 
				-			else *((unsigned int*) &out2[4]) = 0;
			
 
				-
			
 
				-			out1 += nBytes2;  out2 += nBytes2;
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-*/
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/C/yuv420_yuv_c.c
+++ b/drivers/theoraplayer/src/YUV/C/yuv420_yuv_c.c
@@ -1,86 +0,0 @@
 
				-/************************************************************************************
			
 
				-This source file is part of the Theora Video Playback Library
			
 
				-For latest info, see http://libtheoraplayer.googlecode.com
			
 
				-*************************************************************************************
			
 
				-Copyright (c) 2008-2014 Kresimir Spes ([email protected])
			
 
				-This program is free software; you can redistribute it and/or modify it under
			
 
				-the terms of the BSD license: http://opensource.org/licenses/BSD-3-Clause
			
 
				-*************************************************************************************/
			
 
				-#include "yuv_util.h"
			
 
				-
			
 
				-static void _decodeYUV(struct TheoraPixelTransform* t, int stride, int nBytes, int maxWidth)
			
 
				-{
			
 
				-	int cv, cu, y1, y2, y3, y4, width = maxWidth == 0 ? t->w : maxWidth;
			
 
				-	unsigned char *ySrcEven, *ySrcOdd, *yLineEnd, *uSrc, *vSrc, *out1, *out2;
			
 
				-	unsigned int y;
			
 
				-
			
 
				-	for (y=0; y < t->h; y += 2)
			
 
				-	{
			
 
				-		ySrcEven = t->y + y * t->yStride;
			
 
				-		ySrcOdd  = t->y + (y + 1) * t->yStride;
			
 
				-		uSrc = t->u + y * t->uStride / 2;
			
 
				-		vSrc = t->v + y * t->vStride / 2;
			
 
				-		out1 = t->out + y * stride;
			
 
				-		out2 = t->out + (y + 1) * stride;
			
 
				-		
			
 
				-		for (yLineEnd = ySrcEven + width; ySrcEven != yLineEnd;)
			
 
				-		{
			
 
				-			// EVEN columns
			
 
				-			cu = *uSrc; ++uSrc;
			
 
				-			cv = *vSrc; ++vSrc;
			
 
				-			
			
 
				-			y1 = *ySrcEven; ++ySrcEven;
			
 
				-			y2 = *ySrcOdd;  ++ySrcOdd;
			
 
				-			y3 = *ySrcEven; ++ySrcEven;
			
 
				-			y4 = *ySrcOdd;  ++ySrcOdd;
			
 
				-			
			
 
				-			// EVEN columns
			
 
				-			out1[0] = y1;
			
 
				-			out1[1] = cu;
			
 
				-			out1[2] = cv;
			
 
				-			
			
 
				-			out2[0] = y2;
			
 
				-			out2[1] = cu;
			
 
				-			out2[2] = cv;
			
 
				-			
			
 
				-			out1 += nBytes;  out2 += nBytes;
			
 
				-			// ODD columns
			
 
				-			out1[0] = y3;
			
 
				-			out1[1] = cu;
			
 
				-			out1[2] = cv;
			
 
				-			
			
 
				-			out2[0] = y4;
			
 
				-			out2[1] = cu;
			
 
				-			out2[2] = cv;
			
 
				-			out1 += nBytes;  out2 += nBytes;
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-void decodeYUV(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeYUV(t, t->w * 3, 3, 0);
			
 
				-}
			
 
				-
			
 
				-void decodeYUVA(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeYUV(t, t->w * 4, 4, 0);
			
 
				-	_decodeAlpha(incOut(t, 3), t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeYUVX(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeYUV(t, t->w * 4, 4, 0);
			
 
				-}
			
 
				-
			
 
				-void decodeAYUV(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeYUV(incOut(t, 1), t->w * 4, 4, 0);
			
 
				-	_decodeAlpha(t, t->w * 4);
			
 
				-}
			
 
				-
			
 
				-void decodeXYUV(struct TheoraPixelTransform* t)
			
 
				-{
			
 
				-	_decodeYUV(incOut(t, 1), t->w * 4, 4, 0);
			
 
				-}
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/android/cpu-features.c
+++ b/drivers/theoraplayer/src/YUV/android/cpu-features.c
@@ -1,1095 +0,0 @@
 
				-/*
			
 
				- * Copyright (C) 2010 The Android Open Source Project
			
 
				- * All rights reserved.
			
 
				- *
			
 
				- * Redistribution and use in source and binary forms, with or without
			
 
				- * modification, are permitted provided that the following conditions
			
 
				- * are met:
			
 
				- *  * Redistributions of source code must retain the above copyright
			
 
				- *    notice, this list of conditions and the following disclaimer.
			
 
				- *  * Redistributions in binary form must reproduce the above copyright
			
 
				- *    notice, this list of conditions and the following disclaimer in
			
 
				- *    the documentation and/or other materials provided with the
			
 
				- *    distribution.
			
 
				- *
			
 
				- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
			
 
				- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
			
 
				- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
			
 
				- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
			
 
				- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
			
 
				- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
			
 
				- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
			
 
				- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
			
 
				- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
			
 
				- * SUCH DAMAGE.
			
 
				- */
			
 
				-
			
 
				-/* ChangeLog for this library:
			
 
				- *
			
 
				- * NDK r8d: Add android_setCpu().
			
 
				- *
			
 
				- * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16,
			
 
				- *          VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt.
			
 
				- *
			
 
				- *          Rewrite the code to parse /proc/self/auxv instead of
			
 
				- *          the "Features" field in /proc/cpuinfo.
			
 
				- *
			
 
				- *          Dynamically allocate the buffer that hold the content
			
 
				- *          of /proc/cpuinfo to deal with newer hardware.
			
 
				- *
			
 
				- * NDK r7c: Fix CPU count computation. The old method only reported the
			
 
				- *           number of _active_ CPUs when the library was initialized,
			
 
				- *           which could be less than the real total.
			
 
				- *
			
 
				- * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7
			
 
				- *         for an ARMv6 CPU (see below).
			
 
				- *
			
 
				- *         Handle kernels that only report 'neon', and not 'vfpv3'
			
 
				- *         (VFPv3 is mandated by the ARM architecture is Neon is implemented)
			
 
				- *
			
 
				- *         Handle kernels that only report 'vfpv3d16', and not 'vfpv3'
			
 
				- *
			
 
				- *         Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in
			
 
				- *         android_getCpuFamily().
			
 
				- *
			
 
				- * NDK r4: Initial release
			
 
				- */
			
 
				-
			
 
				-#if 0
			
 
				-
			
 
				-#ifdef _ANDROID
			
 
				-#if defined(__le32__)
			
 
				-
			
 
				-// When users enter this, we should only provide interface and
			
 
				-// libportable will give the implementations.
			
 
				-
			
 
				-#else // !__le32__
			
 
				-
			
 
				-#include <sys/system_properties.h>
			
 
				-#include <pthread.h>
			
 
				-#include "cpu-features.h"
			
 
				-#include <stdio.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <fcntl.h>
			
 
				-#include <errno.h>
			
 
				-
			
 
				-static  pthread_once_t     g_once;
			
 
				-static  int                g_inited;
			
 
				-static  AndroidCpuFamily   g_cpuFamily;
			
 
				-static  uint64_t           g_cpuFeatures;
			
 
				-static  int                g_cpuCount;
			
 
				-
			
 
				-#ifdef __arm__
			
 
				-static  uint32_t           g_cpuIdArm;
			
 
				-#endif
			
 
				-
			
 
				-static const int  android_cpufeatures_debug = 0;
			
 
				-
			
 
				-#ifdef __arm__
			
 
				-#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_ARM
			
 
				-#elif defined __i386__
			
 
				-#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_X86
			
 
				-#else
			
 
				-#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_UNKNOWN
			
 
				-#endif
			
 
				-
			
 
				-#define  D(...) \
			
 
				-    do { \
			
 
				-        if (android_cpufeatures_debug) { \
			
 
				-            printf(__VA_ARGS__); fflush(stdout); \
			
 
				-        } \
			
 
				-    } while (0)
			
 
				-
			
 
				-#ifdef __i386__
			
 
				-static __inline__ void x86_cpuid(int func, int values[4])
			
 
				-{
			
 
				-    int a, b, c, d;
			
 
				-    /* We need to preserve ebx since we're compiling PIC code */
			
 
				-    /* this means we can't use "=b" for the second output register */
			
 
				-    __asm__ __volatile__ ( \
			
 
				-      "push %%ebx\n"
			
 
				-      "cpuid\n" \
			
 
				-      "mov %%ebx, %1\n"
			
 
				-      "pop %%ebx\n"
			
 
				-      : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
			
 
				-      : "a" (func) \
			
 
				-    );
			
 
				-    values[0] = a;
			
 
				-    values[1] = b;
			
 
				-    values[2] = c;
			
 
				-    values[3] = d;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-/* Get the size of a file by reading it until the end. This is needed
			
 
				- * because files under /proc do not always return a valid size when
			
 
				- * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed.
			
 
				- */
			
 
				-static int
			
 
				-get_file_size(const char* pathname)
			
 
				-{
			
 
				-    int fd, ret, result = 0;
			
 
				-    char buffer[256];
			
 
				-
			
 
				-    fd = open(pathname, O_RDONLY);
			
 
				-    if (fd < 0) {
			
 
				-        D("Can't open %s: %s\n", pathname, strerror(errno));
			
 
				-        return -1;
			
 
				-    }
			
 
				-
			
 
				-    for (;;) {
			
 
				-        int ret = read(fd, buffer, sizeof buffer);
			
 
				-        if (ret < 0) {
			
 
				-            if (errno == EINTR)
			
 
				-                continue;
			
 
				-            D("Error while reading %s: %s\n", pathname, strerror(errno));
			
 
				-            break;
			
 
				-        }
			
 
				-        if (ret == 0)
			
 
				-            break;
			
 
				-
			
 
				-        result += ret;
			
 
				-    }
			
 
				-    close(fd);
			
 
				-    return result;
			
 
				-}
			
 
				-
			
 
				-/* Read the content of /proc/cpuinfo into a user-provided buffer.
			
 
				- * Return the length of the data, or -1 on error. Does *not*
			
 
				- * zero-terminate the content. Will not read more
			
 
				- * than 'buffsize' bytes.
			
 
				- */
			
 
				-static int
			
 
				-read_file(const char*  pathname, char*  buffer, size_t  buffsize)
			
 
				-{
			
 
				-    int  fd, count;
			
 
				-
			
 
				-    fd = open(pathname, O_RDONLY);
			
 
				-    if (fd < 0) {
			
 
				-        D("Could not open %s: %s\n", pathname, strerror(errno));
			
 
				-        return -1;
			
 
				-    }
			
 
				-    count = 0;
			
 
				-    while (count < (int)buffsize) {
			
 
				-        int ret = read(fd, buffer + count, buffsize - count);
			
 
				-        if (ret < 0) {
			
 
				-            if (errno == EINTR)
			
 
				-                continue;
			
 
				-            D("Error while reading from %s: %s\n", pathname, strerror(errno));
			
 
				-            if (count == 0)
			
 
				-                count = -1;
			
 
				-            break;
			
 
				-        }
			
 
				-        if (ret == 0)
			
 
				-            break;
			
 
				-        count += ret;
			
 
				-    }
			
 
				-    close(fd);
			
 
				-    return count;
			
 
				-}
			
 
				-
			
 
				-/* Extract the content of a the first occurence of a given field in
			
 
				- * the content of /proc/cpuinfo and return it as a heap-allocated
			
 
				- * string that must be freed by the caller.
			
 
				- *
			
 
				- * Return NULL if not found
			
 
				- */
			
 
				-static char*
			
 
				-extract_cpuinfo_field(const char* buffer, int buflen, const char* field)
			
 
				-{
			
 
				-    int  fieldlen = strlen(field);
			
 
				-    const char* bufend = buffer + buflen;
			
 
				-    char* result = NULL;
			
 
				-    int len, ignore;
			
 
				-    const char *p, *q;
			
 
				-
			
 
				-    /* Look for first field occurence, and ensures it starts the line. */
			
 
				-    p = buffer;
			
 
				-    for (;;) {
			
 
				-        p = memmem(p, bufend-p, field, fieldlen);
			
 
				-        if (p == NULL)
			
 
				-            goto EXIT;
			
 
				-
			
 
				-        if (p == buffer || p[-1] == '\n')
			
 
				-            break;
			
 
				-
			
 
				-        p += fieldlen;
			
 
				-    }
			
 
				-
			
 
				-    /* Skip to the first column followed by a space */
			
 
				-    p += fieldlen;
			
 
				-    p  = memchr(p, ':', bufend-p);
			
 
				-    if (p == NULL || p[1] != ' ')
			
 
				-        goto EXIT;
			
 
				-
			
 
				-    /* Find the end of the line */
			
 
				-    p += 2;
			
 
				-    q = memchr(p, '\n', bufend-p);
			
 
				-    if (q == NULL)
			
 
				-        q = bufend;
			
 
				-
			
 
				-    /* Copy the line into a heap-allocated buffer */
			
 
				-    len = q-p;
			
 
				-    result = malloc(len+1);
			
 
				-    if (result == NULL)
			
 
				-        goto EXIT;
			
 
				-
			
 
				-    memcpy(result, p, len);
			
 
				-    result[len] = '\0';
			
 
				-
			
 
				-EXIT:
			
 
				-    return result;
			
 
				-}
			
 
				-
			
 
				-/* Checks that a space-separated list of items contains one given 'item'.
			
 
				- * Returns 1 if found, 0 otherwise.
			
 
				- */
			
 
				-static int
			
 
				-has_list_item(const char* list, const char* item)
			
 
				-{
			
 
				-    const char*  p = list;
			
 
				-    int itemlen = strlen(item);
			
 
				-
			
 
				-    if (list == NULL)
			
 
				-        return 0;
			
 
				-
			
 
				-    while (*p) {
			
 
				-        const char*  q;
			
 
				-
			
 
				-        /* skip spaces */
			
 
				-        while (*p == ' ' || *p == '\t')
			
 
				-            p++;
			
 
				-
			
 
				-        /* find end of current list item */
			
 
				-        q = p;
			
 
				-        while (*q && *q != ' ' && *q != '\t')
			
 
				-            q++;
			
 
				-
			
 
				-        if (itemlen == q-p && !memcmp(p, item, itemlen))
			
 
				-            return 1;
			
 
				-
			
 
				-        /* skip to next item */
			
 
				-        p = q;
			
 
				-    }
			
 
				-    return 0;
			
 
				-}
			
 
				-
			
 
				-/* Parse a number starting from 'input', but not going further
			
 
				- * than 'limit'. Return the value into '*result'.
			
 
				- *
			
 
				- * NOTE: Does not skip over leading spaces, or deal with sign characters.
			
 
				- * NOTE: Ignores overflows.
			
 
				- *
			
 
				- * The function returns NULL in case of error (bad format), or the new
			
 
				- * position after the decimal number in case of success (which will always
			
 
				- * be <= 'limit').
			
 
				- */
			
 
				-static const char*
			
 
				-parse_number(const char* input, const char* limit, int base, int* result)
			
 
				-{
			
 
				-    const char* p = input;
			
 
				-    int val = 0;
			
 
				-    while (p < limit) {
			
 
				-        int d = (*p - '0');
			
 
				-        if ((unsigned)d >= 10U) {
			
 
				-            d = (*p - 'a');
			
 
				-            if ((unsigned)d >= 6U)
			
 
				-              d = (*p - 'A');
			
 
				-            if ((unsigned)d >= 6U)
			
 
				-              break;
			
 
				-            d += 10;
			
 
				-        }
			
 
				-        if (d >= base)
			
 
				-          break;
			
 
				-        val = val*base + d;
			
 
				-        p++;
			
 
				-    }
			
 
				-    if (p == input)
			
 
				-        return NULL;
			
 
				-
			
 
				-    *result = val;
			
 
				-    return p;
			
 
				-}
			
 
				-
			
 
				-static const char*
			
 
				-parse_decimal(const char* input, const char* limit, int* result)
			
 
				-{
			
 
				-    return parse_number(input, limit, 10, result);
			
 
				-}
			
 
				-
			
 
				-static const char*
			
 
				-parse_hexadecimal(const char* input, const char* limit, int* result)
			
 
				-{
			
 
				-    return parse_number(input, limit, 16, result);
			
 
				-}
			
 
				-
			
 
				-/* This small data type is used to represent a CPU list / mask, as read
			
 
				- * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt
			
 
				- *
			
 
				- * For now, we don't expect more than 32 cores on mobile devices, so keep
			
 
				- * everything simple.
			
 
				- */
			
 
				-typedef struct {
			
 
				-    uint32_t mask;
			
 
				-} CpuList;
			
 
				-
			
 
				-static __inline__ void
			
 
				-cpulist_init(CpuList* list) {
			
 
				-    list->mask = 0;
			
 
				-}
			
 
				-
			
 
				-static __inline__ void
			
 
				-cpulist_and(CpuList* list1, CpuList* list2) {
			
 
				-    list1->mask &= list2->mask;
			
 
				-}
			
 
				-
			
 
				-static __inline__ void
			
 
				-cpulist_set(CpuList* list, int index) {
			
 
				-    if ((unsigned)index < 32) {
			
 
				-        list->mask |= (uint32_t)(1U << index);
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-static __inline__ int
			
 
				-cpulist_count(CpuList* list) {
			
 
				-    return __builtin_popcount(list->mask);
			
 
				-}
			
 
				-
			
 
				-/* Parse a textual list of cpus and store the result inside a CpuList object.
			
 
				- * Input format is the following:
			
 
				- * - comma-separated list of items (no spaces)
			
 
				- * - each item is either a single decimal number (cpu index), or a range made
			
 
				- *   of two numbers separated by a single dash (-). Ranges are inclusive.
			
 
				- *
			
 
				- * Examples:   0
			
 
				- *             2,4-127,128-143
			
 
				- *             0-1
			
 
				- */
			
 
				-static void
			
 
				-cpulist_parse(CpuList* list, const char* line, int line_len)
			
 
				-{
			
 
				-    const char* p = line;
			
 
				-    const char* end = p + line_len;
			
 
				-    const char* q;
			
 
				-
			
 
				-    /* NOTE: the input line coming from sysfs typically contains a
			
 
				-     * trailing newline, so take care of it in the code below
			
 
				-     */
			
 
				-    while (p < end && *p != '\n')
			
 
				-    {
			
 
				-        int val, start_value, end_value;
			
 
				-
			
 
				-        /* Find the end of current item, and put it into 'q' */
			
 
				-        q = memchr(p, ',', end-p);
			
 
				-        if (q == NULL) {
			
 
				-            q = end;
			
 
				-        }
			
 
				-
			
 
				-        /* Get first value */
			
 
				-        p = parse_decimal(p, q, &start_value);
			
 
				-        if (p == NULL)
			
 
				-            goto BAD_FORMAT;
			
 
				-
			
 
				-        end_value = start_value;
			
 
				-
			
 
				-        /* If we're not at the end of the item, expect a dash and
			
 
				-         * and integer; extract end value.
			
 
				-         */
			
 
				-        if (p < q && *p == '-') {
			
 
				-            p = parse_decimal(p+1, q, &end_value);
			
 
				-            if (p == NULL)
			
 
				-                goto BAD_FORMAT;
			
 
				-        }
			
 
				-
			
 
				-        /* Set bits CPU list bits */
			
 
				-        for (val = start_value; val <= end_value; val++) {
			
 
				-            cpulist_set(list, val);
			
 
				-        }
			
 
				-
			
 
				-        /* Jump to next item */
			
 
				-        p = q;
			
 
				-        if (p < end)
			
 
				-            p++;
			
 
				-    }
			
 
				-
			
 
				-BAD_FORMAT:
			
 
				-    ;
			
 
				-}
			
 
				-
			
 
				-/* Read a CPU list from one sysfs file */
			
 
				-static void
			
 
				-cpulist_read_from(CpuList* list, const char* filename)
			
 
				-{
			
 
				-    char   file[64];
			
 
				-    int    filelen;
			
 
				-
			
 
				-    cpulist_init(list);
			
 
				-
			
 
				-    filelen = read_file(filename, file, sizeof file);
			
 
				-    if (filelen < 0) {
			
 
				-        D("Could not read %s: %s\n", filename, strerror(errno));
			
 
				-        return;
			
 
				-    }
			
 
				-
			
 
				-    cpulist_parse(list, file, filelen);
			
 
				-}
			
 
				-
			
 
				-// See <asm/hwcap.h> kernel header.
			
 
				-#define HWCAP_VFP       (1 << 6)
			
 
				-#define HWCAP_IWMMXT    (1 << 9)
			
 
				-#define HWCAP_NEON      (1 << 12)
			
 
				-#define HWCAP_VFPv3     (1 << 13)
			
 
				-#define HWCAP_VFPv3D16  (1 << 14)
			
 
				-#define HWCAP_VFPv4     (1 << 16)
			
 
				-#define HWCAP_IDIVA     (1 << 17)
			
 
				-#define HWCAP_IDIVT     (1 << 18)
			
 
				-
			
 
				-#define AT_HWCAP 16
			
 
				-
			
 
				-#if defined(__arm__)
			
 
				-/* Compute the ELF HWCAP flags.
			
 
				- */
			
 
				-static uint32_t
			
 
				-get_elf_hwcap(const char* cpuinfo, int cpuinfo_len)
			
 
				-{
			
 
				-  /* IMPORTANT:
			
 
				-   *   Accessing /proc/self/auxv doesn't work anymore on all
			
 
				-   *   platform versions. More specifically, when running inside
			
 
				-   *   a regular application process, most of /proc/self/ will be
			
 
				-   *   non-readable, including /proc/self/auxv. This doesn't
			
 
				-   *   happen however if the application is debuggable, or when
			
 
				-   *   running under the "shell" UID, which is why this was not
			
 
				-   *   detected appropriately.
			
 
				-   */
			
 
				-#if 0
			
 
				-    uint32_t result = 0;
			
 
				-    const char filepath[] = "/proc/self/auxv";
			
 
				-    int fd = open(filepath, O_RDONLY);
			
 
				-    if (fd < 0) {
			
 
				-        D("Could not open %s: %s\n", filepath, strerror(errno));
			
 
				-        return 0;
			
 
				-    }
			
 
				-
			
 
				-    struct { uint32_t tag; uint32_t value; } entry;
			
 
				-
			
 
				-    for (;;) {
			
 
				-        int ret = read(fd, (char*)&entry, sizeof entry);
			
 
				-        if (ret < 0) {
			
 
				-            if (errno == EINTR)
			
 
				-                continue;
			
 
				-            D("Error while reading %s: %s\n", filepath, strerror(errno));
			
 
				-            break;
			
 
				-        }
			
 
				-        // Detect end of list.
			
 
				-        if (ret == 0 || (entry.tag == 0 && entry.value == 0))
			
 
				-          break;
			
 
				-        if (entry.tag == AT_HWCAP) {
			
 
				-          result = entry.value;
			
 
				-          break;
			
 
				-        }
			
 
				-    }
			
 
				-    close(fd);
			
 
				-    return result;
			
 
				-#else
			
 
				-    // Recreate ELF hwcaps by parsing /proc/cpuinfo Features tag.
			
 
				-    uint32_t hwcaps = 0;
			
 
				-
			
 
				-    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features");
			
 
				-
			
 
				-    if (cpuFeatures != NULL) {
			
 
				-        D("Found cpuFeatures = '%s'\n", cpuFeatures);
			
 
				-
			
 
				-        if (has_list_item(cpuFeatures, "vfp"))
			
 
				-            hwcaps |= HWCAP_VFP;
			
 
				-        if (has_list_item(cpuFeatures, "vfpv3"))
			
 
				-            hwcaps |= HWCAP_VFPv3;
			
 
				-        if (has_list_item(cpuFeatures, "vfpv3d16"))
			
 
				-            hwcaps |= HWCAP_VFPv3D16;
			
 
				-        if (has_list_item(cpuFeatures, "vfpv4"))
			
 
				-            hwcaps |= HWCAP_VFPv4;
			
 
				-        if (has_list_item(cpuFeatures, "neon"))
			
 
				-            hwcaps |= HWCAP_NEON;
			
 
				-        if (has_list_item(cpuFeatures, "idiva"))
			
 
				-            hwcaps |= HWCAP_IDIVA;
			
 
				-        if (has_list_item(cpuFeatures, "idivt"))
			
 
				-            hwcaps |= HWCAP_IDIVT;
			
 
				-        if (has_list_item(cpuFeatures, "idiv"))
			
 
				-            hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT;
			
 
				-        if (has_list_item(cpuFeatures, "iwmmxt"))
			
 
				-            hwcaps |= HWCAP_IWMMXT;
			
 
				-
			
 
				-        free(cpuFeatures);
			
 
				-    }
			
 
				-    return hwcaps;
			
 
				-#endif
			
 
				-}
			
 
				-#endif  /* __arm__ */
			
 
				-
			
 
				-/* Return the number of cpus present on a given device.
			
 
				- *
			
 
				- * To handle all weird kernel configurations, we need to compute the
			
 
				- * intersection of the 'present' and 'possible' CPU lists and count
			
 
				- * the result.
			
 
				- */
			
 
				-static int
			
 
				-get_cpu_count(void)
			
 
				-{
			
 
				-    CpuList cpus_present[1];
			
 
				-    CpuList cpus_possible[1];
			
 
				-
			
 
				-    cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present");
			
 
				-    cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible");
			
 
				-
			
 
				-    /* Compute the intersection of both sets to get the actual number of
			
 
				-     * CPU cores that can be used on this device by the kernel.
			
 
				-     */
			
 
				-    cpulist_and(cpus_present, cpus_possible);
			
 
				-
			
 
				-    return cpulist_count(cpus_present);
			
 
				-}
			
 
				-
			
 
				-static void
			
 
				-android_cpuInitFamily(void)
			
 
				-{
			
 
				-#if defined(__arm__)
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_ARM;
			
 
				-#elif defined(__i386__)
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_X86;
			
 
				-#elif defined(__mips64)
			
 
				-/* Needs to be before __mips__ since the compiler defines both */
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS64;
			
 
				-#elif defined(__mips__)
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS;
			
 
				-#elif defined(__aarch64__)
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_ARM64;
			
 
				-#elif defined(__x86_64__)
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_X86_64;
			
 
				-#else
			
 
				-    g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static void
			
 
				-android_cpuInit(void)
			
 
				-{
			
 
				-    char* cpuinfo = NULL;
			
 
				-    int   cpuinfo_len;
			
 
				-
			
 
				-    android_cpuInitFamily();
			
 
				-
			
 
				-    g_cpuFeatures = 0;
			
 
				-    g_cpuCount    = 1;
			
 
				-    g_inited      = 1;
			
 
				-
			
 
				-    cpuinfo_len = get_file_size("/proc/cpuinfo");
			
 
				-    if (cpuinfo_len < 0) {
			
 
				-      D("cpuinfo_len cannot be computed!");
			
 
				-      return;
			
 
				-    }
			
 
				-    cpuinfo = malloc(cpuinfo_len);
			
 
				-    if (cpuinfo == NULL) {
			
 
				-      D("cpuinfo buffer could not be allocated");
			
 
				-      return;
			
 
				-    }
			
 
				-    cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len);
			
 
				-    D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
			
 
				-      cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
			
 
				-
			
 
				-    if (cpuinfo_len < 0)  /* should not happen */ {
			
 
				-        free(cpuinfo);
			
 
				-        return;
			
 
				-    }
			
 
				-
			
 
				-    /* Count the CPU cores, the value may be 0 for single-core CPUs */
			
 
				-    g_cpuCount = get_cpu_count();
			
 
				-    if (g_cpuCount == 0) {
			
 
				-        g_cpuCount = 1;
			
 
				-    }
			
 
				-
			
 
				-    D("found cpuCount = %d\n", g_cpuCount);
			
 
				-
			
 
				-#ifdef __arm__
			
 
				-    {
			
 
				-        char*  features = NULL;
			
 
				-        char*  architecture = NULL;
			
 
				-
			
 
				-        /* Extract architecture from the "CPU Architecture" field.
			
 
				-         * The list is well-known, unlike the the output of
			
 
				-         * the 'Processor' field which can vary greatly.
			
 
				-         *
			
 
				-         * See the definition of the 'proc_arch' array in
			
 
				-         * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
			
 
				-         * same file.
			
 
				-         */
			
 
				-        char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
			
 
				-
			
 
				-        if (cpuArch != NULL) {
			
 
				-            char*  end;
			
 
				-            long   archNumber;
			
 
				-            int    hasARMv7 = 0;
			
 
				-
			
 
				-            D("found cpuArch = '%s'\n", cpuArch);
			
 
				-
			
 
				-            /* read the initial decimal number, ignore the rest */
			
 
				-            archNumber = strtol(cpuArch, &end, 10);
			
 
				-
			
 
				-            /* Here we assume that ARMv8 will be upwards compatible with v7
			
 
				-             * in the future. Unfortunately, there is no 'Features' field to
			
 
				-             * indicate that Thumb-2 is supported.
			
 
				-             */
			
 
				-            if (end > cpuArch && archNumber >= 7) {
			
 
				-                hasARMv7 = 1;
			
 
				-            }
			
 
				-
			
 
				-            /* Unfortunately, it seems that certain ARMv6-based CPUs
			
 
				-             * report an incorrect architecture number of 7!
			
 
				-             *
			
 
				-             * See http://code.google.com/p/android/issues/detail?id=10812
			
 
				-             *
			
 
				-             * We try to correct this by looking at the 'elf_format'
			
 
				-             * field reported by the 'Processor' field, which is of the
			
 
				-             * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
			
 
				-             * an ARMv6-one.
			
 
				-             */
			
 
				-            if (hasARMv7) {
			
 
				-                char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
			
 
				-                                                      "Processor");
			
 
				-                if (cpuProc != NULL) {
			
 
				-                    D("found cpuProc = '%s'\n", cpuProc);
			
 
				-                    if (has_list_item(cpuProc, "(v6l)")) {
			
 
				-                        D("CPU processor and architecture mismatch!!\n");
			
 
				-                        hasARMv7 = 0;
			
 
				-                    }
			
 
				-                    free(cpuProc);
			
 
				-                }
			
 
				-            }
			
 
				-
			
 
				-            if (hasARMv7) {
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
			
 
				-            }
			
 
				-
			
 
				-            /* The LDREX / STREX instructions are available from ARMv6 */
			
 
				-            if (archNumber >= 6) {
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
			
 
				-            }
			
 
				-
			
 
				-            free(cpuArch);
			
 
				-        }
			
 
				-
			
 
				-        /* Extract the list of CPU features from ELF hwcaps */
			
 
				-        uint32_t hwcaps = get_elf_hwcap(cpuinfo, cpuinfo_len);
			
 
				-
			
 
				-        if (hwcaps != 0) {
			
 
				-            int has_vfp = (hwcaps & HWCAP_VFP);
			
 
				-            int has_vfpv3 = (hwcaps & HWCAP_VFPv3);
			
 
				-            int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16);
			
 
				-            int has_vfpv4 = (hwcaps & HWCAP_VFPv4);
			
 
				-            int has_neon = (hwcaps & HWCAP_NEON);
			
 
				-            int has_idiva = (hwcaps & HWCAP_IDIVA);
			
 
				-            int has_idivt = (hwcaps & HWCAP_IDIVT);
			
 
				-            int has_iwmmxt = (hwcaps & HWCAP_IWMMXT);
			
 
				-
			
 
				-            // The kernel does a poor job at ensuring consistency when
			
 
				-            // describing CPU features. So lots of guessing is needed.
			
 
				-
			
 
				-            // 'vfpv4' implies VFPv3|VFP_FMA|FP16
			
 
				-            if (has_vfpv4)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3    |
			
 
				-                                 ANDROID_CPU_ARM_FEATURE_VFP_FP16 |
			
 
				-                                 ANDROID_CPU_ARM_FEATURE_VFP_FMA;
			
 
				-
			
 
				-            // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC,
			
 
				-            // a value of 'vfpv3' doesn't necessarily mean that the D32
			
 
				-            // feature is present, so be conservative. All CPUs in the
			
 
				-            // field that support D32 also support NEON, so this should
			
 
				-            // not be a problem in practice.
			
 
				-            if (has_vfpv3 || has_vfpv3d16)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
			
 
				-
			
 
				-            // 'vfp' is super ambiguous. Depending on the kernel, it can
			
 
				-            // either mean VFPv2 or VFPv3. Make it depend on ARMv7.
			
 
				-            if (has_vfp) {
			
 
				-              if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7)
			
 
				-                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
			
 
				-              else
			
 
				-                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
			
 
				-            }
			
 
				-
			
 
				-            // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA
			
 
				-            if (has_neon) {
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
			
 
				-                                 ANDROID_CPU_ARM_FEATURE_NEON |
			
 
				-                                 ANDROID_CPU_ARM_FEATURE_VFP_D32;
			
 
				-              if (has_vfpv4)
			
 
				-                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
			
 
				-            }
			
 
				-
			
 
				-            // VFPv3 implies VFPv2 and ARMv7
			
 
				-            if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 |
			
 
				-                                 ANDROID_CPU_ARM_FEATURE_ARMv7;
			
 
				-
			
 
				-            if (has_idiva)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
			
 
				-            if (has_idivt)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
			
 
				-
			
 
				-            if (has_iwmmxt)
			
 
				-                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
			
 
				-        }
			
 
				-
			
 
				-        /* Extract the cpuid value from various fields */
			
 
				-        // The CPUID value is broken up in several entries in /proc/cpuinfo.
			
 
				-        // This table is used to rebuild it from the entries.
			
 
				-        static const struct CpuIdEntry {
			
 
				-            const char* field;
			
 
				-            char        format;
			
 
				-            char        bit_lshift;
			
 
				-            char        bit_length;
			
 
				-        } cpu_id_entries[] = {
			
 
				-            { "CPU implementer", 'x', 24, 8 },
			
 
				-            { "CPU variant", 'x', 20, 4 },
			
 
				-            { "CPU part", 'x', 4, 12 },
			
 
				-            { "CPU revision", 'd', 0, 4 },
			
 
				-        };
			
 
				-        size_t i;
			
 
				-        D("Parsing /proc/cpuinfo to recover CPUID\n");
			
 
				-        for (i = 0;
			
 
				-             i < sizeof(cpu_id_entries)/sizeof(cpu_id_entries[0]);
			
 
				-             ++i) {
			
 
				-            const struct CpuIdEntry* entry = &cpu_id_entries[i];
			
 
				-            char* value = extract_cpuinfo_field(cpuinfo,
			
 
				-                                                cpuinfo_len,
			
 
				-                                                entry->field);
			
 
				-            if (value == NULL)
			
 
				-                continue;
			
 
				-
			
 
				-            D("field=%s value='%s'\n", entry->field, value);
			
 
				-            char* value_end = value + strlen(value);
			
 
				-            int val = 0;
			
 
				-            const char* start = value;
			
 
				-            const char* p;
			
 
				-            if (value[0] == '0' && (value[1] == 'x' || value[1] == 'X')) {
			
 
				-              start += 2;
			
 
				-              p = parse_hexadecimal(start, value_end, &val);
			
 
				-            } else if (entry->format == 'x')
			
 
				-              p = parse_hexadecimal(value, value_end, &val);
			
 
				-            else
			
 
				-              p = parse_decimal(value, value_end, &val);
			
 
				-
			
 
				-            if (p > (const char*)start) {
			
 
				-              val &= ((1 << entry->bit_length)-1);
			
 
				-              val <<= entry->bit_lshift;
			
 
				-              g_cpuIdArm |= (uint32_t) val;
			
 
				-            }
			
 
				-
			
 
				-            free(value);
			
 
				-        }
			
 
				-
			
 
				-        // Handle kernel configuration bugs that prevent the correct
			
 
				-        // reporting of CPU features.
			
 
				-        static const struct CpuFix {
			
 
				-            uint32_t  cpuid;
			
 
				-            uint64_t  or_flags;
			
 
				-        } cpu_fixes[] = {
			
 
				-            /* The Nexus 4 (Qualcomm Krait) kernel configuration
			
 
				-             * forgets to report IDIV support. */
			
 
				-            { 0x510006f2, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
			
 
				-                          ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
			
 
				-            { 0x510006f3, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
			
 
				-                          ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
			
 
				-        };
			
 
				-        size_t n;
			
 
				-        for (n = 0; n < sizeof(cpu_fixes)/sizeof(cpu_fixes[0]); ++n) {
			
 
				-            const struct CpuFix* entry = &cpu_fixes[n];
			
 
				-
			
 
				-            if (g_cpuIdArm == entry->cpuid)
			
 
				-                g_cpuFeatures |= entry->or_flags;
			
 
				-        }
			
 
				-
			
 
				-    }
			
 
				-#endif /* __arm__ */
			
 
				-
			
 
				-#ifdef __i386__
			
 
				-    int regs[4];
			
 
				-
			
 
				-/* According to http://en.wikipedia.org/wiki/CPUID */
			
 
				-#define VENDOR_INTEL_b  0x756e6547
			
 
				-#define VENDOR_INTEL_c  0x6c65746e
			
 
				-#define VENDOR_INTEL_d  0x49656e69
			
 
				-
			
 
				-    x86_cpuid(0, regs);
			
 
				-    int vendorIsIntel = (regs[1] == VENDOR_INTEL_b &&
			
 
				-                         regs[2] == VENDOR_INTEL_c &&
			
 
				-                         regs[3] == VENDOR_INTEL_d);
			
 
				-
			
 
				-    x86_cpuid(1, regs);
			
 
				-    if ((regs[2] & (1 << 9)) != 0) {
			
 
				-        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
			
 
				-    }
			
 
				-    if ((regs[2] & (1 << 23)) != 0) {
			
 
				-        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
			
 
				-    }
			
 
				-    if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) {
			
 
				-        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
			
 
				-    }
			
 
				-#endif
			
 
				-
			
 
				-    free(cpuinfo);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-AndroidCpuFamily
			
 
				-android_getCpuFamily(void)
			
 
				-{
			
 
				-    pthread_once(&g_once, android_cpuInit);
			
 
				-    return g_cpuFamily;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-uint64_t
			
 
				-android_getCpuFeaturesExt(void)
			
 
				-{
			
 
				-    pthread_once(&g_once, android_cpuInit);
			
 
				-    return g_cpuFeatures;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int
			
 
				-android_getCpuCount(void)
			
 
				-{
			
 
				-    pthread_once(&g_once, android_cpuInit);
			
 
				-    return g_cpuCount;
			
 
				-}
			
 
				-
			
 
				-static void
			
 
				-android_cpuInitDummy(void)
			
 
				-{
			
 
				-    g_inited = 1;
			
 
				-}
			
 
				-
			
 
				-int
			
 
				-android_setCpu(int cpu_count, uint64_t cpu_features)
			
 
				-{
			
 
				-    /* Fail if the library was already initialized. */
			
 
				-    if (g_inited)
			
 
				-        return 0;
			
 
				-
			
 
				-    android_cpuInitFamily();
			
 
				-    g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
			
 
				-    g_cpuFeatures = cpu_features;
			
 
				-    pthread_once(&g_once, android_cpuInitDummy);
			
 
				-
			
 
				-    return 1;
			
 
				-}
			
 
				-
			
 
				-#ifdef __arm__
			
 
				-uint32_t
			
 
				-android_getCpuIdArm(void)
			
 
				-{
			
 
				-    pthread_once(&g_once, android_cpuInit);
			
 
				-    return g_cpuIdArm;
			
 
				-}
			
 
				-
			
 
				-int
			
 
				-android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id)
			
 
				-{
			
 
				-    if (!android_setCpu(cpu_count, cpu_features))
			
 
				-        return 0;
			
 
				-
			
 
				-    g_cpuIdArm = cpu_id;
			
 
				-    return 1;
			
 
				-}
			
 
				-#endif  /* __arm__ */
			
 
				-
			
 
				-/*
			
 
				- * Technical note: Making sense of ARM's FPU architecture versions.
			
 
				- *
			
 
				- * FPA was ARM's first attempt at an FPU architecture. There is no Android
			
 
				- * device that actually uses it since this technology was already obsolete
			
 
				- * when the project started. If you see references to FPA instructions
			
 
				- * somewhere, you can be sure that this doesn't apply to Android at all.
			
 
				- *
			
 
				- * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of
			
 
				- * new versions / additions to it. ARM considers this obsolete right now,
			
 
				- * and no known Android device implements it either.
			
 
				- *
			
 
				- * VFPv2 added a few instructions to VFPv1, and is an *optional* extension
			
 
				- * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device
			
 
				- * supporting the 'armeabi' ABI doesn't necessarily support these.
			
 
				- *
			
 
				- * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used
			
 
				- * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated
			
 
				- * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means
			
 
				- * that it provides 16 double-precision FPU registers (d0-d15) and 32
			
 
				- * single-precision ones (s0-s31) which happen to be mapped to the same
			
 
				- * register banks.
			
 
				- *
			
 
				- * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16
			
 
				- * additional double precision registers (d16-d31). Note that there are
			
 
				- * still only 32 single precision registers.
			
 
				- *
			
 
				- * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision
			
 
				- * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which
			
 
				- * are not supported by Android. Note that it is not compatible with VFPv2.
			
 
				- *
			
 
				- * NOTE: The term 'VFPv3' usually designate either VFPv3-D16 or VFPv3-D32
			
 
				- *       depending on context. For example GCC uses it for VFPv3-D32, but
			
 
				- *       the Linux kernel code uses it for VFPv3-D16 (especially in
			
 
				- *       /proc/cpuinfo). Always try to use the full designation when
			
 
				- *       possible.
			
 
				- *
			
 
				- * NEON, a.k.a. "ARM Advanced SIMD" is an extension that provides
			
 
				- * instructions to perform parallel computations on vectors of 8, 16,
			
 
				- * 32, 64 and 128 bit quantities. NEON requires VFPv32-D32 since all
			
 
				- * NEON registers are also mapped to the same register banks.
			
 
				- *
			
 
				- * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to
			
 
				- * perform fused multiply-accumulate on VFP registers, as well as
			
 
				- * half-precision (16-bit) conversion operations.
			
 
				- *
			
 
				- * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision
			
 
				- * registers.
			
 
				- *
			
 
				- * VPFv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused
			
 
				- * multiply-accumulate instructions that work on the NEON registers.
			
 
				- *
			
 
				- * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32
			
 
				- *       depending on context.
			
 
				- *
			
 
				- * The following information was determined by scanning the binutils-2.22
			
 
				- * sources:
			
 
				- *
			
 
				- * Basic VFP instruction subsets:
			
 
				- *
			
 
				- * #define FPU_VFP_EXT_V1xD 0x08000000     // Base VFP instruction set.
			
 
				- * #define FPU_VFP_EXT_V1   0x04000000     // Double-precision insns.
			
 
				- * #define FPU_VFP_EXT_V2   0x02000000     // ARM10E VFPr1.
			
 
				- * #define FPU_VFP_EXT_V3xD 0x01000000     // VFPv3 single-precision.
			
 
				- * #define FPU_VFP_EXT_V3   0x00800000     // VFPv3 double-precision.
			
 
				- * #define FPU_NEON_EXT_V1  0x00400000     // Neon (SIMD) insns.
			
 
				- * #define FPU_VFP_EXT_D32  0x00200000     // Registers D16-D31.
			
 
				- * #define FPU_VFP_EXT_FP16 0x00100000     // Half-precision extensions.
			
 
				- * #define FPU_NEON_EXT_FMA 0x00080000     // Neon fused multiply-add
			
 
				- * #define FPU_VFP_EXT_FMA  0x00040000     // VFP fused multiply-add
			
 
				- *
			
 
				- * FPU types (excluding NEON)
			
 
				- *
			
 
				- * FPU_VFP_V1xD (EXT_V1xD)
			
 
				- *    |
			
 
				- *    +--------------------------+
			
 
				- *    |                          |
			
 
				- * FPU_VFP_V1 (+EXT_V1)       FPU_VFP_V3xD (+EXT_V2+EXT_V3xD)
			
 
				- *    |                          |
			
 
				- *    |                          |
			
 
				- * FPU_VFP_V2 (+EXT_V2)       FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA)
			
 
				- *    |
			
 
				- * FPU_VFP_V3D16 (+EXT_Vx3D+EXT_V3)
			
 
				- *    |
			
 
				- *    +--------------------------+
			
 
				- *    |                          |
			
 
				- * FPU_VFP_V3 (+EXT_D32)     FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA)
			
 
				- *    |                          |
			
 
				- *    |                      FPU_VFP_V4 (+EXT_D32)
			
 
				- *    |
			
 
				- * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA)
			
 
				- *
			
 
				- * VFP architectures:
			
 
				- *
			
 
				- * ARCH_VFP_V1xD  (EXT_V1xD)
			
 
				- *   |
			
 
				- *   +------------------+
			
 
				- *   |                  |
			
 
				- *   |             ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD)
			
 
				- *   |                  |
			
 
				- *   |             ARCH_VFP_V3xD_FP16 (+EXT_FP16)
			
 
				- *   |                  |
			
 
				- *   |             ARCH_VFP_V4_SP_D16 (+EXT_FMA)
			
 
				- *   |
			
 
				- * ARCH_VFP_V1 (+EXT_V1)
			
 
				- *   |
			
 
				- * ARCH_VFP_V2 (+EXT_V2)
			
 
				- *   |
			
 
				- * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V4 (+EXT_D32)
			
 
				- *   |                   |
			
 
				- *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
			
 
				- *   |
			
 
				- * ARCH_VFP_V3 (+EXT_D32)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
			
 
				- *   |
			
 
				- * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
			
 
				- *   |
			
 
				- * ARCH_NEON_FP16 (+EXT_FP16)
			
 
				- *
			
 
				- * -fpu=<name> values and their correspondance with FPU architectures above:
			
 
				- *
			
 
				- *   {"vfp",               FPU_ARCH_VFP_V2},
			
 
				- *   {"vfp9",              FPU_ARCH_VFP_V2},
			
 
				- *   {"vfp3",              FPU_ARCH_VFP_V3}, // For backwards compatbility.
			
 
				- *   {"vfp10",             FPU_ARCH_VFP_V2},
			
 
				- *   {"vfp10-r0",          FPU_ARCH_VFP_V1},
			
 
				- *   {"vfpxd",             FPU_ARCH_VFP_V1xD},
			
 
				- *   {"vfpv2",             FPU_ARCH_VFP_V2},
			
 
				- *   {"vfpv3",             FPU_ARCH_VFP_V3},
			
 
				- *   {"vfpv3-fp16",        FPU_ARCH_VFP_V3_FP16},
			
 
				- *   {"vfpv3-d16",         FPU_ARCH_VFP_V3D16},
			
 
				- *   {"vfpv3-d16-fp16",    FPU_ARCH_VFP_V3D16_FP16},
			
 
				- *   {"vfpv3xd",           FPU_ARCH_VFP_V3xD},
			
 
				- *   {"vfpv3xd-fp16",      FPU_ARCH_VFP_V3xD_FP16},
			
 
				- *   {"neon",              FPU_ARCH_VFP_V3_PLUS_NEON_V1},
			
 
				- *   {"neon-fp16",         FPU_ARCH_NEON_FP16},
			
 
				- *   {"vfpv4",             FPU_ARCH_VFP_V4},
			
 
				- *   {"vfpv4-d16",         FPU_ARCH_VFP_V4D16},
			
 
				- *   {"fpv4-sp-d16",       FPU_ARCH_VFP_V4_SP_D16},
			
 
				- *   {"neon-vfpv4",        FPU_ARCH_NEON_VFP_V4},
			
 
				- *
			
 
				- *
			
 
				- * Simplified diagram that only includes FPUs supported by Android:
			
 
				- * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI,
			
 
				- * all others are optional and must be probed at runtime.
			
 
				- *
			
 
				- * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V4 (+EXT_D32)
			
 
				- *   |                   |
			
 
				- *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
			
 
				- *   |
			
 
				- * ARCH_VFP_V3 (+EXT_D32)
			
 
				- *   |
			
 
				- *   +-------------------+
			
 
				- *   |                   |
			
 
				- *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
			
 
				- *   |
			
 
				- * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
			
 
				- *   |
			
 
				- * ARCH_NEON_FP16 (+EXT_FP16)
			
 
				- *
			
 
				- */
			
 
				-
			
 
				-#endif // defined(__le32__)
			
 
				-#endif
			
 
				-
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/android/cpu-features.h
+++ b/drivers/theoraplayer/src/YUV/android/cpu-features.h
@@ -1,212 +0,0 @@
 
				-/*
			
 
				- * Copyright (C) 2010 The Android Open Source Project
			
 
				- * All rights reserved.
			
 
				- *
			
 
				- * Redistribution and use in source and binary forms, with or without
			
 
				- * modification, are permitted provided that the following conditions
			
 
				- * are met:
			
 
				- *  * Redistributions of source code must retain the above copyright
			
 
				- *    notice, this list of conditions and the following disclaimer.
			
 
				- *  * Redistributions in binary form must reproduce the above copyright
			
 
				- *    notice, this list of conditions and the following disclaimer in
			
 
				- *    the documentation and/or other materials provided with the
			
 
				- *    distribution.
			
 
				- *
			
 
				- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
			
 
				- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
			
 
				- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
			
 
				- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
			
 
				- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
			
 
				- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
			
 
				- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
			
 
				- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
			
 
				- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
			
 
				- * SUCH DAMAGE.
			
 
				- */
			
 
				-#ifndef CPU_FEATURES_H
			
 
				-#define CPU_FEATURES_H
			
 
				-
			
 
				-#include <sys/cdefs.h>
			
 
				-#include <stdint.h>
			
 
				-
			
 
				-__BEGIN_DECLS
			
 
				-
			
 
				-typedef enum {
			
 
				-    ANDROID_CPU_FAMILY_UNKNOWN = 0,
			
 
				-    ANDROID_CPU_FAMILY_ARM,
			
 
				-    ANDROID_CPU_FAMILY_X86,
			
 
				-    ANDROID_CPU_FAMILY_MIPS,
			
 
				-
			
 
				-    ANDROID_CPU_FAMILY_MAX  /* do not remove */
			
 
				-
			
 
				-} AndroidCpuFamily;
			
 
				-
			
 
				-/* Return family of the device's CPU */
			
 
				-extern AndroidCpuFamily   android_getCpuFamily(void);
			
 
				-
			
 
				-/* The list of feature flags for ARM CPUs that can be recognized by the
			
 
				- * library. Value details are:
			
 
				- *
			
 
				- *   VFPv2:
			
 
				- *     CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
			
 
				- *     support these instructions. VFPv2 is a subset of VFPv3 so this will
			
 
				- *     be set whenever VFPv3 is set too.
			
 
				- *
			
 
				- *   ARMv7:
			
 
				- *     CPU supports the ARMv7-A basic instruction set.
			
 
				- *     This feature is mandated by the 'armeabi-v7a' ABI.
			
 
				- *
			
 
				- *   VFPv3:
			
 
				- *     CPU supports the VFPv3-D16 instruction set, providing hardware FPU
			
 
				- *     support for single and double precision floating point registers.
			
 
				- *     Note that only 16 FPU registers are available by default, unless
			
 
				- *     the D32 bit is set too. This feature is also mandated by the
			
 
				- *     'armeabi-v7a' ABI.
			
 
				- *
			
 
				- *   VFP_D32:
			
 
				- *     CPU VFP optional extension that provides 32 FPU registers,
			
 
				- *     instead of 16. Note that ARM mandates this feature is the 'NEON'
			
 
				- *     feature is implemented by the CPU.
			
 
				- *
			
 
				- *   NEON:
			
 
				- *     CPU FPU supports "ARM Advanced SIMD" instructions, also known as
			
 
				- *     NEON. Note that this mandates the VFP_D32 feature as well, per the
			
 
				- *     ARM Architecture specification.
			
 
				- *
			
 
				- *   VFP_FP16:
			
 
				- *     Half-width floating precision VFP extension. If set, the CPU
			
 
				- *     supports instructions to perform floating-point operations on
			
 
				- *     16-bit registers. This is part of the VFPv4 specification, but
			
 
				- *     not mandated by any Android ABI.
			
 
				- *
			
 
				- *   VFP_FMA:
			
 
				- *     Fused multiply-accumulate VFP instructions extension. Also part of
			
 
				- *     the VFPv4 specification, but not mandated by any Android ABI.
			
 
				- *
			
 
				- *   NEON_FMA:
			
 
				- *     Fused multiply-accumulate NEON instructions extension. Optional
			
 
				- *     extension from the VFPv4 specification, but not mandated by any
			
 
				- *     Android ABI.
			
 
				- *
			
 
				- *   IDIV_ARM:
			
 
				- *     Integer division available in ARM mode. Only available
			
 
				- *     on recent CPUs (e.g. Cortex-A15).
			
 
				- *
			
 
				- *   IDIV_THUMB2:
			
 
				- *     Integer division available in Thumb-2 mode. Only available
			
 
				- *     on recent CPUs (e.g. Cortex-A15).
			
 
				- *
			
 
				- *   iWMMXt:
			
 
				- *     Optional extension that adds MMX registers and operations to an
			
 
				- *     ARM CPU. This is only available on a few XScale-based CPU designs
			
 
				- *     sold by Marvell. Pretty rare in practice.
			
 
				- *
			
 
				- * If you want to tell the compiler to generate code that targets one of
			
 
				- * the feature set above, you should probably use one of the following
			
 
				- * flags (for more details, see technical note at the end of this file):
			
 
				- *
			
 
				- *   -mfpu=vfp
			
 
				- *   -mfpu=vfpv2
			
 
				- *     These are equivalent and tell GCC to use VFPv2 instructions for
			
 
				- *     floating-point operations. Use this if you want your code to
			
 
				- *     run on *some* ARMv6 devices, and any ARMv7-A device supported
			
 
				- *     by Android.
			
 
				- *
			
 
				- *     Generated code requires VFPv2 feature.
			
 
				- *
			
 
				- *   -mfpu=vfpv3-d16
			
 
				- *     Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
			
 
				- *     This should be generic code that runs on any CPU that supports the
			
 
				- *     'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
			
 
				- *
			
 
				- *     Generated code requires VFPv3 feature.
			
 
				- *
			
 
				- *   -mfpu=vfpv3
			
 
				- *     Tell GCC to use VFPv3 instructions with 32 FPU registers.
			
 
				- *     Generated code requires VFPv3|VFP_D32 features.
			
 
				- *
			
 
				- *   -mfpu=neon
			
 
				- *     Tell GCC to use VFPv3 instructions with 32 FPU registers, and
			
 
				- *     also support NEON intrinsics (see <arm_neon.h>).
			
 
				- *     Generated code requires VFPv3|VFP_D32|NEON features.
			
 
				- *
			
 
				- *   -mfpu=vfpv4-d16
			
 
				- *     Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
			
 
				- *
			
 
				- *   -mfpu=vfpv4
			
 
				- *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
			
 
				- *
			
 
				- *   -mfpu=neon-vfpv4
			
 
				- *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
			
 
				- *     features.
			
 
				- *
			
 
				- *   -mcpu=cortex-a7
			
 
				- *   -mcpu=cortex-a15
			
 
				- *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
			
 
				- *                             NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
			
 
				- *     This flag implies -mfpu=neon-vfpv4.
			
 
				- *
			
 
				- *   -mcpu=iwmmxt
			
 
				- *     Allows the use of iWMMXt instrinsics with GCC.
			
 
				- */
			
 
				-enum {
			
 
				-    ANDROID_CPU_ARM_FEATURE_ARMv7       = (1 << 0),
			
 
				-    ANDROID_CPU_ARM_FEATURE_VFPv3       = (1 << 1),
			
 
				-    ANDROID_CPU_ARM_FEATURE_NEON        = (1 << 2),
			
 
				-    ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
			
 
				-    ANDROID_CPU_ARM_FEATURE_VFPv2       = (1 << 4),
			
 
				-    ANDROID_CPU_ARM_FEATURE_VFP_D32     = (1 << 5),
			
 
				-    ANDROID_CPU_ARM_FEATURE_VFP_FP16    = (1 << 6),
			
 
				-    ANDROID_CPU_ARM_FEATURE_VFP_FMA     = (1 << 7),
			
 
				-    ANDROID_CPU_ARM_FEATURE_NEON_FMA    = (1 << 8),
			
 
				-    ANDROID_CPU_ARM_FEATURE_IDIV_ARM    = (1 << 9),
			
 
				-    ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
			
 
				-    ANDROID_CPU_ARM_FEATURE_iWMMXt      = (1 << 11),
			
 
				-};
			
 
				-
			
 
				-enum {
			
 
				-    ANDROID_CPU_X86_FEATURE_SSSE3  = (1 << 0),
			
 
				-    ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
			
 
				-    ANDROID_CPU_X86_FEATURE_MOVBE  = (1 << 2),
			
 
				-};
			
 
				-
			
 
				-// libtheoraplayer addition, renamed this to "Ext" as not to conflict with your own project if you've included cpu-features.c in it
			
 
				-//extern uint64_t    android_getCpuFeaturesExt(void);
			
 
				-#define android_getCpuFeaturesExt android_getCpuFeatures
			
 
				-
			
 
				-/* Return the number of CPU cores detected on this device. */
			
 
				-extern int         android_getCpuCount(void);
			
 
				-
			
 
				-/* The following is used to force the CPU count and features
			
 
				- * mask in sandboxed processes. Under 4.1 and higher, these processes
			
 
				- * cannot access /proc, which is the only way to get information from
			
 
				- * the kernel about the current hardware (at least on ARM).
			
 
				- *
			
 
				- * It _must_ be called only once, and before any android_getCpuXXX
			
 
				- * function, any other case will fail.
			
 
				- *
			
 
				- * This function return 1 on success, and 0 on failure.
			
 
				- */
			
 
				-extern int android_setCpu(int      cpu_count,
			
 
				-                          uint64_t cpu_features);
			
 
				-
			
 
				-#ifdef __arm__
			
 
				-/* Retrieve the ARM 32-bit CPUID value from the kernel.
			
 
				- * Note that this cannot work on sandboxed processes under 4.1 and
			
 
				- * higher, unless you called android_setCpuArm() before.
			
 
				- */
			
 
				-extern uint32_t android_getCpuIdArm(void);
			
 
				-
			
 
				-/* An ARM-specific variant of android_setCpu() that also allows you
			
 
				- * to set the ARM CPUID field.
			
 
				- */
			
 
				-extern int android_setCpuArm(int      cpu_count,
			
 
				-                             uint64_t cpu_features,
			
 
				-                             uint32_t cpu_id);
			
 
				-#endif
			
 
				-
			
 
				-__END_DECLS
			
 
				-
			
 
				-#endif /* CPU_FEATURES_H */
			
--- a/drivers/theoraplayer/src/YUV/libyuv/LICENSE
+++ b/drivers/theoraplayer/src/YUV/libyuv/LICENSE
@@ -1,29 +0,0 @@
 
				-Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				-
			
 
				-Redistribution and use in source and binary forms, with or without
			
 
				-modification, are permitted provided that the following conditions are
			
 
				-met:
			
 
				-
			
 
				-  * Redistributions of source code must retain the above copyright
			
 
				-    notice, this list of conditions and the following disclaimer.
			
 
				-
			
 
				-  * Redistributions in binary form must reproduce the above copyright
			
 
				-    notice, this list of conditions and the following disclaimer in
			
 
				-    the documentation and/or other materials provided with the
			
 
				-    distribution.
			
 
				-
			
 
				-  * Neither the name of Google nor the names of its contributors may
			
 
				-    be used to endorse or promote products derived from this software
			
 
				-    without specific prior written permission.
			
 
				-
			
 
				-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
			
 
				-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
			
 
				-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
			
 
				-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
			
 
				-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
			
 
				-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
			
 
				-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
			
 
				-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
			
 
				-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
--- a/drivers/theoraplayer/src/YUV/libyuv/LICENSE_THIRD_PARTY
+++ b/drivers/theoraplayer/src/YUV/libyuv/LICENSE_THIRD_PARTY
@@ -1,8 +0,0 @@
 
				-This source tree contains third party source code which is governed by third
			
 
				-party licenses. This file contains references to files which are under other
			
 
				-licenses than the one provided in the LICENSE file in the root of the source
			
 
				-tree.
			
 
				-
			
 
				-Files governed by third party licenses:
			
 
				-source/x86inc.asm
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv.h
@@ -1,33 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/compare.h"
			
 
				-#include "libyuv/convert.h"
			
 
				-#include "libyuv/convert_argb.h"
			
 
				-#include "libyuv/convert_from.h"
			
 
				-#include "libyuv/convert_from_argb.h"
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-#include "libyuv/rotate_argb.h"
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/scale.h"
			
 
				-#include "libyuv/scale_argb.h"
			
 
				-#include "libyuv/scale_row.h"
			
 
				-#include "libyuv/version.h"
			
 
				-#include "libyuv/video_common.h"
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/basic_types.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/basic_types.h
@@ -1,118 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_BASIC_TYPES_H_
			
 
				-
			
 
				-#include <stddef.h>  // for NULL, size_t
			
 
				-
			
 
				-#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
			
 
				-#include <sys/types.h>  // for uintptr_t on x86
			
 
				-#else
			
 
				-#include <stdint.h>  // for uintptr_t
			
 
				-#endif
			
 
				-
			
 
				-#ifndef GG_LONGLONG
			
 
				-#ifndef INT_TYPES_DEFINED
			
 
				-#define INT_TYPES_DEFINED
			
 
				-#ifdef COMPILER_MSVC
			
 
				-typedef unsigned __int64 uint64;
			
 
				-typedef __int64 int64;
			
 
				-#ifndef INT64_C
			
 
				-#define INT64_C(x) x ## I64
			
 
				-#endif
			
 
				-#ifndef UINT64_C
			
 
				-#define UINT64_C(x) x ## UI64
			
 
				-#endif
			
 
				-#define INT64_F "I64"
			
 
				-#else  // COMPILER_MSVC
			
 
				-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
			
 
				-typedef unsigned long uint64;  // NOLINT
			
 
				-typedef long int64;  // NOLINT
			
 
				-#ifndef INT64_C
			
 
				-#define INT64_C(x) x ## L
			
 
				-#endif
			
 
				-#ifndef UINT64_C
			
 
				-#define UINT64_C(x) x ## UL
			
 
				-#endif
			
 
				-#define INT64_F "l"
			
 
				-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
			
 
				-typedef unsigned long long uint64;  // NOLINT
			
 
				-typedef long long int64;  // NOLINT
			
 
				-#ifndef INT64_C
			
 
				-#define INT64_C(x) x ## LL
			
 
				-#endif
			
 
				-#ifndef UINT64_C
			
 
				-#define UINT64_C(x) x ## ULL
			
 
				-#endif
			
 
				-#define INT64_F "ll"
			
 
				-#endif  // __LP64__
			
 
				-#endif  // COMPILER_MSVC
			
 
				-typedef unsigned int uint32;
			
 
				-typedef int int32;
			
 
				-typedef unsigned short uint16;  // NOLINT
			
 
				-typedef short int16;  // NOLINT
			
 
				-typedef unsigned char uint8;
			
 
				-typedef signed char int8;
			
 
				-#endif  // INT_TYPES_DEFINED
			
 
				-#endif  // GG_LONGLONG
			
 
				-
			
 
				-// Detect compiler is for x86 or x64.
			
 
				-#if defined(__x86_64__) || defined(_M_X64) || \
			
 
				-    defined(__i386__) || defined(_M_IX86)
			
 
				-#define CPU_X86 1
			
 
				-#endif
			
 
				-// Detect compiler is for ARM.
			
 
				-#if defined(__arm__) || defined(_M_ARM)
			
 
				-#define CPU_ARM 1
			
 
				-#endif
			
 
				-
			
 
				-#ifndef ALIGNP
			
 
				-#ifdef __cplusplus
			
 
				-#define ALIGNP(p, t) \
			
 
				-    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
			
 
				-    ((t) - 1)) & ~((t) - 1))))
			
 
				-#else
			
 
				-#define ALIGNP(p, t) \
			
 
				-    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_API)
			
 
				-#if defined(_WIN32) || defined(__CYGWIN__)
			
 
				-#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
			
 
				-#define LIBYUV_API __declspec(dllexport)
			
 
				-#elif defined(LIBYUV_USING_SHARED_LIBRARY)
			
 
				-#define LIBYUV_API __declspec(dllimport)
			
 
				-#else
			
 
				-#define LIBYUV_API
			
 
				-#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
			
 
				-#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
			
 
				-    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
			
 
				-    defined(LIBYUV_USING_SHARED_LIBRARY))
			
 
				-#define LIBYUV_API __attribute__ ((visibility ("default")))
			
 
				-#else
			
 
				-#define LIBYUV_API
			
 
				-#endif  // __GNUC__
			
 
				-#endif  // LIBYUV_API
			
 
				-
			
 
				-#define LIBYUV_BOOL int
			
 
				-#define LIBYUV_FALSE 0
			
 
				-#define LIBYUV_TRUE 1
			
 
				-
			
 
				-// Visual C x86 or GCC little endian.
			
 
				-#if defined(__x86_64__) || defined(_M_X64) || \
			
 
				-  defined(__i386__) || defined(_M_IX86) || \
			
 
				-  defined(__arm__) || defined(_M_ARM) || \
			
 
				-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
			
 
				-#define LIBYUV_LITTLE_ENDIAN
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/compare.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/compare.h
@@ -1,73 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_COMPARE_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Compute a hash for specified memory. Seed of 5381 recommended.
			
 
				-LIBYUV_API
			
 
				-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
			
 
				-
			
 
				-// Sum Square Error - used to compute Mean Square Error or PSNR.
			
 
				-LIBYUV_API
			
 
				-uint64 ComputeSumSquareError(const uint8* src_a,
			
 
				-                             const uint8* src_b, int count);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
			
 
				-                                  const uint8* src_b, int stride_b,
			
 
				-                                  int width, int height);
			
 
				-
			
 
				-static const int kMaxPsnr = 128;
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double CalcFramePsnr(const uint8* src_a, int stride_a,
			
 
				-                     const uint8* src_b, int stride_b,
			
 
				-                     int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double I420Psnr(const uint8* src_y_a, int stride_y_a,
			
 
				-                const uint8* src_u_a, int stride_u_a,
			
 
				-                const uint8* src_v_a, int stride_v_a,
			
 
				-                const uint8* src_y_b, int stride_y_b,
			
 
				-                const uint8* src_u_b, int stride_u_b,
			
 
				-                const uint8* src_v_b, int stride_v_b,
			
 
				-                int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double CalcFrameSsim(const uint8* src_a, int stride_a,
			
 
				-                     const uint8* src_b, int stride_b,
			
 
				-                     int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double I420Ssim(const uint8* src_y_a, int stride_y_a,
			
 
				-                const uint8* src_u_a, int stride_u_a,
			
 
				-                const uint8* src_v_a, int stride_v_a,
			
 
				-                const uint8* src_y_b, int stride_y_b,
			
 
				-                const uint8* src_u_b, int stride_u_b,
			
 
				-                const uint8* src_v_b, int stride_v_b,
			
 
				-                int width, int height);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert.h
@@ -1,254 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_CONVERT_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-// TODO(fbarchard): Remove the following headers includes.
			
 
				-#include "libyuv/convert_from.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Convert I444 to I420.
			
 
				-LIBYUV_API
			
 
				-int I444ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I422 to I420.
			
 
				-LIBYUV_API
			
 
				-int I422ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I411 to I420.
			
 
				-LIBYUV_API
			
 
				-int I411ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Copy I420 to I420.
			
 
				-#define I420ToI420 I420Copy
			
 
				-LIBYUV_API
			
 
				-int I420Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Convert I400 (grey) to I420.
			
 
				-LIBYUV_API
			
 
				-int I400ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert NV12 to I420.
			
 
				-LIBYUV_API
			
 
				-int NV12ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_uv, int src_stride_uv,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert NV21 to I420.
			
 
				-LIBYUV_API
			
 
				-int NV21ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_vu, int src_stride_vu,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert YUY2 to I420.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert UYVY to I420.
			
 
				-LIBYUV_API
			
 
				-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert M420 to I420.
			
 
				-LIBYUV_API
			
 
				-int M420ToI420(const uint8* src_m420, int src_stride_m420,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert Q420 to I420.
			
 
				-LIBYUV_API
			
 
				-int Q420ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// ARGB little endian (bgra in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// BGRA little endian (argb in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// ABGR little endian (rgba in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// RGBA little endian (abgr in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// RGB little endian (bgr in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-                uint8* dst_y, int dst_stride_y,
			
 
				-                uint8* dst_u, int dst_stride_u,
			
 
				-                uint8* dst_v, int dst_stride_v,
			
 
				-                int width, int height);
			
 
				-
			
 
				-// RGB big endian (rgb in memory) to I420.
			
 
				-LIBYUV_API
			
 
				-int RAWToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-              uint8* dst_y, int dst_stride_y,
			
 
				-              uint8* dst_u, int dst_stride_u,
			
 
				-              uint8* dst_v, int dst_stride_v,
			
 
				-              int width, int height);
			
 
				-
			
 
				-// RGB16 (RGBP fourcc) little endian to I420.
			
 
				-LIBYUV_API
			
 
				-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-                 uint8* dst_y, int dst_stride_y,
			
 
				-                 uint8* dst_u, int dst_stride_u,
			
 
				-                 uint8* dst_v, int dst_stride_v,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// RGB15 (RGBO fourcc) little endian to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-                   uint8* dst_y, int dst_stride_y,
			
 
				-                   uint8* dst_u, int dst_stride_u,
			
 
				-                   uint8* dst_v, int dst_stride_v,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// RGB12 (R444 fourcc) little endian to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
			
 
				-                   uint8* dst_y, int dst_stride_y,
			
 
				-                   uint8* dst_u, int dst_stride_u,
			
 
				-                   uint8* dst_v, int dst_stride_v,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-// src_width/height provided by capture.
			
 
				-// dst_width/height for clipping determine final size.
			
 
				-LIBYUV_API
			
 
				-int MJPGToI420(const uint8* sample, size_t sample_size,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int src_width, int src_height,
			
 
				-               int dst_width, int dst_height);
			
 
				-
			
 
				-// Query size of MJPG in pixels.
			
 
				-LIBYUV_API
			
 
				-int MJPGSize(const uint8* sample, size_t sample_size,
			
 
				-             int* width, int* height);
			
 
				-#endif
			
 
				-
			
 
				-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
			
 
				-
			
 
				-// Convert camera sample to I420 with cropping, rotation and vertical flip.
			
 
				-// "src_size" is needed to parse MJPG.
			
 
				-// "dst_stride_y" number of bytes in a row of the dst_y plane.
			
 
				-//   Normally this would be the same as dst_width, with recommended alignment
			
 
				-//   to 16 bytes for better efficiency.
			
 
				-//   If rotation of 90 or 270 is used, stride is affected. The caller should
			
 
				-//   allocate the I420 buffer according to rotation.
			
 
				-// "dst_stride_u" number of bytes in a row of the dst_u plane.
			
 
				-//   Normally this would be the same as (dst_width + 1) / 2, with
			
 
				-//   recommended alignment to 16 bytes for better efficiency.
			
 
				-//   If rotation of 90 or 270 is used, stride is affected.
			
 
				-// "crop_x" and "crop_y" are starting position for cropping.
			
 
				-//   To center, crop_x = (src_width - dst_width) / 2
			
 
				-//              crop_y = (src_height - dst_height) / 2
			
 
				-// "src_width" / "src_height" is size of src_frame in pixels.
			
 
				-//   "src_height" can be negative indicating a vertically flipped image source.
			
 
				-// "crop_width" / "crop_height" is the size to crop the src to.
			
 
				-//    Must be less than or equal to src_width/src_height
			
 
				-//    Cropping parameters are pre-rotation.
			
 
				-// "rotation" can be 0, 90, 180 or 270.
			
 
				-// "format" is a fourcc. ie 'I420', 'YUY2'
			
 
				-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
			
 
				-LIBYUV_API
			
 
				-int ConvertToI420(const uint8* src_frame, size_t src_size,
			
 
				-                  uint8* dst_y, int dst_stride_y,
			
 
				-                  uint8* dst_u, int dst_stride_u,
			
 
				-                  uint8* dst_v, int dst_stride_v,
			
 
				-                  int crop_x, int crop_y,
			
 
				-                  int src_width, int src_height,
			
 
				-                  int crop_width, int crop_height,
			
 
				-                  enum RotationMode rotation,
			
 
				-                  uint32 format);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_argb.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_argb.h
@@ -1,225 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-// TODO(fbarchard): Remove the following headers includes
			
 
				-#include "libyuv/convert_from.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-
			
 
				-// TODO(fbarchard): This set of functions should exactly match convert.h
			
 
				-// Add missing Q420.
			
 
				-// TODO(fbarchard): Add tests. Create random content of right size and convert
			
 
				-// with C vs Opt and or to I420 and compare.
			
 
				-// TODO(fbarchard): Some of these functions lack parameter setting.
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Alias.
			
 
				-#define ARGBToARGB ARGBCopy
			
 
				-
			
 
				-// Copy ARGB to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Convert I420 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I422 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I422ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I444 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I444ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I411 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I411ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I400 (grey) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I400ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Alias.
			
 
				-#define YToARGB I400ToARGB_Reference
			
 
				-
			
 
				-// Convert I400 to ARGB. Reverse of ARGBToI400.
			
 
				-LIBYUV_API
			
 
				-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
			
 
				-                         uint8* dst_argb, int dst_stride_argb,
			
 
				-                         int width, int height);
			
 
				-
			
 
				-// Convert NV12 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int NV12ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_uv, int src_stride_uv,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert NV21 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int NV21ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_vu, int src_stride_vu,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert M420 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// TODO(fbarchard): Convert Q420 to ARGB.
			
 
				-// LIBYUV_API
			
 
				-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-//                const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-//                uint8* dst_argb, int dst_stride_argb,
			
 
				-//                int width, int height);
			
 
				-
			
 
				-// Convert YUY2 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert UYVY to ARGB.
			
 
				-LIBYUV_API
			
 
				-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// BGRA little endian (argb in memory) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// ABGR little endian (rgba in memory) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// RGBA little endian (abgr in memory) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Deprecated function name.
			
 
				-#define BG24ToARGB RGB24ToARGB
			
 
				-
			
 
				-// RGB little endian (bgr in memory) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height);
			
 
				-
			
 
				-// RGB big endian (rgb in memory) to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height);
			
 
				-
			
 
				-// RGB16 (RGBP fourcc) little endian to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// RGB15 (RGBO fourcc) little endian to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// RGB12 (R444 fourcc) little endian to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-// src_width/height provided by capture
			
 
				-// dst_width/height for clipping determine final size.
			
 
				-LIBYUV_API
			
 
				-int MJPGToARGB(const uint8* sample, size_t sample_size,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int src_width, int src_height,
			
 
				-               int dst_width, int dst_height);
			
 
				-#endif
			
 
				-
			
 
				-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
			
 
				-
			
 
				-// Convert camera sample to ARGB with cropping, rotation and vertical flip.
			
 
				-// "src_size" is needed to parse MJPG.
			
 
				-// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
			
 
				-//   Normally this would be the same as dst_width, with recommended alignment
			
 
				-//   to 16 bytes for better efficiency.
			
 
				-//   If rotation of 90 or 270 is used, stride is affected. The caller should
			
 
				-//   allocate the I420 buffer according to rotation.
			
 
				-// "dst_stride_u" number of bytes in a row of the dst_u plane.
			
 
				-//   Normally this would be the same as (dst_width + 1) / 2, with
			
 
				-//   recommended alignment to 16 bytes for better efficiency.
			
 
				-//   If rotation of 90 or 270 is used, stride is affected.
			
 
				-// "crop_x" and "crop_y" are starting position for cropping.
			
 
				-//   To center, crop_x = (src_width - dst_width) / 2
			
 
				-//              crop_y = (src_height - dst_height) / 2
			
 
				-// "src_width" / "src_height" is size of src_frame in pixels.
			
 
				-//   "src_height" can be negative indicating a vertically flipped image source.
			
 
				-// "crop_width" / "crop_height" is the size to crop the src to.
			
 
				-//    Must be less than or equal to src_width/src_height
			
 
				-//    Cropping parameters are pre-rotation.
			
 
				-// "rotation" can be 0, 90, 180 or 270.
			
 
				-// "format" is a fourcc. ie 'I420', 'YUY2'
			
 
				-// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
			
 
				-LIBYUV_API
			
 
				-int ConvertToARGB(const uint8* src_frame, size_t src_size,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int crop_x, int crop_y,
			
 
				-                  int src_width, int src_height,
			
 
				-                  int crop_width, int crop_height,
			
 
				-                  enum RotationMode rotation,
			
 
				-                  uint32 format);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_from.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_from.h
@@ -1,173 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_CONVERT_FROM_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// See Also convert.h for conversions from formats to I420.
			
 
				-
			
 
				-// I420Copy in convert to I420ToI420.
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToI422(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToI444(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToI411(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
			
 
				-LIBYUV_API
			
 
				-int I400Copy(const uint8* src_y, int src_stride_y,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// TODO(fbarchard): I420ToM420
			
 
				-// TODO(fbarchard): I420ToQ420
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToNV12(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_uv, int dst_stride_uv,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToNV21(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_vu, int dst_stride_vu,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToYUY2(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_frame, int dst_stride_frame,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToUYVY(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_frame, int dst_stride_frame,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToBGRA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToABGR(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToRGBA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_rgba, int dst_stride_rgba,
			
 
				-               int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToRGB24(const uint8* src_y, int src_stride_y,
			
 
				-                const uint8* src_u, int src_stride_u,
			
 
				-                const uint8* src_v, int src_stride_v,
			
 
				-                uint8* dst_frame, int dst_stride_frame,
			
 
				-                int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToRAW(const uint8* src_y, int src_stride_y,
			
 
				-              const uint8* src_u, int src_stride_u,
			
 
				-              const uint8* src_v, int src_stride_v,
			
 
				-              uint8* dst_frame, int dst_stride_frame,
			
 
				-              int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_u, int src_stride_u,
			
 
				-                 const uint8* src_v, int src_stride_v,
			
 
				-                 uint8* dst_frame, int dst_stride_frame,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
			
 
				-                   const uint8* src_u, int src_stride_u,
			
 
				-                   const uint8* src_v, int src_stride_v,
			
 
				-                   uint8* dst_frame, int dst_stride_frame,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
			
 
				-                   const uint8* src_u, int src_stride_u,
			
 
				-                   const uint8* src_v, int src_stride_v,
			
 
				-                   uint8* dst_frame, int dst_stride_frame,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
			
 
				-
			
 
				-// Convert I420 to specified format.
			
 
				-// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
			
 
				-//    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
			
 
				-LIBYUV_API
			
 
				-int ConvertFromI420(const uint8* y, int y_stride,
			
 
				-                    const uint8* u, int u_stride,
			
 
				-                    const uint8* v, int v_stride,
			
 
				-                    uint8* dst_sample, int dst_sample_stride,
			
 
				-                    int width, int height,
			
 
				-                    uint32 format);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_from_argb.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/convert_from_argb.h
@@ -1,168 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Copy ARGB to ARGB.
			
 
				-#define ARGBToARGB ARGBCopy
			
 
				-LIBYUV_API
			
 
				-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Convert ARGB To BGRA. (alias)
			
 
				-#define ARGBToBGRA BGRAToARGB
			
 
				-LIBYUV_API
			
 
				-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To ABGR. (alias)
			
 
				-#define ARGBToABGR ABGRToARGB
			
 
				-LIBYUV_API
			
 
				-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To RGBA.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To RGB24.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_rgb24, int dst_stride_rgb24,
			
 
				-                int width, int height);
			
 
				-
			
 
				-// Convert ARGB To RAW.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_rgb, int dst_stride_rgb,
			
 
				-              int width, int height);
			
 
				-
			
 
				-// Convert ARGB To RGB565.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// Convert ARGB To ARGB1555.
			
 
				-LIBYUV_API
			
 
				-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb1555, int dst_stride_argb1555,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// Convert ARGB To ARGB4444.
			
 
				-LIBYUV_API
			
 
				-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb4444, int dst_stride_argb4444,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// Convert ARGB To I444.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To I422.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To I420. (also in convert.h)
			
 
				-LIBYUV_API
			
 
				-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB to J420. (JPeg full range I420).
			
 
				-LIBYUV_API
			
 
				-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yj, int dst_stride_yj,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To I411.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB to J400. (JPeg full range).
			
 
				-LIBYUV_API
			
 
				-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yj, int dst_stride_yj,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB to I400.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To NV12.
			
 
				-LIBYUV_API
			
 
				-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_uv, int dst_stride_uv,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To NV21.
			
 
				-LIBYUV_API
			
 
				-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_vu, int dst_stride_vu,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To NV21.
			
 
				-LIBYUV_API
			
 
				-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_vu, int dst_stride_vu,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To YUY2.
			
 
				-LIBYUV_API
			
 
				-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yuy2, int dst_stride_yuy2,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert ARGB To UYVY.
			
 
				-LIBYUV_API
			
 
				-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_uyvy, int dst_stride_uyvy,
			
 
				-               int width, int height);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/cpu_id.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/cpu_id.h
@@ -1,81 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_CPU_ID_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// TODO(fbarchard): Consider overlapping bits for different architectures.
			
 
				-// Internal flag to indicate cpuid requires initialization.
			
 
				-#define kCpuInit 0x1
			
 
				-
			
 
				-// These flags are only valid on ARM processors.
			
 
				-static const int kCpuHasARM = 0x2;
			
 
				-static const int kCpuHasNEON = 0x4;
			
 
				-// 0x8 reserved for future ARM flag.
			
 
				-
			
 
				-// These flags are only valid on x86 processors.
			
 
				-static const int kCpuHasX86 = 0x10;
			
 
				-static const int kCpuHasSSE2 = 0x20;
			
 
				-static const int kCpuHasSSSE3 = 0x40;
			
 
				-static const int kCpuHasSSE41 = 0x80;
			
 
				-static const int kCpuHasSSE42 = 0x100;
			
 
				-static const int kCpuHasAVX = 0x200;
			
 
				-static const int kCpuHasAVX2 = 0x400;
			
 
				-static const int kCpuHasERMS = 0x800;
			
 
				-static const int kCpuHasFMA3 = 0x1000;
			
 
				-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
			
 
				-
			
 
				-// These flags are only valid on MIPS processors.
			
 
				-static const int kCpuHasMIPS = 0x10000;
			
 
				-static const int kCpuHasMIPS_DSP = 0x20000;
			
 
				-static const int kCpuHasMIPS_DSPR2 = 0x40000;
			
 
				-
			
 
				-// Internal function used to auto-init.
			
 
				-LIBYUV_API
			
 
				-int InitCpuFlags(void);
			
 
				-
			
 
				-// Internal function for parsing /proc/cpuinfo.
			
 
				-LIBYUV_API
			
 
				-int ArmCpuCaps(const char* cpuinfo_name);
			
 
				-
			
 
				-// Detect CPU has SSE2 etc.
			
 
				-// Test_flag parameter should be one of kCpuHas constants above.
			
 
				-// returns non-zero if instruction set is detected
			
 
				-static __inline int TestCpuFlag(int test_flag) {
			
 
				-  LIBYUV_API extern int cpu_info_;
			
 
				-  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
			
 
				-}
			
 
				-
			
 
				-// For testing, allow CPU flags to be disabled.
			
 
				-// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
			
 
				-// MaskCpuFlags(-1) to enable all cpu specific optimizations.
			
 
				-// MaskCpuFlags(0) to disable all cpu specific optimizations.
			
 
				-LIBYUV_API
			
 
				-void MaskCpuFlags(int enable_flags);
			
 
				-
			
 
				-// Low level cpuid for X86. Returns zeros on other CPUs.
			
 
				-// eax is the info type that you want.
			
 
				-// ecx is typically the cpu number, and should normally be zero.
			
 
				-LIBYUV_API
			
 
				-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/format_conversion.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/format_conversion.h
@@ -1,168 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Convert Bayer RGB formats to I420.
			
 
				-LIBYUV_API
			
 
				-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_y, int dst_stride_y,
			
 
				-                    uint8* dst_u, int dst_stride_u,
			
 
				-                    uint8* dst_v, int dst_stride_v,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_y, int dst_stride_y,
			
 
				-                    uint8* dst_u, int dst_stride_u,
			
 
				-                    uint8* dst_v, int dst_stride_v,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_y, int dst_stride_y,
			
 
				-                    uint8* dst_u, int dst_stride_u,
			
 
				-                    uint8* dst_v, int dst_stride_v,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_y, int dst_stride_y,
			
 
				-                    uint8* dst_u, int dst_stride_u,
			
 
				-                    uint8* dst_v, int dst_stride_v,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Temporary API mapper.
			
 
				-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
			
 
				-    BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                uint8* dst_y, int dst_stride_y,
			
 
				-                uint8* dst_u, int dst_stride_u,
			
 
				-                uint8* dst_v, int dst_stride_v,
			
 
				-                int width, int height,
			
 
				-                uint32 src_fourcc_bayer);
			
 
				-
			
 
				-// Convert I420 to Bayer RGB formats.
			
 
				-LIBYUV_API
			
 
				-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
			
 
				-                    const uint8* src_u, int src_stride_u,
			
 
				-                    const uint8* src_v, int src_stride_v,
			
 
				-                    uint8* dst_frame, int dst_stride_frame,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
			
 
				-                    const uint8* src_u, int src_stride_u,
			
 
				-                    const uint8* src_v, int src_stride_v,
			
 
				-                    uint8* dst_frame, int dst_stride_frame,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
			
 
				-                    const uint8* src_u, int src_stride_u,
			
 
				-                    const uint8* src_v, int src_stride_v,
			
 
				-                    uint8* dst_frame, int dst_stride_frame,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
			
 
				-                    const uint8* src_u, int src_stride_u,
			
 
				-                    const uint8* src_v, int src_stride_v,
			
 
				-                    uint8* dst_frame, int dst_stride_frame,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Temporary API mapper.
			
 
				-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
			
 
				-    I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToBayer(const uint8* src_y, int src_stride_y,
			
 
				-                const uint8* src_u, int src_stride_u,
			
 
				-                const uint8* src_v, int src_stride_v,
			
 
				-                uint8* dst_frame, int dst_stride_frame,
			
 
				-                int width, int height,
			
 
				-                uint32 dst_fourcc_bayer);
			
 
				-
			
 
				-// Convert Bayer RGB formats to ARGB.
			
 
				-LIBYUV_API
			
 
				-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Temporary API mapper.
			
 
				-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height,
			
 
				-                uint32 src_fourcc_bayer);
			
 
				-
			
 
				-// Converts ARGB to Bayer RGB formats.
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Temporary API mapper.
			
 
				-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                int width, int height,
			
 
				-                uint32 dst_fourcc_bayer);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/mjpeg_decoder.h
@@ -1,201 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-// NOTE: For a simplified public API use convert.h MJPGToI420().
			
 
				-
			
 
				-struct jpeg_common_struct;
			
 
				-struct jpeg_decompress_struct;
			
 
				-struct jpeg_source_mgr;
			
 
				-
			
 
				-namespace libyuv {
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-#endif
			
 
				-
			
 
				-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
			
 
				-
			
 
				-enum JpegSubsamplingType {
			
 
				-  kJpegYuv420,
			
 
				-  kJpegYuv422,
			
 
				-  kJpegYuv411,
			
 
				-  kJpegYuv444,
			
 
				-  kJpegYuv400,
			
 
				-  kJpegUnknown
			
 
				-};
			
 
				-
			
 
				-struct SetJmpErrorMgr;
			
 
				-
			
 
				-// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
			
 
				-// simply independent JPEG images with a fixed huffman table (which is omitted).
			
 
				-// It is rarely used in video transmission, but is common as a camera capture
			
 
				-// format, especially in Logitech devices. This class implements a decoder for
			
 
				-// MJPEG frames.
			
 
				-//
			
 
				-// See http://tools.ietf.org/html/rfc2435
			
 
				-class LIBYUV_API MJpegDecoder {
			
 
				- public:
			
 
				-  typedef void (*CallbackFunction)(void* opaque,
			
 
				-                                   const uint8* const* data,
			
 
				-                                   const int* strides,
			
 
				-                                   int rows);
			
 
				-
			
 
				-  static const int kColorSpaceUnknown;
			
 
				-  static const int kColorSpaceGrayscale;
			
 
				-  static const int kColorSpaceRgb;
			
 
				-  static const int kColorSpaceYCbCr;
			
 
				-  static const int kColorSpaceCMYK;
			
 
				-  static const int kColorSpaceYCCK;
			
 
				-
			
 
				-  MJpegDecoder();
			
 
				-  ~MJpegDecoder();
			
 
				-
			
 
				-  // Loads a new frame, reads its headers, and determines the uncompressed
			
 
				-  // image format.
			
 
				-  // Returns LIBYUV_TRUE if image looks valid and format is supported.
			
 
				-  // If return value is LIBYUV_TRUE, then the values for all the following
			
 
				-  // getters are populated.
			
 
				-  // src_len is the size of the compressed mjpeg frame in bytes.
			
 
				-  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
			
 
				-
			
 
				-  // Returns width of the last loaded frame in pixels.
			
 
				-  int GetWidth();
			
 
				-
			
 
				-  // Returns height of the last loaded frame in pixels.
			
 
				-  int GetHeight();
			
 
				-
			
 
				-  // Returns format of the last loaded frame. The return value is one of the
			
 
				-  // kColorSpace* constants.
			
 
				-  int GetColorSpace();
			
 
				-
			
 
				-  // Number of color components in the color space.
			
 
				-  int GetNumComponents();
			
 
				-
			
 
				-  // Sample factors of the n-th component.
			
 
				-  int GetHorizSampFactor(int component);
			
 
				-
			
 
				-  int GetVertSampFactor(int component);
			
 
				-
			
 
				-  int GetHorizSubSampFactor(int component);
			
 
				-
			
 
				-  int GetVertSubSampFactor(int component);
			
 
				-
			
 
				-  // Public for testability.
			
 
				-  int GetImageScanlinesPerImcuRow();
			
 
				-
			
 
				-  // Public for testability.
			
 
				-  int GetComponentScanlinesPerImcuRow(int component);
			
 
				-
			
 
				-  // Width of a component in bytes.
			
 
				-  int GetComponentWidth(int component);
			
 
				-
			
 
				-  // Height of a component.
			
 
				-  int GetComponentHeight(int component);
			
 
				-
			
 
				-  // Width of a component in bytes with padding for DCTSIZE. Public for testing.
			
 
				-  int GetComponentStride(int component);
			
 
				-
			
 
				-  // Size of a component in bytes.
			
 
				-  int GetComponentSize(int component);
			
 
				-
			
 
				-  // Call this after LoadFrame() if you decide you don't want to decode it
			
 
				-  // after all.
			
 
				-  LIBYUV_BOOL UnloadFrame();
			
 
				-
			
 
				-  // Decodes the entire image into a one-buffer-per-color-component format.
			
 
				-  // dst_width must match exactly. dst_height must be <= to image height; if
			
 
				-  // less, the image is cropped. "planes" must have size equal to at least
			
 
				-  // GetNumComponents() and they must point to non-overlapping buffers of size
			
 
				-  // at least GetComponentSize(i). The pointers in planes are incremented
			
 
				-  // to point to after the end of the written data.
			
 
				-  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
			
 
				-  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
			
 
				-
			
 
				-  // Decodes the entire image and passes the data via repeated calls to a
			
 
				-  // callback function. Each call will get the data for a whole number of
			
 
				-  // image scanlines.
			
 
				-  // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
			
 
				-  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
			
 
				-                        int dst_width, int dst_height);
			
 
				-
			
 
				-  // The helper function which recognizes the jpeg sub-sampling type.
			
 
				-  static JpegSubsamplingType JpegSubsamplingTypeHelper(
			
 
				-     int* subsample_x, int* subsample_y, int number_of_components);
			
 
				-
			
 
				- private:
			
 
				-  struct Buffer {
			
 
				-    const uint8* data;
			
 
				-    int len;
			
 
				-  };
			
 
				-
			
 
				-  struct BufferVector {
			
 
				-    Buffer* buffers;
			
 
				-    int len;
			
 
				-    int pos;
			
 
				-  };
			
 
				-
			
 
				-  // Methods that are passed to jpeglib.
			
 
				-  static int fill_input_buffer(jpeg_decompress_struct* cinfo);
			
 
				-  static void init_source(jpeg_decompress_struct* cinfo);
			
 
				-  static void skip_input_data(jpeg_decompress_struct* cinfo,
			
 
				-                              long num_bytes);  // NOLINT
			
 
				-  static void term_source(jpeg_decompress_struct* cinfo);
			
 
				-
			
 
				-  static void ErrorHandler(jpeg_common_struct* cinfo);
			
 
				-
			
 
				-  void AllocOutputBuffers(int num_outbufs);
			
 
				-  void DestroyOutputBuffers();
			
 
				-
			
 
				-  LIBYUV_BOOL StartDecode();
			
 
				-  LIBYUV_BOOL FinishDecode();
			
 
				-
			
 
				-  void SetScanlinePointers(uint8** data);
			
 
				-  LIBYUV_BOOL DecodeImcuRow();
			
 
				-
			
 
				-  int GetComponentScanlinePadding(int component);
			
 
				-
			
 
				-  // A buffer holding the input data for a frame.
			
 
				-  Buffer buf_;
			
 
				-  BufferVector buf_vec_;
			
 
				-
			
 
				-  jpeg_decompress_struct* decompress_struct_;
			
 
				-  jpeg_source_mgr* source_mgr_;
			
 
				-  SetJmpErrorMgr* error_mgr_;
			
 
				-
			
 
				-  // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
			
 
				-  // GetComponentScanlinePadding() != 0.)
			
 
				-  LIBYUV_BOOL has_scanline_padding_;
			
 
				-
			
 
				-  // Temporaries used to point to scanline outputs.
			
 
				-  int num_outbufs_;  // Outermost size of all arrays below.
			
 
				-  uint8*** scanlines_;
			
 
				-  int* scanlines_sizes_;
			
 
				-  // Temporary buffer used for decoding when we can't decode directly to the
			
 
				-  // output buffers. Large enough for just one iMCU row.
			
 
				-  uint8** databuf_;
			
 
				-  int* databuf_strides_;
			
 
				-};
			
 
				-
			
 
				-}  // namespace libyuv
			
 
				-
			
 
				-#endif  //  __cplusplus
			
 
				-#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/planar_functions.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/planar_functions.h
@@ -1,434 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-// TODO(fbarchard): Remove the following headers includes.
			
 
				-#include "libyuv/convert.h"
			
 
				-#include "libyuv/convert_argb.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Copy a plane of data.
			
 
				-LIBYUV_API
			
 
				-void CopyPlane(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Set a plane of data to a 32 bit value.
			
 
				-LIBYUV_API
			
 
				-void SetPlane(uint8* dst_y, int dst_stride_y,
			
 
				-              int width, int height,
			
 
				-              uint32 value);
			
 
				-
			
 
				-// Copy I400.  Supports inverting.
			
 
				-LIBYUV_API
			
 
				-int I400ToI400(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height);
			
 
				-
			
 
				-
			
 
				-// Copy I422 to I422.
			
 
				-#define I422ToI422 I422Copy
			
 
				-LIBYUV_API
			
 
				-int I422Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Copy I444 to I444.
			
 
				-#define I444ToI444 I444Copy
			
 
				-LIBYUV_API
			
 
				-int I444Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Convert YUY2 to I422.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert UYVY to I422.
			
 
				-LIBYUV_API
			
 
				-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I420 to I400. (calls CopyPlane ignoring u/v).
			
 
				-LIBYUV_API
			
 
				-int I420ToI400(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Alias
			
 
				-#define I420ToI420Mirror I420Mirror
			
 
				-
			
 
				-// I420 mirror.
			
 
				-LIBYUV_API
			
 
				-int I420Mirror(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Alias
			
 
				-#define I400ToI400Mirror I400Mirror
			
 
				-
			
 
				-// I400 mirror.  A single plane is mirrored horizontally.
			
 
				-// Pass negative height to achieve 180 degree rotation.
			
 
				-LIBYUV_API
			
 
				-int I400Mirror(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Alias
			
 
				-#define ARGBToARGBMirror ARGBMirror
			
 
				-
			
 
				-// ARGB mirror.
			
 
				-LIBYUV_API
			
 
				-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert NV12 to RGB565.
			
 
				-LIBYUV_API
			
 
				-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_uv, int src_stride_uv,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// Convert NV21 to RGB565.
			
 
				-LIBYUV_API
			
 
				-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_uv, int src_stride_uv,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// I422ToARGB is in convert_argb.h
			
 
				-// Convert I422 to BGRA.
			
 
				-LIBYUV_API
			
 
				-int I422ToBGRA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_bgra, int dst_stride_bgra,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I422 to ABGR.
			
 
				-LIBYUV_API
			
 
				-int I422ToABGR(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_abgr, int dst_stride_abgr,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I422 to RGBA.
			
 
				-LIBYUV_API
			
 
				-int I422ToRGBA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_rgba, int dst_stride_rgba,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Draw a rectangle into I420.
			
 
				-LIBYUV_API
			
 
				-int I420Rect(uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int x, int y, int width, int height,
			
 
				-             int value_y, int value_u, int value_v);
			
 
				-
			
 
				-// Draw a rectangle into ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
			
 
				-             int x, int y, int width, int height, uint32 value);
			
 
				-
			
 
				-// Convert ARGB to gray scale ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Make a rectangle of ARGB gray scale.
			
 
				-LIBYUV_API
			
 
				-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
			
 
				-             int x, int y, int width, int height);
			
 
				-
			
 
				-// Make a rectangle of ARGB Sepia tone.
			
 
				-LIBYUV_API
			
 
				-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
			
 
				-              int x, int y, int width, int height);
			
 
				-
			
 
				-// Apply a matrix rotation to each ARGB pixel.
			
 
				-// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
			
 
				-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
			
 
				-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
			
 
				-// The next 4 coefficients apply to B, G, R, A and produce R of the output.
			
 
				-// The last 4 coefficients apply to B, G, R, A and produce A of the output.
			
 
				-LIBYUV_API
			
 
				-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    const int8* matrix_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Deprecated. Use ARGBColorMatrix instead.
			
 
				-// Apply a matrix rotation to each ARGB pixel.
			
 
				-// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
			
 
				-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
			
 
				-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
			
 
				-// The last 4 coefficients apply to B, G, R, A and produce R of the output.
			
 
				-LIBYUV_API
			
 
				-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const int8* matrix_rgb,
			
 
				-                   int x, int y, int width, int height);
			
 
				-
			
 
				-// Apply a color table each ARGB pixel.
			
 
				-// Table contains 256 ARGB values.
			
 
				-LIBYUV_API
			
 
				-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const uint8* table_argb,
			
 
				-                   int x, int y, int width, int height);
			
 
				-
			
 
				-// Apply a color table each ARGB pixel but preserve destination alpha.
			
 
				-// Table contains 256 ARGB values.
			
 
				-LIBYUV_API
			
 
				-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
			
 
				-                  const uint8* table_argb,
			
 
				-                  int x, int y, int width, int height);
			
 
				-
			
 
				-// Apply a luma/color table each ARGB pixel but preserve destination alpha.
			
 
				-// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
			
 
				-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
			
 
				-LIBYUV_API
			
 
				-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
			
 
				-                       uint8* dst_argb, int dst_stride_argb,
			
 
				-                       const uint8* luma_rgb_table,
			
 
				-                       int width, int height);
			
 
				-
			
 
				-// Apply a 3 term polynomial to ARGB values.
			
 
				-// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
			
 
				-// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
			
 
				-// g squared, r squared and a squared.  The 4rd row is coefficients for b to
			
 
				-// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
			
 
				-// result clamped to 0 to 255.
			
 
				-// A polynomial approximation can be dirived using software such as 'R'.
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const float* poly,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-// Quantize a rectangle of ARGB. Alpha unaffected.
			
 
				-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
			
 
				-// interval_size should be a value between 1 and 255.
			
 
				-// interval_offset should be a value between 0 and 255.
			
 
				-LIBYUV_API
			
 
				-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int scale, int interval_size, int interval_offset,
			
 
				-                 int x, int y, int width, int height);
			
 
				-
			
 
				-// Copy ARGB to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int width, int height);
			
 
				-
			
 
				-// Copy ARGB to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int width, int height);
			
 
				-
			
 
				-// Copy ARGB to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
			
 
				-                     uint8* dst_argb, int dst_stride_argb,
			
 
				-                     int width, int height);
			
 
				-
			
 
				-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                             uint8* dst_argb, int width);
			
 
				-
			
 
				-// Get function to Alpha Blend ARGB pixels and store to destination.
			
 
				-LIBYUV_API
			
 
				-ARGBBlendRow GetARGBBlend();
			
 
				-
			
 
				-// Alpha Blend ARGB images and store to destination.
			
 
				-// Alpha of destination is set to 255.
			
 
				-LIBYUV_API
			
 
				-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
			
 
				-              const uint8* src_argb1, int src_stride_argb1,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height);
			
 
				-
			
 
				-// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
			
 
				-LIBYUV_API
			
 
				-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                 const uint8* src_argb1, int src_stride_argb1,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// Add ARGB image with ARGB image. Saturates to 255.
			
 
				-LIBYUV_API
			
 
				-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
			
 
				-            const uint8* src_argb1, int src_stride_argb1,
			
 
				-            uint8* dst_argb, int dst_stride_argb,
			
 
				-            int width, int height);
			
 
				-
			
 
				-// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
			
 
				-LIBYUV_API
			
 
				-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                 const uint8* src_argb1, int src_stride_argb1,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// Convert I422 to YUY2.
			
 
				-LIBYUV_API
			
 
				-int I422ToYUY2(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_frame, int dst_stride_frame,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert I422 to UYVY.
			
 
				-LIBYUV_API
			
 
				-int I422ToUYVY(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_frame, int dst_stride_frame,
			
 
				-               int width, int height);
			
 
				-
			
 
				-// Convert unattentuated ARGB to preattenuated ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int width, int height);
			
 
				-
			
 
				-// Convert preattentuated ARGB to unattenuated ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-// Convert MJPG to ARGB.
			
 
				-LIBYUV_API
			
 
				-int MJPGToARGB(const uint8* sample, size_t sample_size,
			
 
				-               uint8* argb, int argb_stride,
			
 
				-               int w, int h, int dw, int dh);
			
 
				-
			
 
				-// Internal function - do not call directly.
			
 
				-// Computes table of cumulative sum for image where the value is the sum
			
 
				-// of all values above and to the left of the entry. Used by ARGBBlur.
			
 
				-LIBYUV_API
			
 
				-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
			
 
				-                             int32* dst_cumsum, int dst_stride32_cumsum,
			
 
				-                             int width, int height);
			
 
				-
			
 
				-// Blur ARGB image.
			
 
				-// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
			
 
				-//   16 byte boundary.
			
 
				-// dst_stride32_cumsum is number of ints in a row (width * 4).
			
 
				-// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
			
 
				-// Blur is optimized for radius of 5 (11x11) or less.
			
 
				-LIBYUV_API
			
 
				-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int32* dst_cumsum, int dst_stride32_cumsum,
			
 
				-             int width, int height, int radius);
			
 
				-
			
 
				-// Multiply ARGB image by ARGB value.
			
 
				-LIBYUV_API
			
 
				-int ARGBShade(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height, uint32 value);
			
 
				-
			
 
				-// Interpolate between two ARGB images using specified amount of interpolation
			
 
				-// (0 to 255) and store to destination.
			
 
				-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
			
 
				-// and 255 means 1% src_argb0 and 99% src_argb1.
			
 
				-// Internally uses ARGBScale bilinear filtering.
			
 
				-// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
			
 
				-LIBYUV_API
			
 
				-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                    const uint8* src_argb1, int src_stride_argb1,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height, int interpolation);
			
 
				-
			
 
				-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
			
 
				-    defined(TARGET_IPHONE_SIMULATOR)
			
 
				-#define LIBYUV_DISABLE_X86
			
 
				-#endif
			
 
				-
			
 
				-// Row functions for copying a pixels from a source with a slope to a row
			
 
				-// of destination. Useful for scaling, rotation, mirror, texture mapping.
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
			
 
				-                     uint8* dst_argb, const float* uv_dudv, int width);
			
 
				-// The following are available on all x86 platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
			
 
				-                        uint8* dst_argb, const float* uv_dudv, int width);
			
 
				-#define HAS_ARGBAFFINEROW_SSE2
			
 
				-#endif  // LIBYUV_DISABLE_X86
			
 
				-
			
 
				-// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
			
 
				-// shuffler is 16 bytes and must be aligned.
			
 
				-LIBYUV_API
			
 
				-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                const uint8* shuffler, int width, int height);
			
 
				-
			
 
				-// Sobel ARGB effect with planar output.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
			
 
				-                     uint8* dst_y, int dst_stride_y,
			
 
				-                     int width, int height);
			
 
				-
			
 
				-// Sobel ARGB effect.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height);
			
 
				-
			
 
				-// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/rotate.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/rotate.h
@@ -1,117 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_ROTATE_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Supported rotation.
			
 
				-typedef enum RotationMode {
			
 
				-  kRotate0 = 0,  // No rotation.
			
 
				-  kRotate90 = 90,  // Rotate 90 degrees clockwise.
			
 
				-  kRotate180 = 180,  // Rotate 180 degrees.
			
 
				-  kRotate270 = 270,  // Rotate 270 degrees clockwise.
			
 
				-
			
 
				-  // Deprecated.
			
 
				-  kRotateNone = 0,
			
 
				-  kRotateClockwise = 90,
			
 
				-  kRotateCounterClockwise = 270,
			
 
				-} RotationModeEnum;
			
 
				-
			
 
				-// Rotate I420 frame.
			
 
				-LIBYUV_API
			
 
				-int I420Rotate(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int src_width, int src_height, enum RotationMode mode);
			
 
				-
			
 
				-// Rotate NV12 input and store in I420.
			
 
				-LIBYUV_API
			
 
				-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
			
 
				-                     const uint8* src_uv, int src_stride_uv,
			
 
				-                     uint8* dst_y, int dst_stride_y,
			
 
				-                     uint8* dst_u, int dst_stride_u,
			
 
				-                     uint8* dst_v, int dst_stride_v,
			
 
				-                     int src_width, int src_height, enum RotationMode mode);
			
 
				-
			
 
				-// Rotate a plane by 0, 90, 180, or 270.
			
 
				-LIBYUV_API
			
 
				-int RotatePlane(const uint8* src, int src_stride,
			
 
				-                uint8* dst, int dst_stride,
			
 
				-                int src_width, int src_height, enum RotationMode mode);
			
 
				-
			
 
				-// Rotate planes by 90, 180, 270. Deprecated.
			
 
				-LIBYUV_API
			
 
				-void RotatePlane90(const uint8* src, int src_stride,
			
 
				-                   uint8* dst, int dst_stride,
			
 
				-                   int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotatePlane180(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotatePlane270(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotateUV90(const uint8* src, int src_stride,
			
 
				-                uint8* dst_a, int dst_stride_a,
			
 
				-                uint8* dst_b, int dst_stride_b,
			
 
				-                int width, int height);
			
 
				-
			
 
				-// Rotations for when U and V are interleaved.
			
 
				-// These functions take one input pointer and
			
 
				-// split the data into two buffers while
			
 
				-// rotating them. Deprecated.
			
 
				-LIBYUV_API
			
 
				-void RotateUV180(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotateUV270(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-// The 90 and 270 functions are based on transposes.
			
 
				-// Doing a transpose with reversing the read/write
			
 
				-// order will result in a rotation by +- 90 degrees.
			
 
				-// Deprecated.
			
 
				-LIBYUV_API
			
 
				-void TransposePlane(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void TransposeUV(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/rotate_argb.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/rotate_argb.h
@@ -1,33 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/rotate.h"  // For RotationMode.
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Rotate ARGB frame
			
 
				-LIBYUV_API
			
 
				-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int src_width, int src_height, enum RotationMode mode);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/row.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/row.h
@@ -1,1694 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_ROW_H_
			
 
				-
			
 
				-#include <stdlib.h>  // For malloc.
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-#define align_buffer_64(var, size)                                             \
			
 
				-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
			
 
				-  uint8* var = reinterpret_cast<uint8*>                                        \
			
 
				-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
			
 
				-#else
			
 
				-#define align_buffer_64(var, size)                                             \
			
 
				-  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
			
 
				-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
			
 
				-#endif
			
 
				-
			
 
				-#define free_aligned_buffer_64(var) \
			
 
				-  free(var##_mem);  \
			
 
				-  var = 0
			
 
				-
			
 
				-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
			
 
				-    defined(TARGET_IPHONE_SIMULATOR)
			
 
				-#define LIBYUV_DISABLE_X86
			
 
				-#endif
			
 
				-// True if compiling for SSSE3 as a requirement.
			
 
				-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
			
 
				-#define LIBYUV_SSSE3_ONLY
			
 
				-#endif
			
 
				-
			
 
				-// Enable for NaCL pepper 33 for bundle and AVX2 support.
			
 
				-//  #define NEW_BINUTILS
			
 
				-
			
 
				-// The following are available on all x86 platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
			
 
				-// Effects:
			
 
				-#define HAS_ARGBADDROW_SSE2
			
 
				-#define HAS_ARGBAFFINEROW_SSE2
			
 
				-#define HAS_ARGBATTENUATEROW_SSSE3
			
 
				-#define HAS_ARGBBLENDROW_SSSE3
			
 
				-#define HAS_ARGBCOLORMATRIXROW_SSSE3
			
 
				-#define HAS_ARGBCOLORTABLEROW_X86
			
 
				-#define HAS_ARGBCOPYALPHAROW_SSE2
			
 
				-#define HAS_ARGBCOPYYTOALPHAROW_SSE2
			
 
				-#define HAS_ARGBGRAYROW_SSSE3
			
 
				-#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
			
 
				-#define HAS_ARGBMIRRORROW_SSSE3
			
 
				-#define HAS_ARGBMULTIPLYROW_SSE2
			
 
				-#define HAS_ARGBPOLYNOMIALROW_SSE2
			
 
				-#define HAS_ARGBQUANTIZEROW_SSE2
			
 
				-#define HAS_ARGBSEPIAROW_SSSE3
			
 
				-#define HAS_ARGBSHADEROW_SSE2
			
 
				-#define HAS_ARGBSUBTRACTROW_SSE2
			
 
				-#define HAS_ARGBTOUVROW_SSSE3
			
 
				-#define HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
			
 
				-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
			
 
				-#define HAS_INTERPOLATEROW_SSE2
			
 
				-#define HAS_INTERPOLATEROW_SSSE3
			
 
				-#define HAS_RGBCOLORTABLEROW_X86
			
 
				-#define HAS_SOBELROW_SSE2
			
 
				-#define HAS_SOBELTOPLANEROW_SSE2
			
 
				-#define HAS_SOBELXROW_SSE2
			
 
				-#define HAS_SOBELXYROW_SSE2
			
 
				-#define HAS_SOBELYROW_SSE2
			
 
				-
			
 
				-// Conversions:
			
 
				-#define HAS_ABGRTOUVROW_SSSE3
			
 
				-#define HAS_ABGRTOYROW_SSSE3
			
 
				-#define HAS_ARGB1555TOARGBROW_SSE2
			
 
				-#define HAS_ARGB4444TOARGBROW_SSE2
			
 
				-#define HAS_ARGBSHUFFLEROW_SSE2
			
 
				-#define HAS_ARGBSHUFFLEROW_SSSE3
			
 
				-#define HAS_ARGBTOARGB1555ROW_SSE2
			
 
				-#define HAS_ARGBTOARGB4444ROW_SSE2
			
 
				-#define HAS_ARGBTOBAYERGGROW_SSE2
			
 
				-#define HAS_ARGBTOBAYERROW_SSSE3
			
 
				-#define HAS_ARGBTORAWROW_SSSE3
			
 
				-#define HAS_ARGBTORGB24ROW_SSSE3
			
 
				-#define HAS_ARGBTORGB565ROW_SSE2
			
 
				-#define HAS_ARGBTOUV422ROW_SSSE3
			
 
				-#define HAS_ARGBTOUV444ROW_SSSE3
			
 
				-#define HAS_ARGBTOUVJROW_SSSE3
			
 
				-#define HAS_ARGBTOYJROW_SSSE3
			
 
				-#define HAS_ARGBTOYROW_SSSE3
			
 
				-#define HAS_BGRATOUVROW_SSSE3
			
 
				-#define HAS_BGRATOYROW_SSSE3
			
 
				-#define HAS_COPYROW_ERMS
			
 
				-#define HAS_COPYROW_SSE2
			
 
				-#define HAS_COPYROW_X86
			
 
				-#define HAS_HALFROW_SSE2
			
 
				-#define HAS_I400TOARGBROW_SSE2
			
 
				-#define HAS_I411TOARGBROW_SSSE3
			
 
				-#define HAS_I422TOARGB1555ROW_SSSE3
			
 
				-#define HAS_I422TOABGRROW_SSSE3
			
 
				-#define HAS_I422TOARGB1555ROW_SSSE3
			
 
				-#define HAS_I422TOARGB4444ROW_SSSE3
			
 
				-#define HAS_I422TOARGBROW_SSSE3
			
 
				-#define HAS_I422TOBGRAROW_SSSE3
			
 
				-#define HAS_I422TORAWROW_SSSE3
			
 
				-#define HAS_I422TORGB24ROW_SSSE3
			
 
				-#define HAS_I422TORGB565ROW_SSSE3
			
 
				-#define HAS_I422TORGBAROW_SSSE3
			
 
				-#define HAS_I422TOUYVYROW_SSE2
			
 
				-#define HAS_I422TOYUY2ROW_SSE2
			
 
				-#define HAS_I444TOARGBROW_SSSE3
			
 
				-#define HAS_MERGEUVROW_SSE2
			
 
				-#define HAS_MIRRORROW_SSE2
			
 
				-#define HAS_MIRRORROW_SSSE3
			
 
				-#define HAS_MIRRORROW_UV_SSSE3
			
 
				-#define HAS_MIRRORUVROW_SSSE3
			
 
				-#define HAS_NV12TOARGBROW_SSSE3
			
 
				-#define HAS_NV12TORGB565ROW_SSSE3
			
 
				-#define HAS_NV21TOARGBROW_SSSE3
			
 
				-#define HAS_NV21TORGB565ROW_SSSE3
			
 
				-#define HAS_RAWTOARGBROW_SSSE3
			
 
				-#define HAS_RAWTOYROW_SSSE3
			
 
				-#define HAS_RGB24TOARGBROW_SSSE3
			
 
				-#define HAS_RGB24TOYROW_SSSE3
			
 
				-#define HAS_RGB565TOARGBROW_SSE2
			
 
				-#define HAS_RGBATOUVROW_SSSE3
			
 
				-#define HAS_RGBATOYROW_SSSE3
			
 
				-#define HAS_SETROW_X86
			
 
				-#define HAS_SPLITUVROW_SSE2
			
 
				-#define HAS_UYVYTOARGBROW_SSSE3
			
 
				-#define HAS_UYVYTOUV422ROW_SSE2
			
 
				-#define HAS_UYVYTOUVROW_SSE2
			
 
				-#define HAS_UYVYTOYROW_SSE2
			
 
				-#define HAS_YTOARGBROW_SSE2
			
 
				-#define HAS_YUY2TOARGBROW_SSSE3
			
 
				-#define HAS_YUY2TOUV422ROW_SSE2
			
 
				-#define HAS_YUY2TOUVROW_SSE2
			
 
				-#define HAS_YUY2TOYROW_SSE2
			
 
				-#endif
			
 
				-
			
 
				-// GCC >= 4.7.0 required for AVX2.
			
 
				-#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
			
 
				-#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
			
 
				-#define GCC_HAS_AVX2 1
			
 
				-#endif  // GNUC >= 4.7
			
 
				-#endif  // __GNUC__
			
 
				-
			
 
				-// clang >= 3.4.0 required for AVX2.
			
 
				-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
			
 
				-#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
			
 
				-#define CLANG_HAS_AVX2 1
			
 
				-#endif  // clang >= 3.4
			
 
				-#endif  // __clang__
			
 
				-
			
 
				-// Visual C 2012 required for AVX2.
			
 
				-#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
			
 
				-#define VISUALC_HAS_AVX2 1
			
 
				-#endif  // VisualStudio >= 2012
			
 
				-
			
 
				-// The following are available on all x86 platforms, but
			
 
				-// require VS2012, clang 3.4 or gcc 4.7.
			
 
				-// The code supports NaCL but requires a new compiler and validator.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
			
 
				-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
			
 
				-// Effects:
			
 
				-#define HAS_ARGBPOLYNOMIALROW_AVX2
			
 
				-#define HAS_ARGBSHUFFLEROW_AVX2
			
 
				-#define HAS_ARGBCOPYALPHAROW_AVX2
			
 
				-#define HAS_ARGBCOPYYTOALPHAROW_AVX2
			
 
				-#endif
			
 
				-
			
 
				-// The following are require VS2012.
			
 
				-// TODO(fbarchard): Port to gcc.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
			
 
				-#define HAS_ARGBTOUVROW_AVX2
			
 
				-#define HAS_ARGBTOYJROW_AVX2
			
 
				-#define HAS_ARGBTOYROW_AVX2
			
 
				-#define HAS_HALFROW_AVX2
			
 
				-#define HAS_I422TOARGBROW_AVX2
			
 
				-#define HAS_INTERPOLATEROW_AVX2
			
 
				-#define HAS_MERGEUVROW_AVX2
			
 
				-#define HAS_MIRRORROW_AVX2
			
 
				-#define HAS_SPLITUVROW_AVX2
			
 
				-#define HAS_UYVYTOUV422ROW_AVX2
			
 
				-#define HAS_UYVYTOUVROW_AVX2
			
 
				-#define HAS_UYVYTOYROW_AVX2
			
 
				-#define HAS_YUY2TOUV422ROW_AVX2
			
 
				-#define HAS_YUY2TOUVROW_AVX2
			
 
				-#define HAS_YUY2TOYROW_AVX2
			
 
				-
			
 
				-// Effects:
			
 
				-#define HAS_ARGBADDROW_AVX2
			
 
				-#define HAS_ARGBATTENUATEROW_AVX2
			
 
				-#define HAS_ARGBMIRRORROW_AVX2
			
 
				-#define HAS_ARGBMULTIPLYROW_AVX2
			
 
				-#define HAS_ARGBSUBTRACTROW_AVX2
			
 
				-#define HAS_ARGBUNATTENUATEROW_AVX2
			
 
				-#endif  // defined(VISUALC_HAS_AVX2)
			
 
				-
			
 
				-// The following are Yasm x86 only:
			
 
				-// TODO(fbarchard): Port AVX2 to inline.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
			
 
				-    (defined(_M_IX86) || defined(_M_X64) || \
			
 
				-    defined(__x86_64__) || defined(__i386__))
			
 
				-#define HAS_MERGEUVROW_AVX2
			
 
				-#define HAS_MERGEUVROW_MMX
			
 
				-#define HAS_SPLITUVROW_AVX2
			
 
				-#define HAS_SPLITUVROW_MMX
			
 
				-#define HAS_UYVYTOYROW_AVX2
			
 
				-#define HAS_UYVYTOYROW_MMX
			
 
				-#define HAS_YUY2TOYROW_AVX2
			
 
				-#define HAS_YUY2TOYROW_MMX
			
 
				-#endif
			
 
				-
			
 
				-// The following are disabled when SSSE3 is available:
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
			
 
				-    !defined(LIBYUV_SSSE3_ONLY)
			
 
				-#define HAS_ARGBBLENDROW_SSE2
			
 
				-#define HAS_ARGBATTENUATEROW_SSE2
			
 
				-#define HAS_MIRRORROW_SSE2
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on Neon platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-#define HAS_ABGRTOUVROW_NEON
			
 
				-#define HAS_ABGRTOYROW_NEON
			
 
				-#define HAS_ARGB1555TOARGBROW_NEON
			
 
				-#define HAS_ARGB1555TOUVROW_NEON
			
 
				-#define HAS_ARGB1555TOYROW_NEON
			
 
				-#define HAS_ARGB4444TOARGBROW_NEON
			
 
				-#define HAS_ARGB4444TOUVROW_NEON
			
 
				-#define HAS_ARGB4444TOYROW_NEON
			
 
				-#define HAS_ARGBTOARGB1555ROW_NEON
			
 
				-#define HAS_ARGBTOARGB4444ROW_NEON
			
 
				-#define HAS_ARGBTOBAYERROW_NEON
			
 
				-#define HAS_ARGBTOBAYERGGROW_NEON
			
 
				-#define HAS_ARGBTORAWROW_NEON
			
 
				-#define HAS_ARGBTORGB24ROW_NEON
			
 
				-#define HAS_ARGBTORGB565ROW_NEON
			
 
				-#define HAS_ARGBTOUV411ROW_NEON
			
 
				-#define HAS_ARGBTOUV422ROW_NEON
			
 
				-#define HAS_ARGBTOUV444ROW_NEON
			
 
				-#define HAS_ARGBTOUVROW_NEON
			
 
				-#define HAS_ARGBTOUVJROW_NEON
			
 
				-#define HAS_ARGBTOYROW_NEON
			
 
				-#define HAS_ARGBTOYJROW_NEON
			
 
				-#define HAS_BGRATOUVROW_NEON
			
 
				-#define HAS_BGRATOYROW_NEON
			
 
				-#define HAS_COPYROW_NEON
			
 
				-#define HAS_HALFROW_NEON
			
 
				-#define HAS_I400TOARGBROW_NEON
			
 
				-#define HAS_I411TOARGBROW_NEON
			
 
				-#define HAS_I422TOABGRROW_NEON
			
 
				-#define HAS_I422TOARGB1555ROW_NEON
			
 
				-#define HAS_I422TOARGB4444ROW_NEON
			
 
				-#define HAS_I422TOARGBROW_NEON
			
 
				-#define HAS_I422TOBGRAROW_NEON
			
 
				-#define HAS_I422TORAWROW_NEON
			
 
				-#define HAS_I422TORGB24ROW_NEON
			
 
				-#define HAS_I422TORGB565ROW_NEON
			
 
				-#define HAS_I422TORGBAROW_NEON
			
 
				-#define HAS_I422TOUYVYROW_NEON
			
 
				-#define HAS_I422TOYUY2ROW_NEON
			
 
				-#define HAS_I444TOARGBROW_NEON
			
 
				-#define HAS_MERGEUVROW_NEON
			
 
				-#define HAS_MIRRORROW_NEON
			
 
				-#define HAS_MIRRORUVROW_NEON
			
 
				-#define HAS_NV12TOARGBROW_NEON
			
 
				-#define HAS_NV12TORGB565ROW_NEON
			
 
				-#define HAS_NV21TOARGBROW_NEON
			
 
				-#define HAS_NV21TORGB565ROW_NEON
			
 
				-#define HAS_RAWTOARGBROW_NEON
			
 
				-#define HAS_RAWTOUVROW_NEON
			
 
				-#define HAS_RAWTOYROW_NEON
			
 
				-#define HAS_RGB24TOARGBROW_NEON
			
 
				-#define HAS_RGB24TOUVROW_NEON
			
 
				-#define HAS_RGB24TOYROW_NEON
			
 
				-#define HAS_RGB565TOARGBROW_NEON
			
 
				-#define HAS_RGB565TOUVROW_NEON
			
 
				-#define HAS_RGB565TOYROW_NEON
			
 
				-#define HAS_RGBATOUVROW_NEON
			
 
				-#define HAS_RGBATOYROW_NEON
			
 
				-#define HAS_SETROW_NEON
			
 
				-#define HAS_SPLITUVROW_NEON
			
 
				-#define HAS_UYVYTOARGBROW_NEON
			
 
				-#define HAS_UYVYTOUV422ROW_NEON
			
 
				-#define HAS_UYVYTOUVROW_NEON
			
 
				-#define HAS_UYVYTOYROW_NEON
			
 
				-#define HAS_YTOARGBROW_NEON
			
 
				-#define HAS_YUY2TOARGBROW_NEON
			
 
				-#define HAS_YUY2TOUV422ROW_NEON
			
 
				-#define HAS_YUY2TOUVROW_NEON
			
 
				-#define HAS_YUY2TOYROW_NEON
			
 
				-
			
 
				-// Effects:
			
 
				-#define HAS_ARGBADDROW_NEON
			
 
				-#define HAS_ARGBATTENUATEROW_NEON
			
 
				-#define HAS_ARGBBLENDROW_NEON
			
 
				-#define HAS_ARGBCOLORMATRIXROW_NEON
			
 
				-#define HAS_ARGBGRAYROW_NEON
			
 
				-#define HAS_ARGBMIRRORROW_NEON
			
 
				-#define HAS_ARGBMULTIPLYROW_NEON
			
 
				-#define HAS_ARGBQUANTIZEROW_NEON
			
 
				-#define HAS_ARGBSEPIAROW_NEON
			
 
				-#define HAS_ARGBSHADEROW_NEON
			
 
				-#define HAS_ARGBSUBTRACTROW_NEON
			
 
				-#define HAS_SOBELROW_NEON
			
 
				-#define HAS_SOBELTOPLANEROW_NEON
			
 
				-#define HAS_SOBELXYROW_NEON
			
 
				-#define HAS_SOBELXROW_NEON
			
 
				-#define HAS_SOBELYROW_NEON
			
 
				-#define HAS_INTERPOLATEROW_NEON
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on Mips platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
			
 
				-#define HAS_COPYROW_MIPS
			
 
				-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-#define HAS_I422TOABGRROW_MIPS_DSPR2
			
 
				-#define HAS_I422TOARGBROW_MIPS_DSPR2
			
 
				-#define HAS_I422TOBGRAROW_MIPS_DSPR2
			
 
				-#define HAS_INTERPOLATEROWS_MIPS_DSPR2
			
 
				-#define HAS_MIRRORROW_MIPS_DSPR2
			
 
				-#define HAS_MIRRORUVROW_MIPS_DSPR2
			
 
				-#define HAS_SPLITUVROW_MIPS_DSPR2
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				-#if defined(_MSC_VER) && !defined(__CLR_VER)
			
 
				-#define SIMD_ALIGNED(var) __declspec(align(16)) var
			
 
				-typedef __declspec(align(16)) int16 vec16[8];
			
 
				-typedef __declspec(align(16)) int32 vec32[4];
			
 
				-typedef __declspec(align(16)) int8 vec8[16];
			
 
				-typedef __declspec(align(16)) uint16 uvec16[8];
			
 
				-typedef __declspec(align(16)) uint32 uvec32[4];
			
 
				-typedef __declspec(align(16)) uint8 uvec8[16];
			
 
				-typedef __declspec(align(32)) int16 lvec16[16];
			
 
				-typedef __declspec(align(32)) int32 lvec32[8];
			
 
				-typedef __declspec(align(32)) int8 lvec8[32];
			
 
				-typedef __declspec(align(32)) uint16 ulvec16[16];
			
 
				-typedef __declspec(align(32)) uint32 ulvec32[8];
			
 
				-typedef __declspec(align(32)) uint8 ulvec8[32];
			
 
				-
			
 
				-#elif defined(__GNUC__)
			
 
				-// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
			
 
				-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
			
 
				-typedef int16 __attribute__((vector_size(16))) vec16;
			
 
				-typedef int32 __attribute__((vector_size(16))) vec32;
			
 
				-typedef int8 __attribute__((vector_size(16))) vec8;
			
 
				-typedef uint16 __attribute__((vector_size(16))) uvec16;
			
 
				-typedef uint32 __attribute__((vector_size(16))) uvec32;
			
 
				-typedef uint8 __attribute__((vector_size(16))) uvec8;
			
 
				-#else
			
 
				-#define SIMD_ALIGNED(var) var
			
 
				-typedef int16 vec16[8];
			
 
				-typedef int32 vec32[4];
			
 
				-typedef int8 vec8[16];
			
 
				-typedef uint16 uvec16[8];
			
 
				-typedef uint32 uvec32[4];
			
 
				-typedef uint8 uvec8[16];
			
 
				-#endif
			
 
				-
			
 
				-#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
			
 
				-#define OMITFP
			
 
				-#else
			
 
				-#define OMITFP __attribute__((optimize("omit-frame-pointer")))
			
 
				-#endif
			
 
				-
			
 
				-// NaCL macros for GCC x86 and x64.
			
 
				-
			
 
				-// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
			
 
				-// NEW_BINUTILS and remove all BUNDLEALIGN occurances.
			
 
				-#if defined(__native_client__)
			
 
				-#define LABELALIGN ".p2align 5\n"
			
 
				-#else
			
 
				-#define LABELALIGN ".p2align 2\n"
			
 
				-#endif
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-#if defined(NEW_BINUTILS)
			
 
				-#define BUNDLELOCK ".bundle_lock\n"
			
 
				-#define BUNDLEUNLOCK ".bundle_unlock\n"
			
 
				-#define BUNDLEALIGN "\n"
			
 
				-#else
			
 
				-#define BUNDLELOCK "\n"
			
 
				-#define BUNDLEUNLOCK "\n"
			
 
				-#define BUNDLEALIGN ".p2align 5\n"
			
 
				-#endif
			
 
				-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
			
 
				-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
			
 
				-#define MEMLEA(offset, base) #offset "(%q" #base ")"
			
 
				-#define MEMLEA3(offset, index, scale) \
			
 
				-    #offset "(,%q" #index "," #scale ")"
			
 
				-#define MEMLEA4(offset, base, index, scale) \
			
 
				-    #offset "(%q" #base ",%q" #index "," #scale ")"
			
 
				-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
			
 
				-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
			
 
				-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
			
 
				-    BUNDLELOCK \
			
 
				-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
			
 
				-    #opcode " (%%r15,%%r14),%%" #reg "\n" \
			
 
				-    BUNDLEUNLOCK
			
 
				-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
			
 
				-    BUNDLELOCK \
			
 
				-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
			
 
				-    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
			
 
				-    BUNDLEUNLOCK
			
 
				-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
			
 
				-    BUNDLELOCK \
			
 
				-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
			
 
				-    #opcode " (%%r15,%%r14),%" #arg "\n" \
			
 
				-    BUNDLEUNLOCK
			
 
				-#else
			
 
				-#define BUNDLEALIGN "\n"
			
 
				-#define MEMACCESS(base) "(%" #base ")"
			
 
				-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
			
 
				-#define MEMLEA(offset, base) #offset "(%" #base ")"
			
 
				-#define MEMLEA3(offset, index, scale) \
			
 
				-    #offset "(,%" #index "," #scale ")"
			
 
				-#define MEMLEA4(offset, base, index, scale) \
			
 
				-    #offset "(%" #base ",%" #index "," #scale ")"
			
 
				-#define MEMMOVESTRING(s, d)
			
 
				-#define MEMSTORESTRING(reg, d)
			
 
				-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
			
 
				-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
			
 
				-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
			
 
				-    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
			
 
				-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
			
 
				-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
			
 
				-#endif
			
 
				-
			
 
				-void I444ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void I422ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void I411ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void I422ToBGRARow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_bgra,
			
 
				-                        int width);
			
 
				-void I422ToABGRRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_abgr,
			
 
				-                        int width);
			
 
				-void I422ToRGBARow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_rgba,
			
 
				-                        int width);
			
 
				-void I422ToRGB24Row_NEON(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_rgb24,
			
 
				-                         int width);
			
 
				-void I422ToRAWRow_NEON(const uint8* src_y,
			
 
				-                       const uint8* src_u,
			
 
				-                       const uint8* src_v,
			
 
				-                       uint8* dst_raw,
			
 
				-                       int width);
			
 
				-void I422ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_u,
			
 
				-                          const uint8* src_v,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width);
			
 
				-void I422ToARGB1555Row_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb1555,
			
 
				-                            int width);
			
 
				-void I422ToARGB4444Row_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb4444,
			
 
				-                            int width);
			
 
				-void NV12ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_uv,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void NV21ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_vu,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void NV12ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_uv,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width);
			
 
				-void NV21ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_vu,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width);
			
 
				-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-
			
 
				-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                       uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
			
 
				-                     uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                        uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
			
 
				-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
			
 
				-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
			
 
				-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
			
 
				-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
			
 
				-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
			
 
				-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
			
 
				-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
			
 
				-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
			
 
				-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
			
 
				-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
			
 
				-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
			
 
				-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
			
 
				-
			
 
				-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                        uint8* dst_u, uint8* dst_v, int width);
			
 
				-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                                  uint8* dst_u, uint8* dst_v, int width);
			
 
				-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                           uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
			
 
				-                            uint8* dst_u, uint8* dst_v, int width);
			
 
				-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                           uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                           uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                           uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                             int pix);
			
 
				-void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                             int pix);
			
 
				-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                             int pix);
			
 
				-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                           uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                           uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                            uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
			
 
				-                              int src_stride_argb1555,
			
 
				-                              uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
			
 
				-                              int src_stride_argb4444,
			
 
				-                              uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_u, uint8* dst_v, int width);
			
 
				-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                    uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
			
 
				-                  uint8* dst_u, uint8* dst_v, int width);
			
 
				-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                     uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width);
			
 
				-
			
 
				-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
			
 
				-                                    uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
			
 
				-                              uint8* dst_u, uint8* dst_v, int width);
			
 
				-
			
 
				-void ARGBToUV422Row_SSSE3(const uint8* src_argb,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
			
 
				-                                    uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
			
 
				-                              uint8* dst_u, uint8* dst_v, int width);
			
 
				-
			
 
				-void ARGBToUV444Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV422Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width);
			
 
				-void ARGBToUV411Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width);
			
 
				-
			
 
				-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
			
 
				-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
			
 
				-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
			
 
				-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
			
 
				-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
			
 
				-void MirrorRow_C(const uint8* src, uint8* dst, int width);
			
 
				-
			
 
				-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                       int width);
			
 
				-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                      int width);
			
 
				-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                            int width);
			
 
				-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                   int width);
			
 
				-
			
 
				-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
			
 
				-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
			
 
				-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
			
 
				-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
			
 
				-
			
 
				-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                           int pix);
			
 
				-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                               int pix);
			
 
				-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
			
 
				-                                     uint8* dst_v, int pix);
			
 
				-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix);
			
 
				-void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                               int pix);
			
 
				-
			
 
				-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                  int width);
			
 
				-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width);
			
 
				-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width);
			
 
				-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width);
			
 
				-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
			
 
				-                               uint8* dst_uv, int width);
			
 
				-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                         int width);
			
 
				-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                         int width);
			
 
				-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                         int width);
			
 
				-
			
 
				-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
			
 
				-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
			
 
				-void CopyRow_X86(const uint8* src, uint8* dst, int count);
			
 
				-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
			
 
				-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
			
 
				-void CopyRow_C(const uint8* src, uint8* dst, int count);
			
 
				-
			
 
				-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-
			
 
				-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
			
 
				-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
			
 
				-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
			
 
				-
			
 
				-void SetRow_X86(uint8* dst, uint32 v32, int count);
			
 
				-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
			
 
				-                     int dst_stride, int height);
			
 
				-void SetRow_NEON(uint8* dst, uint32 v32, int count);
			
 
				-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
			
 
				-                      int dst_stride, int height);
			
 
				-void SetRow_C(uint8* dst, uint32 v32, int count);
			
 
				-void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
			
 
				-                   int height);
			
 
				-
			
 
				-// ARGBShufflers for BGRAToARGB etc.
			
 
				-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
			
 
				-                      const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                          const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                    const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                              const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             const uint8* shuffler, int pix);
			
 
				-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             const uint8* shuffler, int pix);
			
 
				-
			
 
				-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
			
 
				-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
			
 
				-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
			
 
				-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                            int pix);
			
 
				-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                            int pix);
			
 
				-
			
 
				-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
			
 
				-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
			
 
				-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
			
 
				-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                            int pix);
			
 
				-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                            int pix);
			
 
				-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
			
 
				-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
			
 
				-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
			
 
				-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
			
 
				-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
			
 
				-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
			
 
				-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
			
 
				-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
			
 
				-                              int pix);
			
 
				-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                                int pix);
			
 
				-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                                int pix);
			
 
				-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
			
 
				-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
			
 
				-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
			
 
				-                              int pix);
			
 
				-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                                int pix);
			
 
				-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                                int pix);
			
 
				-
			
 
				-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-
			
 
				-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-
			
 
				-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-
			
 
				-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
			
 
				-
			
 
				-void I444ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void I422ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void I411ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void NV12ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_uv,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void NV21ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* src_vu,
			
 
				-                       uint8* dst_argb,
			
 
				-                       int width);
			
 
				-void NV12ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* src_uv,
			
 
				-                       uint8* dst_argb,
			
 
				-                       int width);
			
 
				-void NV21ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_vu,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void YUY2ToARGBRow_C(const uint8* src_yuy2,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void UYVYToARGBRow_C(const uint8* src_uyvy,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void I422ToBGRARow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_bgra,
			
 
				-                     int width);
			
 
				-void I422ToABGRRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_abgr,
			
 
				-                     int width);
			
 
				-void I422ToRGBARow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_rgba,
			
 
				-                     int width);
			
 
				-void I422ToRGB24Row_C(const uint8* src_y,
			
 
				-                      const uint8* src_u,
			
 
				-                      const uint8* src_v,
			
 
				-                      uint8* dst_rgb24,
			
 
				-                      int width);
			
 
				-void I422ToRAWRow_C(const uint8* src_y,
			
 
				-                    const uint8* src_u,
			
 
				-                    const uint8* src_v,
			
 
				-                    uint8* dst_raw,
			
 
				-                    int width);
			
 
				-void I422ToARGB4444Row_C(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb4444,
			
 
				-                         int width);
			
 
				-void I422ToARGB1555Row_C(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb4444,
			
 
				-                         int width);
			
 
				-void I422ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* src_u,
			
 
				-                       const uint8* src_v,
			
 
				-                       uint8* dst_rgb565,
			
 
				-                       int width);
			
 
				-void YToARGBRow_C(const uint8* src_y,
			
 
				-                  uint8* dst_argb,
			
 
				-                  int width);
			
 
				-void I422ToARGBRow_AVX2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width);
			
 
				-void I444ToARGBRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void I422ToARGBRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void I411ToARGBRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void NV12ToARGBRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_uv,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void NV21ToARGBRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_vu,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_uv,
			
 
				-                           uint8* dst_argb,
			
 
				-                           int width);
			
 
				-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_vu,
			
 
				-                           uint8* dst_argb,
			
 
				-                           int width);
			
 
				-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void I422ToBGRARow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_bgra,
			
 
				-                         int width);
			
 
				-void I422ToABGRRow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_abgr,
			
 
				-                         int width);
			
 
				-void I422ToRGBARow_SSSE3(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_rgba,
			
 
				-                         int width);
			
 
				-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I422ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_u,
			
 
				-                           const uint8* src_v,
			
 
				-                           uint8* dst_argb,
			
 
				-                           int width);
			
 
				-// RGB24/RAW are unaligned.
			
 
				-void I422ToRGB24Row_SSSE3(const uint8* src_y,
			
 
				-                          const uint8* src_u,
			
 
				-                          const uint8* src_v,
			
 
				-                          uint8* dst_rgb24,
			
 
				-                          int width);
			
 
				-void I422ToRAWRow_SSSE3(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_raw,
			
 
				-                        int width);
			
 
				-
			
 
				-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_uv,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_vu,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width);
			
 
				-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_bgra,
			
 
				-                                   int width);
			
 
				-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_abgr,
			
 
				-                                   int width);
			
 
				-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
			
 
				-                                   const uint8* src_u,
			
 
				-                                   const uint8* src_v,
			
 
				-                                   uint8* dst_rgba,
			
 
				-                                   int width);
			
 
				-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_uv,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_vu,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
			
 
				-                               const uint8* src_uv,
			
 
				-                               uint8* dst_argb,
			
 
				-                               int width);
			
 
				-void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
			
 
				-                               const uint8* src_vu,
			
 
				-                               uint8* dst_argb,
			
 
				-                               int width);
			
 
				-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_bgra,
			
 
				-                             int width);
			
 
				-void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_abgr,
			
 
				-                             int width);
			
 
				-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_rgba,
			
 
				-                             int width);
			
 
				-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
			
 
				-                                 const uint8* src_u,
			
 
				-                                 const uint8* src_v,
			
 
				-                                 uint8* dst_rgba,
			
 
				-                                 int width);
			
 
				-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
			
 
				-                                 const uint8* src_u,
			
 
				-                                 const uint8* src_v,
			
 
				-                                 uint8* dst_rgba,
			
 
				-                                 int width);
			
 
				-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
			
 
				-                               const uint8* src_u,
			
 
				-                               const uint8* src_v,
			
 
				-                               uint8* dst_rgba,
			
 
				-                               int width);
			
 
				-// RGB24/RAW are unaligned.
			
 
				-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void YToARGBRow_SSE2(const uint8* src_y,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void YToARGBRow_NEON(const uint8* src_y,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width);
			
 
				-void YToARGBRow_Any_SSE2(const uint8* src_y,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-void YToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width);
			
 
				-
			
 
				-// ARGB preattenuated alpha blend.
			
 
				-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                        uint8* dst_argb, int width);
			
 
				-void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width);
			
 
				-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width);
			
 
				-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                    uint8* dst_argb, int width);
			
 
				-
			
 
				-// ARGB multiply images. Same API as Blend, but these require
			
 
				-// pointer and width alignment for SSE2.
			
 
				-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-
			
 
				-// ARGB add images.
			
 
				-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                  uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                         uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                         uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width);
			
 
				-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                         uint8* dst_argb, int width);
			
 
				-
			
 
				-// ARGB subtract images. Same API as Blend, but these require
			
 
				-// pointer and width alignment for SSE2.
			
 
				-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width);
			
 
				-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                              uint8* dst_argb, int width);
			
 
				-
			
 
				-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-
			
 
				-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
			
 
				-
			
 
				-void I444ToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I411ToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToBGRARow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToABGRRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToRGBARow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* dst_argb,
			
 
				-                             int width);
			
 
				-void I422ToRAWRow_Any_NEON(const uint8* src_y,
			
 
				-                           const uint8* src_u,
			
 
				-                           const uint8* src_v,
			
 
				-                           uint8* dst_argb,
			
 
				-                           int width);
			
 
				-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
			
 
				-                                const uint8* src_u,
			
 
				-                                const uint8* src_v,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width);
			
 
				-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
			
 
				-                                const uint8* src_u,
			
 
				-                                const uint8* src_v,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width);
			
 
				-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_uv,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_uv,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
			
 
				-                              const uint8* src_uv,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
			
 
				-                              const uint8* src_uv,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
			
 
				-                            uint8* dst_argb,
			
 
				-                            int width);
			
 
				-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
			
 
				-                              const uint8* src_u,
			
 
				-                              const uint8* src_v,
			
 
				-                              uint8* dst_argb,
			
 
				-                              int width);
			
 
				-
			
 
				-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                               uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                   uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_C(const uint8* src_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
			
 
				-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                               uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix);
			
 
				-
			
 
				-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                   uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_C(const uint8* src_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
			
 
				-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix);
			
 
				-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
			
 
				-                             uint8* dst_u, uint8* dst_v, int pix);
			
 
				-
			
 
				-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
			
 
				-               uint8* dst_uv, int pix);
			
 
				-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix);
			
 
				-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix);
			
 
				-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix);
			
 
				-
			
 
				-void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                      uint32 selector, int pix);
			
 
				-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                          uint32 selector, int pix);
			
 
				-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                         uint32 selector, int pix);
			
 
				-void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                              uint32 selector, int pix);
			
 
				-void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                             uint32 selector, int pix);
			
 
				-void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                        uint32 /* selector */, int pix);
			
 
				-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                           uint32 /* selector */, int pix);
			
 
				-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                           uint32 /* selector */, int pix);
			
 
				-void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                               uint32 /* selector */, int pix);
			
 
				-void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                               uint32 /* selector */, int pix);
			
 
				-
			
 
				-void I422ToYUY2Row_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_yuy2, int width);
			
 
				-void I422ToUYVYRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_uyvy, int width);
			
 
				-void I422ToYUY2Row_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_yuy2, int width);
			
 
				-void I422ToUYVYRow_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_uyvy, int width);
			
 
				-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_yuy2, int width);
			
 
				-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_uyvy, int width);
			
 
				-void I422ToYUY2Row_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_yuy2, int width);
			
 
				-void I422ToUYVYRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_uyvy, int width);
			
 
				-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_yuy2, int width);
			
 
				-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_uyvy, int width);
			
 
				-
			
 
				-// Effects related row functions.
			
 
				-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                               int width);
			
 
				-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                int width);
			
 
				-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                               int width);
			
 
				-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                               int width);
			
 
				-
			
 
				-// Inverse table for unattenuate, shared by C and SSE2.
			
 
				-extern const uint32 fixed_invtbl8[256];
			
 
				-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                 int width);
			
 
				-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                 int width);
			
 
				-
			
 
				-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
			
 
				-
			
 
				-void ARGBSepiaRow_C(uint8* dst_argb, int width);
			
 
				-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
			
 
				-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
			
 
				-
			
 
				-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
			
 
				-                          const int8* matrix_argb, int width);
			
 
				-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                              const int8* matrix_argb, int width);
			
 
				-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             const int8* matrix_argb, int width);
			
 
				-
			
 
				-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
			
 
				-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
			
 
				-
			
 
				-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
			
 
				-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
			
 
				-
			
 
				-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
			
 
				-                       int interval_offset, int width);
			
 
				-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width);
			
 
				-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width);
			
 
				-
			
 
				-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                    uint32 value);
			
 
				-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                       uint32 value);
			
 
				-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                       uint32 value);
			
 
				-
			
 
				-// Used for blur.
			
 
				-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
			
 
				-                                    int width, int area, uint8* dst, int count);
			
 
				-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
			
 
				-                                  const int32* previous_cumsum, int width);
			
 
				-
			
 
				-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
			
 
				-                                 int width, int area, uint8* dst, int count);
			
 
				-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
			
 
				-                               const int32* previous_cumsum, int width);
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
			
 
				-                     uint8* dst_argb, const float* uv_dudv, int width);
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
			
 
				-                        uint8* dst_argb, const float* uv_dudv, int width);
			
 
				-
			
 
				-// Used for I420Scale, ARGBScale, and ARGBInterpolate.
			
 
				-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                      ptrdiff_t src_stride_ptr,
			
 
				-                      int width, int source_y_fraction);
			
 
				-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride_ptr, int width,
			
 
				-                         int source_y_fraction);
			
 
				-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                          ptrdiff_t src_stride_ptr, int width,
			
 
				-                          int source_y_fraction);
			
 
				-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride_ptr, int width,
			
 
				-                         int source_y_fraction);
			
 
				-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride_ptr, int width,
			
 
				-                         int source_y_fraction);
			
 
				-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride_ptr, int width,
			
 
				-                                int source_y_fraction);
			
 
				-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                   ptrdiff_t src_stride_ptr, int width,
			
 
				-                                   int source_y_fraction);
			
 
				-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                    ptrdiff_t src_stride_ptr, int width,
			
 
				-                                    int source_y_fraction);
			
 
				-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                             ptrdiff_t src_stride_ptr, int width,
			
 
				-                             int source_y_fraction);
			
 
				-void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                             ptrdiff_t src_stride_ptr, int width,
			
 
				-                             int source_y_fraction);
			
 
				-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                              ptrdiff_t src_stride_ptr, int width,
			
 
				-                              int source_y_fraction);
			
 
				-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                             ptrdiff_t src_stride_ptr, int width,
			
 
				-                             int source_y_fraction);
			
 
				-void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                    ptrdiff_t src_stride_ptr, int width,
			
 
				-                                    int source_y_fraction);
			
 
				-
			
 
				-// Sobel images.
			
 
				-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
			
 
				-                 uint8* dst_sobelx, int width);
			
 
				-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobelx, int width);
			
 
				-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobelx, int width);
			
 
				-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
			
 
				-                 uint8* dst_sobely, int width);
			
 
				-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width);
			
 
				-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width);
			
 
				-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                uint8* dst_argb, int width);
			
 
				-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                   uint8* dst_argb, int width);
			
 
				-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                   uint8* dst_argb, int width);
			
 
				-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                       uint8* dst_y, int width);
			
 
				-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_y, int width);
			
 
				-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_y, int width);
			
 
				-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                  uint8* dst_argb, int width);
			
 
				-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width);
			
 
				-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width);
			
 
				-
			
 
				-void ARGBPolynomialRow_C(const uint8* src_argb,
			
 
				-                         uint8* dst_argb, const float* poly,
			
 
				-                         int width);
			
 
				-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width);
			
 
				-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width);
			
 
				-
			
 
				-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                             const uint8* luma, uint32 lumacoeff);
			
 
				-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                 int width,
			
 
				-                                 const uint8* luma, uint32 lumacoeff);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale.h
@@ -1,85 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_SCALE_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Supported filtering.
			
 
				-typedef enum FilterMode {
			
 
				-  kFilterNone = 0,  // Point sample; Fastest.
			
 
				-  kFilterLinear = 1,  // Filter horizontally only.
			
 
				-  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
			
 
				-  kFilterBox = 3  // Highest quality.
			
 
				-} FilterModeEnum;
			
 
				-
			
 
				-// Scale a YUV plane.
			
 
				-LIBYUV_API
			
 
				-void ScalePlane(const uint8* src, int src_stride,
			
 
				-                int src_width, int src_height,
			
 
				-                uint8* dst, int dst_stride,
			
 
				-                int dst_width, int dst_height,
			
 
				-                enum FilterMode filtering);
			
 
				-
			
 
				-// Scales a YUV 4:2:0 image from the src width and height to the
			
 
				-// dst width and height.
			
 
				-// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
			
 
				-// used. This produces basic (blocky) quality at the fastest speed.
			
 
				-// If filtering is kFilterBilinear, interpolation is used to produce a better
			
 
				-// quality image, at the expense of speed.
			
 
				-// If filtering is kFilterBox, averaging is used to produce ever better
			
 
				-// quality image, at further expense of speed.
			
 
				-// Returns 0 if successful.
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420Scale(const uint8* src_y, int src_stride_y,
			
 
				-              const uint8* src_u, int src_stride_u,
			
 
				-              const uint8* src_v, int src_stride_v,
			
 
				-              int src_width, int src_height,
			
 
				-              uint8* dst_y, int dst_stride_y,
			
 
				-              uint8* dst_u, int dst_stride_u,
			
 
				-              uint8* dst_v, int dst_stride_v,
			
 
				-              int dst_width, int dst_height,
			
 
				-              enum FilterMode filtering);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-// Legacy API.  Deprecated.
			
 
				-LIBYUV_API
			
 
				-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
			
 
				-          int src_stride_y, int src_stride_u, int src_stride_v,
			
 
				-          int src_width, int src_height,
			
 
				-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
			
 
				-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
			
 
				-          int dst_width, int dst_height,
			
 
				-          LIBYUV_BOOL interpolate);
			
 
				-
			
 
				-// Legacy API.  Deprecated.
			
 
				-LIBYUV_API
			
 
				-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
			
 
				-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
			
 
				-                LIBYUV_BOOL interpolate);
			
 
				-
			
 
				-// For testing, allow disabling of specialized scalers.
			
 
				-LIBYUV_API
			
 
				-void SetUseReferenceImpl(LIBYUV_BOOL use);
			
 
				-#endif  // __cplusplus
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale_argb.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale_argb.h
@@ -1,57 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_SCALE_ARGB_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/scale.h"  // For FilterMode
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBScale(const uint8* src_argb, int src_stride_argb,
			
 
				-              int src_width, int src_height,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int dst_width, int dst_height,
			
 
				-              enum FilterMode filtering);
			
 
				-
			
 
				-// Clipped scale takes destination rectangle coordinates for clip values.
			
 
				-LIBYUV_API
			
 
				-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
			
 
				-                  int src_width, int src_height,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int dst_width, int dst_height,
			
 
				-                  int clip_x, int clip_y, int clip_width, int clip_height,
			
 
				-                  enum FilterMode filtering);
			
 
				-
			
 
				-// TODO(fbarchard): Implement this.
			
 
				-// Scale with YUV conversion to ARGB and clipping.
			
 
				-LIBYUV_API
			
 
				-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
			
 
				-                       const uint8* src_u, int src_stride_u,
			
 
				-                       const uint8* src_v, int src_stride_v,
			
 
				-                       uint32 src_fourcc,
			
 
				-                       int src_width, int src_height,
			
 
				-                       uint8* dst_argb, int dst_stride_argb,
			
 
				-                       uint32 dst_fourcc,
			
 
				-                       int dst_width, int dst_height,
			
 
				-                       int clip_x, int clip_y, int clip_width, int clip_height,
			
 
				-                       enum FilterMode filtering);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale_row.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/scale_row.h
@@ -1,301 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_SCALE_ROW_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
			
 
				-    defined(TARGET_IPHONE_SIMULATOR)
			
 
				-#define LIBYUV_DISABLE_X86
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on all x86 platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
			
 
				-#define HAS_SCALEROWDOWN2_SSE2
			
 
				-#define HAS_SCALEROWDOWN4_SSE2
			
 
				-#define HAS_SCALEROWDOWN34_SSSE3
			
 
				-#define HAS_SCALEROWDOWN38_SSSE3
			
 
				-#define HAS_SCALEADDROWS_SSE2
			
 
				-#define HAS_SCALEFILTERCOLS_SSSE3
			
 
				-#define HAS_SCALECOLSUP2_SSE2
			
 
				-#define HAS_SCALEARGBROWDOWN2_SSE2
			
 
				-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
			
 
				-#define HAS_SCALEARGBCOLS_SSE2
			
 
				-#define HAS_SCALEARGBFILTERCOLS_SSSE3
			
 
				-#define HAS_SCALEARGBCOLSUP2_SSE2
			
 
				-#define HAS_FIXEDDIV_X86
			
 
				-#define HAS_FIXEDDIV1_X86
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on Neon platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-#define HAS_SCALEROWDOWN2_NEON
			
 
				-#define HAS_SCALEROWDOWN4_NEON
			
 
				-#define HAS_SCALEROWDOWN34_NEON
			
 
				-#define HAS_SCALEROWDOWN38_NEON
			
 
				-#define HAS_SCALEARGBROWDOWNEVEN_NEON
			
 
				-#define HAS_SCALEARGBROWDOWN2_NEON
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on Mips platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
			
 
				-    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-#define HAS_SCALEROWDOWN2_MIPS_DSPR2
			
 
				-#define HAS_SCALEROWDOWN4_MIPS_DSPR2
			
 
				-#define HAS_SCALEROWDOWN34_MIPS_DSPR2
			
 
				-#define HAS_SCALEROWDOWN38_MIPS_DSPR2
			
 
				-#endif
			
 
				-
			
 
				-// Scale ARGB vertically with bilinear interpolation.
			
 
				-void ScalePlaneVertical(int src_height,
			
 
				-                        int dst_width, int dst_height,
			
 
				-                        int src_stride, int dst_stride,
			
 
				-                        const uint8* src_argb, uint8* dst_argb,
			
 
				-                        int x, int y, int dy,
			
 
				-                        int bpp, enum FilterMode filtering);
			
 
				-
			
 
				-// Simplify the filtering based on scale factors.
			
 
				-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
			
 
				-                                  int dst_width, int dst_height,
			
 
				-                                  enum FilterMode filtering);
			
 
				-
			
 
				-// Divide num by div and return as 16.16 fixed point result.
			
 
				-int FixedDiv_C(int num, int div);
			
 
				-int FixedDiv_X86(int num, int div);
			
 
				-// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
			
 
				-int FixedDiv1_C(int num, int div);
			
 
				-int FixedDiv1_X86(int num, int div);
			
 
				-#ifdef HAS_FIXEDDIV_X86
			
 
				-#define FixedDiv FixedDiv_X86
			
 
				-#define FixedDiv1 FixedDiv1_X86
			
 
				-#else
			
 
				-#define FixedDiv FixedDiv_C
			
 
				-#define FixedDiv1 FixedDiv1_C
			
 
				-#endif
			
 
				-
			
 
				-// Compute slope values for stepping.
			
 
				-void ScaleSlope(int src_width, int src_height,
			
 
				-                int dst_width, int dst_height,
			
 
				-                enum FilterMode filtering,
			
 
				-                int* x, int* y, int* dx, int* dy);
			
 
				-
			
 
				-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                     uint8* dst, int dst_width);
			
 
				-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst, int dst_width);
			
 
				-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst, int dst_width);
			
 
				-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                     uint8* dst, int dst_width);
			
 
				-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst, int dst_width);
			
 
				-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                      uint8* dst, int dst_width);
			
 
				-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* d, int dst_width);
			
 
				-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* d, int dst_width);
			
 
				-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                 int dst_width, int x, int dx);
			
 
				-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                    int dst_width, int, int);
			
 
				-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                       int dst_width, int x, int dx);
			
 
				-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         int dst_width, int x, int dx);
			
 
				-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                      uint8* dst, int dst_width);
			
 
				-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
			
 
				-                            ptrdiff_t src_stride,
			
 
				-                            uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* dst_ptr, int dst_width);
			
 
				-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                    uint16* dst_ptr, int src_width, int src_height);
			
 
				-void ScaleARGBRowDown2_C(const uint8* src_argb,
			
 
				-                         ptrdiff_t src_stride,
			
 
				-                         uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                            uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                            int src_stepx,
			
 
				-                            uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                     int dst_width, int x, int dx);
			
 
				-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                       int dst_width, int x, int dx);
			
 
				-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                        int dst_width, int, int);
			
 
				-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                           int dst_width, int x, int dx);
			
 
				-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                             int dst_width, int x, int dx);
			
 
				-
			
 
				-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                              uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
			
 
				-                                  ptrdiff_t src_stride,
			
 
				-                                  uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
			
 
				-                                        ptrdiff_t src_stride,
			
 
				-                                        uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
			
 
				-                                     ptrdiff_t src_stride,
			
 
				-                                     uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                          uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride,
			
 
				-                                uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride,
			
 
				-                                uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                          uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride,
			
 
				-                                uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride,
			
 
				-                                uint8* dst_ptr, int dst_width);
			
 
				-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                       uint16* dst_ptr, int src_width,
			
 
				-                       int src_height);
			
 
				-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                           int dst_width, int x, int dx);
			
 
				-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                       int dst_width, int x, int dx);
			
 
				-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
			
 
				-                            ptrdiff_t src_stride,
			
 
				-                            uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
			
 
				-                                  ptrdiff_t src_stride,
			
 
				-                                  uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
			
 
				-                                  ptrdiff_t src_stride,
			
 
				-                                  int src_stepx,
			
 
				-                                  uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
			
 
				-                        int dst_width, int x, int dx);
			
 
				-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
			
 
				-                               int dst_width, int x, int dx);
			
 
				-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
			
 
				-                           int dst_width, int x, int dx);
			
 
				-// Row functions.
			
 
				-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
			
 
				-                                  int src_stepx,
			
 
				-                                  uint8* dst_argb, int dst_width);
			
 
				-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* dst, int dst_width);
			
 
				-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width);
			
 
				-
			
 
				-// ScaleRowDown2Box also used by planar functions
			
 
				-// NEON downscalers with interpolation.
			
 
				-
			
 
				-// Note - not static due to reuse in convert for 444 to 420.
			
 
				-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst, int dst_width);
			
 
				-
			
 
				-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst, int dst_width);
			
 
				-
			
 
				-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-
			
 
				-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
			
 
				-//  to load up the every 4th pixel into a 4 different registers.
			
 
				-// Point samples 32 pixels to 24 pixels.
			
 
				-void ScaleRowDown34_NEON(const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride,
			
 
				-                         uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-
			
 
				-// 32 -> 12
			
 
				-void ScaleRowDown38_NEON(const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride,
			
 
				-                         uint8* dst_ptr, int dst_width);
			
 
				-// 32x3 -> 12x1
			
 
				-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-// 32x2 -> 12x1
			
 
				-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-
			
 
				-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                              uint8* dst, int dst_width);
			
 
				-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                 uint8* dst, int dst_width);
			
 
				-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                              uint8* dst, int dst_width);
			
 
				-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                 uint8* dst, int dst_width);
			
 
				-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width);
			
 
				-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* d, int dst_width);
			
 
				-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* d, int dst_width);
			
 
				-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width);
			
 
				-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* dst_ptr, int dst_width);
			
 
				-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
			
 
				-                                     ptrdiff_t src_stride,
			
 
				-                                     uint8* dst_ptr, int dst_width);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/version.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/version.h
@@ -1,16 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_VERSION_H_
			
 
				-
			
 
				-#define LIBYUV_VERSION 998
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/video_common.h
+++ b/drivers/theoraplayer/src/YUV/libyuv/include/libyuv/video_common.h
@@ -1,182 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-// Common definitions for video, including fourcc and VideoFormat.
			
 
				-
			
 
				-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
			
 
				-#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-//////////////////////////////////////////////////////////////////////////////
			
 
				-// Definition of FourCC codes
			
 
				-//////////////////////////////////////////////////////////////////////////////
			
 
				-
			
 
				-// Convert four characters to a FourCC code.
			
 
				-// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
			
 
				-// constants are used in a switch.
			
 
				-#ifdef __cplusplus
			
 
				-#define FOURCC(a, b, c, d) ( \
			
 
				-    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
			
 
				-    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
			
 
				-#else
			
 
				-#define FOURCC(a, b, c, d) ( \
			
 
				-    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
			
 
				-    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
			
 
				-#endif
			
 
				-
			
 
				-// Some pages discussing FourCC codes:
			
 
				-//   http://www.fourcc.org/yuv.php
			
 
				-//   http://v4l2spec.bytesex.org/spec/book1.htm
			
 
				-//   http://developer.apple.com/quicktime/icefloe/dispatch020.html
			
 
				-//   http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
			
 
				-//   http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
			
 
				-
			
 
				-// FourCC codes grouped according to implementation efficiency.
			
 
				-// Primary formats should convert in 1 efficient step.
			
 
				-// Secondary formats are converted in 2 steps.
			
 
				-// Auxilliary formats call primary converters.
			
 
				-enum FourCC {
			
 
				-  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
			
 
				-  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
			
 
				-  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
			
 
				-  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
			
 
				-  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
			
 
				-  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
			
 
				-  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
			
 
				-  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
			
 
				-  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
			
 
				-  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
			
 
				-
			
 
				-  // 2 Secondary YUV formats: row biplanar.
			
 
				-  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
			
 
				-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
			
 
				-
			
 
				-  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
			
 
				-  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
			
 
				-  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
			
 
				-  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
			
 
				-  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
			
 
				-  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
			
 
				-  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
			
 
				-  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
			
 
				-  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
			
 
				-  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
			
 
				-
			
 
				-  // 4 Secondary RGB formats: 4 Bayer Patterns.
			
 
				-  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
			
 
				-  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
			
 
				-  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
			
 
				-  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
			
 
				-
			
 
				-  // 1 Primary Compressed YUV format.
			
 
				-  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
			
 
				-
			
 
				-  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
			
 
				-  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
			
 
				-  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
			
 
				-  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
			
 
				-  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
			
 
				-  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
			
 
				-  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
			
 
				-
			
 
				-  // 14 Auxiliary aliases.  CanonicalFourCC() maps these to canonical fourcc.
			
 
				-  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
			
 
				-  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
			
 
				-  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
			
 
				-  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
			
 
				-  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
			
 
				-  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
			
 
				-  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
			
 
				-  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
			
 
				-  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
			
 
				-  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
			
 
				-  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
			
 
				-  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
			
 
				-  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
			
 
				-  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
			
 
				-  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
			
 
				-  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
			
 
				-  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
			
 
				-
			
 
				-  // 1 Auxiliary compressed YUV format set aside for capturer.
			
 
				-  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
			
 
				-
			
 
				-  // Match any fourcc.
			
 
				-  FOURCC_ANY  = 0xFFFFFFFF,
			
 
				-};
			
 
				-
			
 
				-enum FourCCBpp {
			
 
				-  // Canonical fourcc codes used in our code.
			
 
				-  FOURCC_BPP_I420 = 12,
			
 
				-  FOURCC_BPP_I422 = 16,
			
 
				-  FOURCC_BPP_I444 = 24,
			
 
				-  FOURCC_BPP_I411 = 12,
			
 
				-  FOURCC_BPP_I400 = 8,
			
 
				-  FOURCC_BPP_NV21 = 12,
			
 
				-  FOURCC_BPP_NV12 = 12,
			
 
				-  FOURCC_BPP_YUY2 = 16,
			
 
				-  FOURCC_BPP_UYVY = 16,
			
 
				-  FOURCC_BPP_M420 = 12,
			
 
				-  FOURCC_BPP_Q420 = 12,
			
 
				-  FOURCC_BPP_ARGB = 32,
			
 
				-  FOURCC_BPP_BGRA = 32,
			
 
				-  FOURCC_BPP_ABGR = 32,
			
 
				-  FOURCC_BPP_RGBA = 32,
			
 
				-  FOURCC_BPP_24BG = 24,
			
 
				-  FOURCC_BPP_RAW  = 24,
			
 
				-  FOURCC_BPP_RGBP = 16,
			
 
				-  FOURCC_BPP_RGBO = 16,
			
 
				-  FOURCC_BPP_R444 = 16,
			
 
				-  FOURCC_BPP_RGGB = 8,
			
 
				-  FOURCC_BPP_BGGR = 8,
			
 
				-  FOURCC_BPP_GRBG = 8,
			
 
				-  FOURCC_BPP_GBRG = 8,
			
 
				-  FOURCC_BPP_YV12 = 12,
			
 
				-  FOURCC_BPP_YV16 = 16,
			
 
				-  FOURCC_BPP_YV24 = 24,
			
 
				-  FOURCC_BPP_YU12 = 12,
			
 
				-  FOURCC_BPP_J420 = 12,
			
 
				-  FOURCC_BPP_J400 = 8,
			
 
				-  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
			
 
				-  FOURCC_BPP_H264 = 0,
			
 
				-  FOURCC_BPP_IYUV = 12,
			
 
				-  FOURCC_BPP_YU16 = 16,
			
 
				-  FOURCC_BPP_YU24 = 24,
			
 
				-  FOURCC_BPP_YUYV = 16,
			
 
				-  FOURCC_BPP_YUVS = 16,
			
 
				-  FOURCC_BPP_HDYC = 16,
			
 
				-  FOURCC_BPP_2VUY = 16,
			
 
				-  FOURCC_BPP_JPEG = 1,
			
 
				-  FOURCC_BPP_DMB1 = 1,
			
 
				-  FOURCC_BPP_BA81 = 8,
			
 
				-  FOURCC_BPP_RGB3 = 24,
			
 
				-  FOURCC_BPP_BGR3 = 24,
			
 
				-  FOURCC_BPP_CM32 = 32,
			
 
				-  FOURCC_BPP_CM24 = 24,
			
 
				-
			
 
				-  // Match any fourcc.
			
 
				-  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
			
 
				-};
			
 
				-
			
 
				-// Converts fourcc aliases into canonical ones.
			
 
				-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
 
				-#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
			
--- a/drivers/theoraplayer/src/YUV/libyuv/libtheoraplayer-readme.txt
+++ b/drivers/theoraplayer/src/YUV/libyuv/libtheoraplayer-readme.txt
@@ -1,15 +0,0 @@
 
				-libyuv's source code is here provided in minimalist distribution format
			
 
				-with all source files not needed for compiling libtheoraplayer removed.
			
 
				-
			
 
				-- The project files were modified to fit libtheoraplayer's binary output
			
 
				-  folder structure.
			
 
				-- Some project files missing in the original source distibution were added to support
			
 
				-  compiling the libtheoraplayer on those platforms.
			
 
				-- Also, some code may have been changed to address certain compiler/platform
			
 
				-  specific problems and is so indicated in the source code.
			
 
				-
			
 
				-libyuv is owned and maintained by the Google Inc. and this distribution
			
 
				-is present here only for convenience and easier compilation of libtheoraplayer.
			
 
				-
			
 
				-If you want to use libyuv outside of libtheoraplayer, it is encouraged to use the
			
 
				-original source distribution by Google Inc: https://code.google.com/p/libyuv/
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/compare.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/compare.cc
@@ -1,325 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/compare.h"
			
 
				-
			
 
				-#include <float.h>
			
 
				-#include <math.h>
			
 
				-#ifdef _OPENMP
			
 
				-#include <omp.h>
			
 
				-#endif
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// hash seed of 5381 recommended.
			
 
				-// Internal C version of HashDjb2 with int sized count for efficiency.
			
 
				-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
			
 
				-
			
 
				-// This module is for Visual C x86
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || \
			
 
				-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
			
 
				-#define HAS_HASHDJB2_SSE41
			
 
				-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
			
 
				-
			
 
				-#if _MSC_VER >= 1700
			
 
				-#define HAS_HASHDJB2_AVX2
			
 
				-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
			
 
				-#endif
			
 
				-
			
 
				-#endif  // HAS_HASHDJB2_SSE41
			
 
				-
			
 
				-// hash seed of 5381 recommended.
			
 
				-LIBYUV_API
			
 
				-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
			
 
				-  const int kBlockSize = 1 << 15;  // 32768;
			
 
				-  int remainder;
			
 
				-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
			
 
				-#if defined(HAS_HASHDJB2_SSE41)
			
 
				-  if (TestCpuFlag(kCpuHasSSE41)) {
			
 
				-    HashDjb2_SSE = HashDjb2_SSE41;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_HASHDJB2_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2)) {
			
 
				-    HashDjb2_SSE = HashDjb2_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  while (count >= (uint64)(kBlockSize)) {
			
 
				-    seed = HashDjb2_SSE(src, kBlockSize, seed);
			
 
				-    src += kBlockSize;
			
 
				-    count -= kBlockSize;
			
 
				-  }
			
 
				-  remainder = (int)(count) & ~15;
			
 
				-  if (remainder) {
			
 
				-    seed = HashDjb2_SSE(src, remainder, seed);
			
 
				-    src += remainder;
			
 
				-    count -= remainder;
			
 
				-  }
			
 
				-  remainder = (int)(count) & 15;
			
 
				-  if (remainder) {
			
 
				-    seed = HashDjb2_C(src, remainder, seed);
			
 
				-  }
			
 
				-  return seed;
			
 
				-}
			
 
				-
			
 
				-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-#define HAS_SUMSQUAREERROR_NEON
			
 
				-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
			
 
				-#endif
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
			
 
				-#define HAS_SUMSQUAREERROR_SSE2
			
 
				-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
			
 
				-#endif
			
 
				-// Visual C 2012 required for AVX2.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
			
 
				-#define HAS_SUMSQUAREERROR_AVX2
			
 
				-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
			
 
				-#endif
			
 
				-
			
 
				-// TODO(fbarchard): Refactor into row function.
			
 
				-LIBYUV_API
			
 
				-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
			
 
				-                             int count) {
			
 
				-  // SumSquareError returns values 0 to 65535 for each squared difference.
			
 
				-  // Up to 65536 of those can be summed and remain within a uint32.
			
 
				-  // After each block of 65536 pixels, accumulate into a uint64.
			
 
				-  const int kBlockSize = 65536;
			
 
				-  int remainder = count & (kBlockSize - 1) & ~31;
			
 
				-  uint64 sse = 0;
			
 
				-  int i;
			
 
				-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
			
 
				-      SumSquareError_C;
			
 
				-#if defined(HAS_SUMSQUAREERROR_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    SumSquareError = SumSquareError_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SUMSQUAREERROR_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
			
 
				-    // Note only used for multiples of 16 so count is not checked.
			
 
				-    SumSquareError = SumSquareError_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SUMSQUAREERROR_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2)) {
			
 
				-    // Note only used for multiples of 32 so count is not checked.
			
 
				-    SumSquareError = SumSquareError_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-#ifdef _OPENMP
			
 
				-#pragma omp parallel for reduction(+: sse)
			
 
				-#endif
			
 
				-  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
			
 
				-    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
			
 
				-  }
			
 
				-  src_a += count & ~(kBlockSize - 1);
			
 
				-  src_b += count & ~(kBlockSize - 1);
			
 
				-  if (remainder) {
			
 
				-    sse += SumSquareError(src_a, src_b, remainder);
			
 
				-    src_a += remainder;
			
 
				-    src_b += remainder;
			
 
				-  }
			
 
				-  remainder = count & 31;
			
 
				-  if (remainder) {
			
 
				-    sse += SumSquareError_C(src_a, src_b, remainder);
			
 
				-  }
			
 
				-  return sse;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
			
 
				-                                  const uint8* src_b, int stride_b,
			
 
				-                                  int width, int height) {
			
 
				-  uint64 sse = 0;
			
 
				-  int h;
			
 
				-  // Coalesce rows.
			
 
				-  if (stride_a == width &&
			
 
				-      stride_b == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    stride_a = stride_b = 0;
			
 
				-  }
			
 
				-  for (h = 0; h < height; ++h) {
			
 
				-    sse += ComputeSumSquareError(src_a, src_b, width);
			
 
				-    src_a += stride_a;
			
 
				-    src_b += stride_b;
			
 
				-  }
			
 
				-  return sse;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
			
 
				-  double psnr;
			
 
				-  if (sse > 0) {
			
 
				-    double mse = (double)(count) / (double)(sse);
			
 
				-    psnr = 10.0 * log10(255.0 * 255.0 * mse);
			
 
				-  } else {
			
 
				-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
			
 
				-  }
			
 
				-
			
 
				-  if (psnr > kMaxPsnr)
			
 
				-    psnr = kMaxPsnr;
			
 
				-
			
 
				-  return psnr;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double CalcFramePsnr(const uint8* src_a, int stride_a,
			
 
				-                     const uint8* src_b, int stride_b,
			
 
				-                     int width, int height) {
			
 
				-  const uint64 samples = width * height;
			
 
				-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
			
 
				-                                                src_b, stride_b,
			
 
				-                                                width, height);
			
 
				-  return SumSquareErrorToPsnr(sse, samples);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double I420Psnr(const uint8* src_y_a, int stride_y_a,
			
 
				-                const uint8* src_u_a, int stride_u_a,
			
 
				-                const uint8* src_v_a, int stride_v_a,
			
 
				-                const uint8* src_y_b, int stride_y_b,
			
 
				-                const uint8* src_u_b, int stride_u_b,
			
 
				-                const uint8* src_v_b, int stride_v_b,
			
 
				-                int width, int height) {
			
 
				-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
			
 
				-                                                  src_y_b, stride_y_b,
			
 
				-                                                  width, height);
			
 
				-  const int width_uv = (width + 1) >> 1;
			
 
				-  const int height_uv = (height + 1) >> 1;
			
 
				-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
			
 
				-                                                  src_u_b, stride_u_b,
			
 
				-                                                  width_uv, height_uv);
			
 
				-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
			
 
				-                                                  src_v_b, stride_v_b,
			
 
				-                                                  width_uv, height_uv);
			
 
				-  const uint64 samples = width * height + 2 * (width_uv * height_uv);
			
 
				-  const uint64 sse = sse_y + sse_u + sse_v;
			
 
				-  return SumSquareErrorToPsnr(sse, samples);
			
 
				-}
			
 
				-
			
 
				-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
			
 
				-static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
			
 
				-
			
 
				-static double Ssim8x8_C(const uint8* src_a, int stride_a,
			
 
				-                        const uint8* src_b, int stride_b) {
			
 
				-  int64 sum_a = 0;
			
 
				-  int64 sum_b = 0;
			
 
				-  int64 sum_sq_a = 0;
			
 
				-  int64 sum_sq_b = 0;
			
 
				-  int64 sum_axb = 0;
			
 
				-
			
 
				-  int i;
			
 
				-  for (i = 0; i < 8; ++i) {
			
 
				-    int j;
			
 
				-    for (j = 0; j < 8; ++j) {
			
 
				-      sum_a += src_a[j];
			
 
				-      sum_b += src_b[j];
			
 
				-      sum_sq_a += src_a[j] * src_a[j];
			
 
				-      sum_sq_b += src_b[j] * src_b[j];
			
 
				-      sum_axb += src_a[j] * src_b[j];
			
 
				-    }
			
 
				-
			
 
				-    src_a += stride_a;
			
 
				-    src_b += stride_b;
			
 
				-  }
			
 
				-
			
 
				-  {
			
 
				-    const int64 count = 64;
			
 
				-    // scale the constants by number of pixels
			
 
				-    const int64 c1 = (cc1 * count * count) >> 12;
			
 
				-    const int64 c2 = (cc2 * count * count) >> 12;
			
 
				-
			
 
				-    const int64 sum_a_x_sum_b = sum_a * sum_b;
			
 
				-
			
 
				-    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
			
 
				-                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
			
 
				-
			
 
				-    const int64 sum_a_sq = sum_a*sum_a;
			
 
				-    const int64 sum_b_sq = sum_b*sum_b;
			
 
				-
			
 
				-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
			
 
				-                         (count * sum_sq_a - sum_a_sq +
			
 
				-                          count * sum_sq_b - sum_b_sq + c2);
			
 
				-
			
 
				-    if (ssim_d == 0.0) {
			
 
				-      return DBL_MAX;
			
 
				-    }
			
 
				-    return ssim_n * 1.0 / ssim_d;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// We are using a 8x8 moving window with starting location of each 8x8 window
			
 
				-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
			
 
				-// block boundaries to penalize blocking artifacts.
			
 
				-LIBYUV_API
			
 
				-double CalcFrameSsim(const uint8* src_a, int stride_a,
			
 
				-                     const uint8* src_b, int stride_b,
			
 
				-                     int width, int height) {
			
 
				-  int samples = 0;
			
 
				-  double ssim_total = 0;
			
 
				-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
			
 
				-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
			
 
				-
			
 
				-  // sample point start with each 4x4 location
			
 
				-  int i;
			
 
				-  for (i = 0; i < height - 8; i += 4) {
			
 
				-    int j;
			
 
				-    for (j = 0; j < width - 8; j += 4) {
			
 
				-      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
			
 
				-      samples++;
			
 
				-    }
			
 
				-
			
 
				-    src_a += stride_a * 4;
			
 
				-    src_b += stride_b * 4;
			
 
				-  }
			
 
				-
			
 
				-  ssim_total /= samples;
			
 
				-  return ssim_total;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-double I420Ssim(const uint8* src_y_a, int stride_y_a,
			
 
				-                const uint8* src_u_a, int stride_u_a,
			
 
				-                const uint8* src_v_a, int stride_v_a,
			
 
				-                const uint8* src_y_b, int stride_y_b,
			
 
				-                const uint8* src_u_b, int stride_u_b,
			
 
				-                const uint8* src_v_b, int stride_v_b,
			
 
				-                int width, int height) {
			
 
				-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
			
 
				-                                      src_y_b, stride_y_b, width, height);
			
 
				-  const int width_uv = (width + 1) >> 1;
			
 
				-  const int height_uv = (height + 1) >> 1;
			
 
				-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
			
 
				-                                      src_u_b, stride_u_b,
			
 
				-                                      width_uv, height_uv);
			
 
				-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
			
 
				-                                      src_v_b, stride_v_b,
			
 
				-                                      width_uv, height_uv);
			
 
				-  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/compare_common.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/compare_common.cc
@@ -1,42 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
			
 
				-  uint32 sse = 0u;
			
 
				-  int i;
			
 
				-  for (i = 0; i < count; ++i) {
			
 
				-    int diff = src_a[i] - src_b[i];
			
 
				-    sse += (uint32)(diff * diff);
			
 
				-  }
			
 
				-  return sse;
			
 
				-}
			
 
				-
			
 
				-// hash seed of 5381 recommended.
			
 
				-// Internal C version of HashDjb2 with int sized count for efficiency.
			
 
				-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
			
 
				-  uint32 hash = seed;
			
 
				-  int i;
			
 
				-  for (i = 0; i < count; ++i) {
			
 
				-    hash += (hash << 5) + src[i];
			
 
				-  }
			
 
				-  return hash;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/compare_neon.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/compare_neon.cc
@@ -1,64 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
			
 
				-
			
 
				-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
			
 
				-  volatile uint32 sse;
			
 
				-  asm volatile (
			
 
				-#ifdef _ANDROID
			
 
				-	".fpu neon\n"
			
 
				-#endif
			
 
				-    "vmov.u8    q8, #0                         \n"
			
 
				-    "vmov.u8    q10, #0                        \n"
			
 
				-    "vmov.u8    q9, #0                         \n"
			
 
				-    "vmov.u8    q11, #0                        \n"
			
 
				-
			
 
				-    ".p2align  2                               \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"
			
 
				-    "vld1.8     {q1}, [%1]!                    \n"
			
 
				-    "subs       %2, %2, #16                    \n"
			
 
				-    "vsubl.u8   q2, d0, d2                     \n"
			
 
				-    "vsubl.u8   q3, d1, d3                     \n"
			
 
				-    "vmlal.s16  q8, d4, d4                     \n"
			
 
				-    "vmlal.s16  q9, d6, d6                     \n"
			
 
				-    "vmlal.s16  q10, d5, d5                    \n"
			
 
				-    "vmlal.s16  q11, d7, d7                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-
			
 
				-    "vadd.u32   q8, q8, q9                     \n"
			
 
				-    "vadd.u32   q10, q10, q11                  \n"
			
 
				-    "vadd.u32   q11, q8, q10                   \n"
			
 
				-    "vpaddl.u32 q1, q11                        \n"
			
 
				-    "vadd.u64   d0, d2, d3                     \n"
			
 
				-    "vmov.32    %3, d0[0]                      \n"
			
 
				-    : "+r"(src_a),
			
 
				-      "+r"(src_b),
			
 
				-      "+r"(count),
			
 
				-      "=r"(sse)
			
 
				-    :
			
 
				-    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
			
 
				-  return sse;
			
 
				-}
			
 
				-
			
 
				-#endif  // __ARM_NEON__
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/compare_posix.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/compare_posix.cc
@@ -1,158 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
			
 
				-
			
 
				-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
			
 
				-  uint32 sse;
			
 
				-  asm volatile (  // NOLINT
			
 
				-    "pxor      %%xmm0,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10, 1) ",%1          \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm1,%%xmm3                   \n"
			
 
				-    "psubusb   %%xmm2,%%xmm1                   \n"
			
 
				-    "psubusb   %%xmm3,%%xmm2                   \n"
			
 
				-    "por       %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm1                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm2                   \n"
			
 
				-    "pmaddwd   %%xmm1,%%xmm1                   \n"
			
 
				-    "pmaddwd   %%xmm2,%%xmm2                   \n"
			
 
				-    "paddd     %%xmm1,%%xmm0                   \n"
			
 
				-    "paddd     %%xmm2,%%xmm0                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-
			
 
				-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
			
 
				-    "paddd     %%xmm1,%%xmm0                   \n"
			
 
				-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
			
 
				-    "paddd     %%xmm1,%%xmm0                   \n"
			
 
				-    "movd      %%xmm0,%3                       \n"
			
 
				-
			
 
				-  : "+r"(src_a),      // %0
			
 
				-    "+r"(src_b),      // %1
			
 
				-    "+r"(count),      // %2
			
 
				-    "=g"(sse)         // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );  // NOLINT
			
 
				-  return sse;
			
 
				-}
			
 
				-
			
 
				-#endif  // defined(__x86_64__) || defined(__i386__)
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
			
 
				-#define HAS_HASHDJB2_SSE41
			
 
				-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
			
 
				-static uvec32 kHashMul0 = {
			
 
				-  0x0c3525e1,  // 33 ^ 15
			
 
				-  0xa3476dc1,  // 33 ^ 14
			
 
				-  0x3b4039a1,  // 33 ^ 13
			
 
				-  0x4f5f0981,  // 33 ^ 12
			
 
				-};
			
 
				-static uvec32 kHashMul1 = {
			
 
				-  0x30f35d61,  // 33 ^ 11
			
 
				-  0x855cb541,  // 33 ^ 10
			
 
				-  0x040a9121,  // 33 ^ 9
			
 
				-  0x747c7101,  // 33 ^ 8
			
 
				-};
			
 
				-static uvec32 kHashMul2 = {
			
 
				-  0xec41d4e1,  // 33 ^ 7
			
 
				-  0x4cfa3cc1,  // 33 ^ 6
			
 
				-  0x025528a1,  // 33 ^ 5
			
 
				-  0x00121881,  // 33 ^ 4
			
 
				-};
			
 
				-static uvec32 kHashMul3 = {
			
 
				-  0x00008c61,  // 33 ^ 3
			
 
				-  0x00000441,  // 33 ^ 2
			
 
				-  0x00000021,  // 33 ^ 1
			
 
				-  0x00000001,  // 33 ^ 0
			
 
				-};
			
 
				-
			
 
				-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
			
 
				-  uint32 hash;
			
 
				-  asm volatile (  // NOLINT
			
 
				-    "movd      %2,%%xmm0                       \n"
			
 
				-    "pxor      %%xmm7,%%xmm7                   \n"
			
 
				-    "movdqa    %4,%%xmm6                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
			
 
				-    "pmulld    %%xmm6,%%xmm0                   \n"
			
 
				-    "movdqa    %5,%%xmm5                       \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklwd %%xmm7,%%xmm3                   \n"
			
 
				-    "pmulld    %%xmm5,%%xmm3                   \n"
			
 
				-    "movdqa    %6,%%xmm5                       \n"
			
 
				-    "movdqa    %%xmm2,%%xmm4                   \n"
			
 
				-    "punpckhwd %%xmm7,%%xmm4                   \n"
			
 
				-    "pmulld    %%xmm5,%%xmm4                   \n"
			
 
				-    "movdqa    %7,%%xmm5                       \n"
			
 
				-    "punpckhbw %%xmm7,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklwd %%xmm7,%%xmm2                   \n"
			
 
				-    "pmulld    %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %8,%%xmm5                       \n"
			
 
				-    "punpckhwd %%xmm7,%%xmm1                   \n"
			
 
				-    "pmulld    %%xmm5,%%xmm1                   \n"
			
 
				-    "paddd     %%xmm4,%%xmm3                   \n"
			
 
				-    "paddd     %%xmm2,%%xmm1                   \n"
			
 
				-    "sub       $0x10,%1                        \n"
			
 
				-    "paddd     %%xmm3,%%xmm1                   \n"
			
 
				-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
			
 
				-    "paddd     %%xmm2,%%xmm1                   \n"
			
 
				-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
			
 
				-    "paddd     %%xmm2,%%xmm1                   \n"
			
 
				-    "paddd     %%xmm1,%%xmm0                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "movd      %%xmm0,%3                       \n"
			
 
				-  : "+r"(src),        // %0
			
 
				-    "+r"(count),      // %1
			
 
				-    "+rm"(seed),      // %2
			
 
				-    "=g"(hash)        // %3
			
 
				-  : "m"(kHash16x33),  // %4
			
 
				-    "m"(kHashMul0),   // %5
			
 
				-    "m"(kHashMul1),   // %6
			
 
				-    "m"(kHashMul2),   // %7
			
 
				-    "m"(kHashMul3)    // %8
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );  // NOLINT
			
 
				-  return hash;
			
 
				-}
			
 
				-#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/compare_win.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/compare_win.cc
@@ -1,232 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_a
			
 
				-    mov        edx, [esp + 8]    // src_b
			
 
				-    mov        ecx, [esp + 12]   // count
			
 
				-    pxor       xmm0, xmm0
			
 
				-    pxor       xmm5, xmm5
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqa     xmm1, [eax]
			
 
				-    lea        eax,  [eax + 16]
			
 
				-    movdqa     xmm2, [edx]
			
 
				-    lea        edx,  [edx + 16]
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     xmm3, xmm1  // abs trick
			
 
				-    psubusb    xmm1, xmm2
			
 
				-    psubusb    xmm2, xmm3
			
 
				-    por        xmm1, xmm2
			
 
				-    movdqa     xmm2, xmm1
			
 
				-    punpcklbw  xmm1, xmm5
			
 
				-    punpckhbw  xmm2, xmm5
			
 
				-    pmaddwd    xmm1, xmm1
			
 
				-    pmaddwd    xmm2, xmm2
			
 
				-    paddd      xmm0, xmm1
			
 
				-    paddd      xmm0, xmm2
			
 
				-    jg         wloop
			
 
				-
			
 
				-    pshufd     xmm1, xmm0, 0xee
			
 
				-    paddd      xmm0, xmm1
			
 
				-    pshufd     xmm1, xmm0, 0x01
			
 
				-    paddd      xmm0, xmm1
			
 
				-    movd       eax, xmm0
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Visual C 2012 required for AVX2.
			
 
				-#if _MSC_VER >= 1700
			
 
				-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
			
 
				-#pragma warning(disable: 4752)
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_a
			
 
				-    mov        edx, [esp + 8]    // src_b
			
 
				-    mov        ecx, [esp + 12]   // count
			
 
				-    vpxor      ymm0, ymm0, ymm0  // sum
			
 
				-    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
			
 
				-    sub        edx, eax
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    vmovdqu    ymm1, [eax]
			
 
				-    vmovdqu    ymm2, [eax + edx]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    sub        ecx, 32
			
 
				-    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
			
 
				-    vpsubusb   ymm2, ymm2, ymm1
			
 
				-    vpor       ymm1, ymm2, ymm3
			
 
				-    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
			
 
				-    vpunpckhbw ymm1, ymm1, ymm5
			
 
				-    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
			
 
				-    vpmaddwd   ymm1, ymm1, ymm1
			
 
				-    vpaddd     ymm0, ymm0, ymm1
			
 
				-    vpaddd     ymm0, ymm0, ymm2
			
 
				-    jg         wloop
			
 
				-
			
 
				-    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
			
 
				-    vpaddd     ymm0, ymm0, ymm1
			
 
				-    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
			
 
				-    vpaddd     ymm0, ymm0, ymm1
			
 
				-    vpermq     ymm1, ymm0, 0x02  // high + low lane.
			
 
				-    vpaddd     ymm0, ymm0, ymm1
			
 
				-    vmovd      eax, xmm0
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // _MSC_VER >= 1700
			
 
				-
			
 
				-#define HAS_HASHDJB2_SSE41
			
 
				-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
			
 
				-static uvec32 kHashMul0 = {
			
 
				-  0x0c3525e1,  // 33 ^ 15
			
 
				-  0xa3476dc1,  // 33 ^ 14
			
 
				-  0x3b4039a1,  // 33 ^ 13
			
 
				-  0x4f5f0981,  // 33 ^ 12
			
 
				-};
			
 
				-static uvec32 kHashMul1 = {
			
 
				-  0x30f35d61,  // 33 ^ 11
			
 
				-  0x855cb541,  // 33 ^ 10
			
 
				-  0x040a9121,  // 33 ^ 9
			
 
				-  0x747c7101,  // 33 ^ 8
			
 
				-};
			
 
				-static uvec32 kHashMul2 = {
			
 
				-  0xec41d4e1,  // 33 ^ 7
			
 
				-  0x4cfa3cc1,  // 33 ^ 6
			
 
				-  0x025528a1,  // 33 ^ 5
			
 
				-  0x00121881,  // 33 ^ 4
			
 
				-};
			
 
				-static uvec32 kHashMul3 = {
			
 
				-  0x00008c61,  // 33 ^ 3
			
 
				-  0x00000441,  // 33 ^ 2
			
 
				-  0x00000021,  // 33 ^ 1
			
 
				-  0x00000001,  // 33 ^ 0
			
 
				-};
			
 
				-
			
 
				-// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
			
 
				-// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
			
 
				-// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
			
 
				-// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
			
 
				-// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
			
 
				-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
			
 
				-    _asm _emit 0x40 _asm _emit reg
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src
			
 
				-    mov        ecx, [esp + 8]    // count
			
 
				-    movd       xmm0, [esp + 12]  // seed
			
 
				-
			
 
				-    pxor       xmm7, xmm7        // constant 0 for unpck
			
 
				-    movdqa     xmm6, kHash16x33
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqu     xmm1, [eax]       // src[0-15]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
			
 
				-    movdqa     xmm5, kHashMul0
			
 
				-    movdqa     xmm2, xmm1
			
 
				-    punpcklbw  xmm2, xmm7        // src[0-7]
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    punpcklwd  xmm3, xmm7        // src[0-3]
			
 
				-    pmulld(0xdd)                 // pmulld     xmm3, xmm5
			
 
				-    movdqa     xmm5, kHashMul1
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    punpckhwd  xmm4, xmm7        // src[4-7]
			
 
				-    pmulld(0xe5)                 // pmulld     xmm4, xmm5
			
 
				-    movdqa     xmm5, kHashMul2
			
 
				-    punpckhbw  xmm1, xmm7        // src[8-15]
			
 
				-    movdqa     xmm2, xmm1
			
 
				-    punpcklwd  xmm2, xmm7        // src[8-11]
			
 
				-    pmulld(0xd5)                 // pmulld     xmm2, xmm5
			
 
				-    movdqa     xmm5, kHashMul3
			
 
				-    punpckhwd  xmm1, xmm7        // src[12-15]
			
 
				-    pmulld(0xcd)                 // pmulld     xmm1, xmm5
			
 
				-    paddd      xmm3, xmm4        // add 16 results
			
 
				-    paddd      xmm1, xmm2
			
 
				-    sub        ecx, 16
			
 
				-    paddd      xmm1, xmm3
			
 
				-
			
 
				-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
			
 
				-    paddd      xmm1, xmm2
			
 
				-    pshufd     xmm2, xmm1, 0x01
			
 
				-    paddd      xmm1, xmm2
			
 
				-    paddd      xmm0, xmm1
			
 
				-    jg         wloop
			
 
				-
			
 
				-    movd       eax, xmm0         // return hash
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Visual C 2012 required for AVX2.
			
 
				-#if _MSC_VER >= 1700
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src
			
 
				-    mov        ecx, [esp + 8]    // count
			
 
				-    movd       xmm0, [esp + 12]  // seed
			
 
				-    movdqa     xmm6, kHash16x33
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
			
 
				-    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
			
 
				-    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
			
 
				-    pmulld     xmm3, kHashMul0
			
 
				-    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
			
 
				-    pmulld     xmm4, kHashMul1
			
 
				-    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
			
 
				-    pmulld     xmm2, kHashMul2
			
 
				-    lea        eax, [eax + 16]
			
 
				-    pmulld     xmm1, kHashMul3
			
 
				-    paddd      xmm3, xmm4        // add 16 results
			
 
				-    paddd      xmm1, xmm2
			
 
				-    sub        ecx, 16
			
 
				-    paddd      xmm1, xmm3
			
 
				-    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
			
 
				-    paddd      xmm1, xmm2
			
 
				-    pshufd     xmm2, xmm1, 0x01
			
 
				-    paddd      xmm1, xmm2
			
 
				-    paddd      xmm0, xmm1
			
 
				-    jg         wloop
			
 
				-
			
 
				-    movd       eax, xmm0         // return hash
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // _MSC_VER >= 1700
			
 
				-
			
 
				-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert.cc
@@ -1,1491 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-#include "libyuv/scale.h"  // For ScalePlane()
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
			
 
				-static __inline int Abs(int v) {
			
 
				-  return v >= 0 ? v : -v;
			
 
				-}
			
 
				-
			
 
				-// Any I4xx To I420 format with mirroring.
			
 
				-static int I4xxToI420(const uint8* src_y, int src_stride_y,
			
 
				-                      const uint8* src_u, int src_stride_u,
			
 
				-                      const uint8* src_v, int src_stride_v,
			
 
				-                      uint8* dst_y, int dst_stride_y,
			
 
				-                      uint8* dst_u, int dst_stride_u,
			
 
				-                      uint8* dst_v, int dst_stride_v,
			
 
				-                      int src_y_width, int src_y_height,
			
 
				-                      int src_uv_width, int src_uv_height) {
			
 
				-  if (src_y_width == 0 || src_y_height == 0 ||
			
 
				-      src_uv_width == 0 || src_uv_height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  const int dst_y_width = Abs(src_y_width);
			
 
				-  const int dst_y_height = Abs(src_y_height);
			
 
				-  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
			
 
				-  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
			
 
				-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
			
 
				-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
			
 
				-             kFilterBilinear);
			
 
				-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
			
 
				-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
			
 
				-             kFilterBilinear);
			
 
				-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
			
 
				-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
			
 
				-             kFilterBilinear);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Copy I420 with optional flipping
			
 
				-// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
			
 
				-// is does row coalescing.
			
 
				-LIBYUV_API
			
 
				-int I420Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    const int halfheight = (height + 1) >> 1;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (halfheight - 1) * src_stride_u;
			
 
				-    src_v = src_v + (halfheight - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-
			
 
				-  if (dst_y) {
			
 
				-    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  }
			
 
				-  // Copy UV planes.
			
 
				-  const int halfwidth = (width + 1) >> 1;
			
 
				-  const int halfheight = (height + 1) >> 1;
			
 
				-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
			
 
				-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// 422 chroma is 1/2 width, 1x height
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-LIBYUV_API
			
 
				-int I422ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  const int src_uv_width = SUBSAMPLE(width, 1, 1);
			
 
				-  return I4xxToI420(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    src_uv_width, height);
			
 
				-}
			
 
				-
			
 
				-// 444 chroma is 1x width, 1x height
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-LIBYUV_API
			
 
				-int I444ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  return I4xxToI420(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    width, height);
			
 
				-}
			
 
				-
			
 
				-// 411 chroma is 1/4 width, 1x height
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-LIBYUV_API
			
 
				-int I411ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
			
 
				-  return I4xxToI420(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    src_uv_width, height);
			
 
				-}
			
 
				-
			
 
				-// I400 is greyscale typically used in MJPG
			
 
				-LIBYUV_API
			
 
				-int I400ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
			
 
				-  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
			
 
				-                       uint8* dst, int dst_stride,
			
 
				-                       int width, int height) {
			
 
				-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
			
 
				-#if defined(HAS_COPYROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
			
 
				-    CopyRow = CopyRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
			
 
				-      IS_ALIGNED(src, 16) &&
			
 
				-      IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    CopyRow = CopyRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_ERMS)
			
 
				-  if (TestCpuFlag(kCpuHasERMS)) {
			
 
				-    CopyRow = CopyRow_ERMS;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
			
 
				-    CopyRow = CopyRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_MIPS)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS)) {
			
 
				-    CopyRow = CopyRow_MIPS;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Copy plane
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    CopyRow(src, dst, width);
			
 
				-    CopyRow(src + src_stride_0, dst + dst_stride, width);
			
 
				-    src += src_stride_0 + src_stride_1;
			
 
				-    dst += dst_stride * 2;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    CopyRow(src, dst, width);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Support converting from FOURCC_M420
			
 
				-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
			
 
				-// easy conversion to I420.
			
 
				-// M420 format description:
			
 
				-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
			
 
				-// Chroma is half width / half height. (420)
			
 
				-// src_stride_m420 is row planar. Normally this will be the width in pixels.
			
 
				-//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
			
 
				-//   this as well as the two Y planes.
			
 
				-static int X420ToI420(const uint8* src_y,
			
 
				-                      int src_stride_y0, int src_stride_y1,
			
 
				-                      const uint8* src_uv, int src_stride_uv,
			
 
				-                      uint8* dst_y, int dst_stride_y,
			
 
				-                      uint8* dst_u, int dst_stride_u,
			
 
				-                      uint8* dst_v, int dst_stride_v,
			
 
				-                      int width, int height) {
			
 
				-  if (!src_y || !src_uv ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    int halfheight = (height + 1) >> 1;
			
 
				-    dst_y = dst_y + (height - 1) * dst_stride_y;
			
 
				-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
			
 
				-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
			
 
				-    dst_stride_y = -dst_stride_y;
			
 
				-    dst_stride_u = -dst_stride_u;
			
 
				-    dst_stride_v = -dst_stride_v;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  if (src_stride_y0 == width &&
			
 
				-      src_stride_y1 == width &&
			
 
				-      dst_stride_y == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_uv == halfwidth * 2 &&
			
 
				-      dst_stride_u == halfwidth &&
			
 
				-      dst_stride_v == halfwidth) {
			
 
				-    halfwidth *= halfheight;
			
 
				-    halfheight = 1;
			
 
				-    src_stride_uv = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
			
 
				-      SplitUVRow_C;
			
 
				-#if defined(HAS_SPLITUVROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
			
 
				-    SplitUVRow = SplitUVRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      SplitUVRow = SplitUVRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
			
 
				-          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
			
 
				-          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
			
 
				-        SplitUVRow = SplitUVRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SPLITUVROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
			
 
				-    SplitUVRow = SplitUVRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(halfwidth, 32)) {
			
 
				-      SplitUVRow = SplitUVRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SPLITUVROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
			
 
				-    SplitUVRow = SplitUVRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      SplitUVRow = SplitUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
			
 
				-    SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
			
 
				-      if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
			
 
				-          IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
			
 
				-          IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
			
 
				-        SplitUVRow = SplitUVRow_MIPS_DSPR2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (dst_y) {
			
 
				-    if (src_stride_y0 == src_stride_y1) {
			
 
				-      CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
			
 
				-    } else {
			
 
				-      CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
			
 
				-                 width, height);
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  for (int y = 0; y < halfheight; ++y) {
			
 
				-    // Copy a row of UV.
			
 
				-    SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-    src_uv += src_stride_uv;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert NV12 to I420.
			
 
				-LIBYUV_API
			
 
				-int NV12ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_uv, int src_stride_uv,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  return X420ToI420(src_y, src_stride_y, src_stride_y,
			
 
				-                    src_uv, src_stride_uv,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
			
 
				-LIBYUV_API
			
 
				-int NV21ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_vu, int src_stride_vu,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  return X420ToI420(src_y, src_stride_y, src_stride_y,
			
 
				-                    src_vu, src_stride_vu,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert M420 to I420.
			
 
				-LIBYUV_API
			
 
				-int M420ToI420(const uint8* src_m420, int src_stride_m420,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
			
 
				-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert Q420 to I420.
			
 
				-// Format is rows of YY/YUYV
			
 
				-LIBYUV_API
			
 
				-int Q420ToI420(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_yuy2 ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    int halfheight = (height + 1) >> 1;
			
 
				-    dst_y = dst_y + (height - 1) * dst_stride_y;
			
 
				-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
			
 
				-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
			
 
				-    dst_stride_y = -dst_stride_y;
			
 
				-    dst_stride_u = -dst_stride_u;
			
 
				-    dst_stride_v = -dst_stride_v;
			
 
				-  }
			
 
				-  // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
			
 
				-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
			
 
				-#if defined(HAS_COPYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
			
 
				-    CopyRow = CopyRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_X86)
			
 
				-  if (IS_ALIGNED(width, 4)) {
			
 
				-    CopyRow = CopyRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
			
 
				-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
			
 
				-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-    CopyRow = CopyRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_ERMS)
			
 
				-  if (TestCpuFlag(kCpuHasERMS)) {
			
 
				-    CopyRow = CopyRow_ERMS;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_MIPS)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS)) {
			
 
				-    CopyRow = CopyRow_MIPS;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
			
 
				-      int pix) = YUY2ToUV422Row_C;
			
 
				-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
			
 
				-      YUY2ToYRow_C;
			
 
				-#if defined(HAS_YUY2TOYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
			
 
				-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          YUY2ToYRow = YUY2ToYRow_SSE2;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
			
 
				-    if (width >= 16) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
			
 
				-    }
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToYRow = YUY2ToYRow_NEON;
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    CopyRow(src_y, dst_y, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    dst_y += dst_stride_y;
			
 
				-
			
 
				-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
			
 
				-    YUY2ToYRow(src_yuy2, dst_y, width);
			
 
				-    src_yuy2 += src_stride_yuy2;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    CopyRow(src_y, dst_y, width);
			
 
				-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert YUY2 to I420.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
			
 
				-    src_stride_yuy2 = -src_stride_yuy2;
			
 
				-  }
			
 
				-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-  void (*YUY2ToYRow)(const uint8* src_yuy2,
			
 
				-                     uint8* dst_y, int pix);
			
 
				-  YUY2ToYRow = YUY2ToYRow_C;
			
 
				-  YUY2ToUVRow = YUY2ToUVRow_C;
			
 
				-#if defined(HAS_YUY2TOYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
			
 
				-        YUY2ToUVRow = YUY2ToUVRow_SSE2;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          YUY2ToYRow = YUY2ToYRow_SSE2;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      YUY2ToUVRow = YUY2ToUVRow_AVX2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
			
 
				-    if (width >= 16) {
			
 
				-      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
			
 
				-    }
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToYRow = YUY2ToYRow_NEON;
			
 
				-      YUY2ToUVRow = YUY2ToUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
			
 
				-    YUY2ToYRow(src_yuy2, dst_y, width);
			
 
				-    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
			
 
				-    src_yuy2 += src_stride_yuy2 * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
			
 
				-    YUY2ToYRow(src_yuy2, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert UYVY to I420.
			
 
				-LIBYUV_API
			
 
				-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
			
 
				-    src_stride_uyvy = -src_stride_uyvy;
			
 
				-  }
			
 
				-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix);
			
 
				-  void (*UYVYToYRow)(const uint8* src_uyvy,
			
 
				-                     uint8* dst_y, int pix);
			
 
				-  UYVYToYRow = UYVYToYRow_C;
			
 
				-  UYVYToUVRow = UYVYToUVRow_C;
			
 
				-#if defined(HAS_UYVYTOYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
			
 
				-    UYVYToYRow = UYVYToYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
			
 
				-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
			
 
				-        UYVYToUVRow = UYVYToUVRow_SSE2;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          UYVYToYRow = UYVYToYRow_SSE2;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_UYVYTOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
			
 
				-    UYVYToYRow = UYVYToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      UYVYToUVRow = UYVYToUVRow_AVX2;
			
 
				-      UYVYToYRow = UYVYToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_UYVYTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    UYVYToYRow = UYVYToYRow_Any_NEON;
			
 
				-    if (width >= 16) {
			
 
				-      UYVYToUVRow = UYVYToUVRow_Any_NEON;
			
 
				-    }
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      UYVYToYRow = UYVYToYRow_NEON;
			
 
				-      UYVYToUVRow = UYVYToUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
			
 
				-    UYVYToYRow(src_uyvy, dst_y, width);
			
 
				-    UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
			
 
				-    src_uyvy += src_stride_uyvy * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
			
 
				-    UYVYToYRow(src_uyvy, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
			
 
				-    ARGBToYRow = ARGBToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_AVX2;
			
 
				-      ARGBToYRow = ARGBToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
			
 
				-    src_argb += src_stride_argb * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert BGRA to I420.
			
 
				-LIBYUV_API
			
 
				-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_bgra ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
			
 
				-    src_stride_bgra = -src_stride_bgra;
			
 
				-  }
			
 
				-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
			
 
				-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
			
 
				-      BGRAToYRow_C;
			
 
				-#if defined(HAS_BGRATOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
			
 
				-    BGRAToYRow = BGRAToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
			
 
				-      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
			
 
				-        BGRAToUVRow = BGRAToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          BGRAToYRow = BGRAToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_BGRATOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    BGRAToYRow = BGRAToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      BGRAToYRow = BGRAToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        BGRAToUVRow = BGRAToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
			
 
				-    BGRAToYRow(src_bgra, dst_y, width);
			
 
				-    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
			
 
				-    src_bgra += src_stride_bgra * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
			
 
				-    BGRAToYRow(src_bgra, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ABGR to I420.
			
 
				-LIBYUV_API
			
 
				-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_abgr ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
			
 
				-    src_stride_abgr = -src_stride_abgr;
			
 
				-  }
			
 
				-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
			
 
				-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
			
 
				-      ABGRToYRow_C;
			
 
				-#if defined(HAS_ABGRTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
			
 
				-    ABGRToYRow = ABGRToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
			
 
				-      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
			
 
				-        ABGRToUVRow = ABGRToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          ABGRToYRow = ABGRToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ABGRTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ABGRToYRow = ABGRToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ABGRToYRow = ABGRToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ABGRToUVRow = ABGRToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ABGRToUVRow = ABGRToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
			
 
				-    ABGRToYRow(src_abgr, dst_y, width);
			
 
				-    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
			
 
				-    src_abgr += src_stride_abgr * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
			
 
				-    ABGRToYRow(src_abgr, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RGBA to I420.
			
 
				-LIBYUV_API
			
 
				-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_rgba ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
			
 
				-    src_stride_rgba = -src_stride_rgba;
			
 
				-  }
			
 
				-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
			
 
				-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
			
 
				-      RGBAToYRow_C;
			
 
				-#if defined(HAS_RGBATOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
			
 
				-    RGBAToYRow = RGBAToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
			
 
				-      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
			
 
				-        RGBAToUVRow = RGBAToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          RGBAToYRow = RGBAToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_RGBATOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RGBAToYRow = RGBAToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGBAToYRow = RGBAToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      RGBAToUVRow = RGBAToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        RGBAToUVRow = RGBAToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
			
 
				-    RGBAToYRow(src_rgba, dst_y, width);
			
 
				-    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
			
 
				-    src_rgba += src_stride_rgba * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
			
 
				-    RGBAToYRow(src_rgba, dst_y, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RGB24 to I420.
			
 
				-LIBYUV_API
			
 
				-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                uint8* dst_y, int dst_stride_y,
			
 
				-                uint8* dst_u, int dst_stride_u,
			
 
				-                uint8* dst_v, int dst_stride_v,
			
 
				-                int width, int height) {
			
 
				-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
			
 
				-    src_stride_rgb24 = -src_stride_rgb24;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_RGB24TOYROW_NEON)
			
 
				-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
			
 
				-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
			
 
				-      RGB24ToYRow_C;
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RGB24ToYRow = RGB24ToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB24ToYRow = RGB24ToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        RGB24ToUVRow = RGB24ToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#else  // HAS_RGB24TOYROW_NEON
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      RGB24ToARGBRow_C;
			
 
				-#if defined(HAS_RGB24TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-#endif  // HAS_RGB24TOYROW_NEON
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-#if defined(HAS_RGB24TOYROW_NEON)
			
 
				-    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
			
 
				-    RGB24ToYRow(src_rgb24, dst_y, width);
			
 
				-    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
			
 
				-#else
			
 
				-    RGB24ToARGBRow(src_rgb24, row, width);
			
 
				-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
			
 
				-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-#endif
			
 
				-    src_rgb24 += src_stride_rgb24 * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-#if defined(HAS_RGB24TOYROW_NEON)
			
 
				-    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
			
 
				-    RGB24ToYRow(src_rgb24, dst_y, width);
			
 
				-#else
			
 
				-    RGB24ToARGBRow(src_rgb24, row, width);
			
 
				-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-#endif
			
 
				-  }
			
 
				-#if !defined(HAS_RGB24TOYROW_NEON)
			
 
				-  free_aligned_buffer_64(row);
			
 
				-#endif
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RAW to I420.
			
 
				-LIBYUV_API
			
 
				-int RAWToI420(const uint8* src_raw, int src_stride_raw,
			
 
				-              uint8* dst_y, int dst_stride_y,
			
 
				-              uint8* dst_u, int dst_stride_u,
			
 
				-              uint8* dst_v, int dst_stride_v,
			
 
				-              int width, int height) {
			
 
				-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_raw = src_raw + (height - 1) * src_stride_raw;
			
 
				-    src_stride_raw = -src_stride_raw;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_RAWTOYROW_NEON)
			
 
				-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
			
 
				-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
			
 
				-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
			
 
				-      RAWToYRow_C;
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RAWToYRow = RAWToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RAWToYRow = RAWToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      RAWToUVRow = RAWToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        RAWToUVRow = RAWToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#else  // HAS_RAWTOYROW_NEON
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      RAWToARGBRow_C;
			
 
				-#if defined(HAS_RAWTOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      RAWToARGBRow = RAWToARGBRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-#endif  // HAS_RAWTOYROW_NEON
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-#if defined(HAS_RAWTOYROW_NEON)
			
 
				-    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
			
 
				-    RAWToYRow(src_raw, dst_y, width);
			
 
				-    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
			
 
				-#else
			
 
				-    RAWToARGBRow(src_raw, row, width);
			
 
				-    RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
			
 
				-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-#endif
			
 
				-    src_raw += src_stride_raw * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-#if defined(HAS_RAWTOYROW_NEON)
			
 
				-    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
			
 
				-    RAWToYRow(src_raw, dst_y, width);
			
 
				-#else
			
 
				-    RAWToARGBRow(src_raw, row, width);
			
 
				-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-#endif
			
 
				-  }
			
 
				-#if !defined(HAS_RAWTOYROW_NEON)
			
 
				-  free_aligned_buffer_64(row);
			
 
				-#endif
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RGB565 to I420.
			
 
				-LIBYUV_API
			
 
				-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                uint8* dst_y, int dst_stride_y,
			
 
				-                uint8* dst_u, int dst_stride_u,
			
 
				-                uint8* dst_v, int dst_stride_v,
			
 
				-                int width, int height) {
			
 
				-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
			
 
				-    src_stride_rgb565 = -src_stride_rgb565;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_RGB565TOYROW_NEON)
			
 
				-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
			
 
				-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
			
 
				-      RGB565ToYRow_C;
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RGB565ToYRow = RGB565ToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB565ToYRow = RGB565ToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        RGB565ToUVRow = RGB565ToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#else  // HAS_RGB565TOYROW_NEON
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      RGB565ToARGBRow_C;
			
 
				-#if defined(HAS_RGB565TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
			
 
				-    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-#endif  // HAS_RGB565TOYROW_NEON
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-#if defined(HAS_RGB565TOYROW_NEON)
			
 
				-    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
			
 
				-    RGB565ToYRow(src_rgb565, dst_y, width);
			
 
				-    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
			
 
				-#else
			
 
				-    RGB565ToARGBRow(src_rgb565, row, width);
			
 
				-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
			
 
				-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-#endif
			
 
				-    src_rgb565 += src_stride_rgb565 * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-#if defined(HAS_RGB565TOYROW_NEON)
			
 
				-    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
			
 
				-    RGB565ToYRow(src_rgb565, dst_y, width);
			
 
				-#else
			
 
				-    RGB565ToARGBRow(src_rgb565, row, width);
			
 
				-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-#endif
			
 
				-  }
			
 
				-#if !defined(HAS_RGB565TOYROW_NEON)
			
 
				-  free_aligned_buffer_64(row);
			
 
				-#endif
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB1555 to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                   uint8* dst_y, int dst_stride_y,
			
 
				-                   uint8* dst_u, int dst_stride_u,
			
 
				-                   uint8* dst_v, int dst_stride_v,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
			
 
				-    src_stride_argb1555 = -src_stride_argb1555;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_ARGB1555TOYROW_NEON)
			
 
				-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
			
 
				-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
			
 
				-      ARGB1555ToYRow_C;
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB1555ToYRow = ARGB1555ToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#else  // HAS_ARGB1555TOYROW_NEON
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      ARGB1555ToARGBRow_C;
			
 
				-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
			
 
				-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-#endif  // HAS_ARGB1555TOYROW_NEON
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-#if defined(HAS_ARGB1555TOYROW_NEON)
			
 
				-    ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
			
 
				-    ARGB1555ToYRow(src_argb1555, dst_y, width);
			
 
				-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
			
 
				-                   width);
			
 
				-#else
			
 
				-    ARGB1555ToARGBRow(src_argb1555, row, width);
			
 
				-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
			
 
				-                      width);
			
 
				-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-#endif
			
 
				-    src_argb1555 += src_stride_argb1555 * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-#if defined(HAS_ARGB1555TOYROW_NEON)
			
 
				-    ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
			
 
				-    ARGB1555ToYRow(src_argb1555, dst_y, width);
			
 
				-#else
			
 
				-    ARGB1555ToARGBRow(src_argb1555, row, width);
			
 
				-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-#endif
			
 
				-  }
			
 
				-#if !defined(HAS_ARGB1555TOYROW_NEON)
			
 
				-  free_aligned_buffer_64(row);
			
 
				-#endif
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB4444 to I420.
			
 
				-LIBYUV_API
			
 
				-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                   uint8* dst_y, int dst_stride_y,
			
 
				-                   uint8* dst_u, int dst_stride_u,
			
 
				-                   uint8* dst_v, int dst_stride_v,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
			
 
				-    src_stride_argb4444 = -src_stride_argb4444;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_ARGB4444TOYROW_NEON)
			
 
				-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
			
 
				-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
			
 
				-      ARGB4444ToYRow_C;
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB4444ToYRow = ARGB4444ToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#else  // HAS_ARGB4444TOYROW_NEON
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      ARGB4444ToARGBRow_C;
			
 
				-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
			
 
				-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-#endif  // HAS_ARGB4444TOYROW_NEON
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-#if defined(HAS_ARGB4444TOYROW_NEON)
			
 
				-    ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
			
 
				-    ARGB4444ToYRow(src_argb4444, dst_y, width);
			
 
				-    ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
			
 
				-                   width);
			
 
				-#else
			
 
				-    ARGB4444ToARGBRow(src_argb4444, row, width);
			
 
				-    ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
			
 
				-                      width);
			
 
				-    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-#endif
			
 
				-    src_argb4444 += src_stride_argb4444 * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-#if defined(HAS_ARGB4444TOYROW_NEON)
			
 
				-    ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
			
 
				-    ARGB4444ToYRow(src_argb4444, dst_y, width);
			
 
				-#else
			
 
				-    ARGB4444ToARGBRow(src_argb4444, row, width);
			
 
				-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(row, dst_y, width);
			
 
				-#endif
			
 
				-  }
			
 
				-#if !defined(HAS_ARGB4444TOYROW_NEON)
			
 
				-  free_aligned_buffer_64(row);
			
 
				-#endif
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_argb.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_argb.cc
@@ -1,901 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert_argb.h"
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#ifdef HAVE_JPEG
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-#endif
			
 
				-#include "libyuv/rotate_argb.h"
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/video_common.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Copy ARGB with optional flipping
			
 
				-LIBYUV_API
			
 
				-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int width, int height) {
			
 
				-  if (!src_argb || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-
			
 
				-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
			
 
				-            width * 4, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I444 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I444ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u == width &&
			
 
				-      src_stride_v == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*I444ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I444ToARGBRow_C;
			
 
				-#if defined(HAS_I444TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I444ToARGBRow = I444ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I444TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I444ToARGBRow = I444ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I444ToARGBRow = I444ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I422 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I422ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*I422ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToARGBRow_C;
			
 
				-#if defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I422ToARGBRow = I422ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I411 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I411ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 4 == width &&
			
 
				-      src_stride_v * 4 == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*I411ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I411ToARGBRow_C;
			
 
				-#if defined(HAS_I411TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I411ToARGBRow = I411ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I411TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I411ToARGBRow = I411ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I400 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
			
 
				-                         uint8* dst_argb, int dst_stride_argb,
			
 
				-                         int width, int height) {
			
 
				-  if (!src_y || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*YToARGBRow)(const uint8* y_buf,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) = YToARGBRow_C;
			
 
				-#if defined(HAS_YTOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    YToARGBRow = YToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      YToARGBRow = YToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_YTOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    YToARGBRow = YToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      YToARGBRow = YToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    YToARGBRow(src_y, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I400 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I400ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
			
 
				-      I400ToARGBRow_C;
			
 
				-#if defined(HAS_I400TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
			
 
				-    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I400ToARGBRow = I400ToARGBRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I400TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I400ToARGBRow = I400ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I400ToARGBRow = I400ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I400ToARGBRow(src_y, dst_argb, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Shuffle table for converting BGRA to ARGB.
			
 
				-static uvec8 kShuffleMaskBGRAToARGB = {
			
 
				-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ABGR to ARGB.
			
 
				-static uvec8 kShuffleMaskABGRToARGB = {
			
 
				-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting RGBA to ARGB.
			
 
				-static uvec8 kShuffleMaskRGBAToARGB = {
			
 
				-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
			
 
				-};
			
 
				-
			
 
				-// Convert BGRA to ARGB.
			
 
				-LIBYUV_API
			
 
				-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  return ARGBShuffle(src_bgra, src_stride_bgra,
			
 
				-                     dst_argb, dst_stride_argb,
			
 
				-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
			
 
				-                     width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert ABGR to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  return ARGBShuffle(src_abgr, src_stride_abgr,
			
 
				-                     dst_argb, dst_stride_argb,
			
 
				-                     (const uint8*)(&kShuffleMaskABGRToARGB),
			
 
				-                     width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert RGBA to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  return ARGBShuffle(src_rgba, src_stride_rgba,
			
 
				-                     dst_argb, dst_stride_argb,
			
 
				-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
			
 
				-                     width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert RGB24 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height) {
			
 
				-  if (!src_rgb24 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
			
 
				-    src_stride_rgb24 = -src_stride_rgb24;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_rgb24 == width * 3 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_rgb24 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      RGB24ToARGBRow_C;
			
 
				-#if defined(HAS_RGB24TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_RGB24TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    RGB24ToARGBRow(src_rgb24, dst_argb, width);
			
 
				-    src_rgb24 += src_stride_rgb24;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RAW to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height) {
			
 
				-  if (!src_raw || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_raw = src_raw + (height - 1) * src_stride_raw;
			
 
				-    src_stride_raw = -src_stride_raw;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_raw == width * 3 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_raw = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
			
 
				-      RAWToARGBRow_C;
			
 
				-#if defined(HAS_RAWTOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      RAWToARGBRow = RAWToARGBRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_RAWTOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RAWToARGBRow = RAWToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RAWToARGBRow = RAWToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    RAWToARGBRow(src_raw, dst_argb, width);
			
 
				-    src_raw += src_stride_raw;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert RGB565 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height) {
			
 
				-  if (!src_rgb565 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
			
 
				-    src_stride_rgb565 = -src_stride_rgb565;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_rgb565 == width * 2 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_rgb565 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
			
 
				-      RGB565ToARGBRow_C;
			
 
				-#if defined(HAS_RGB565TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_RGB565TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    RGB565ToARGBRow(src_rgb565, dst_argb, width);
			
 
				-    src_rgb565 += src_stride_rgb565;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB1555 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb1555 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
			
 
				-    src_stride_argb1555 = -src_stride_argb1555;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb1555 == width * 2 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb1555 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                            int pix) = ARGB1555ToARGBRow_C;
			
 
				-#if defined(HAS_ARGB1555TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
			
 
				-    src_argb1555 += src_stride_argb1555;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB4444 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb4444 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
			
 
				-    src_stride_argb4444 = -src_stride_argb4444;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb4444 == width * 2 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb4444 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                            int pix) = ARGB4444ToARGBRow_C;
			
 
				-#if defined(HAS_ARGB4444TOARGBROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
			
 
				-    src_argb4444 += src_stride_argb4444;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert NV12 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int NV12ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_uv, int src_stride_uv,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_uv || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  void (*NV12ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* uv_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = NV12ToARGBRow_C;
			
 
				-#if defined(HAS_NV12TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_NV12TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToARGBRow = NV12ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    NV12ToARGBRow(src_y, src_uv, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_uv += src_stride_uv;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert NV21 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int NV21ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_uv, int src_stride_uv,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_uv || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  void (*NV21ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* uv_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = NV21ToARGBRow_C;
			
 
				-#if defined(HAS_NV21TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        NV21ToARGBRow = NV21ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_NV21TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV21ToARGBRow = NV21ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_uv += src_stride_uv;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert M420 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_m420 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  void (*NV12ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* uv_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = NV12ToARGBRow_C;
			
 
				-#if defined(HAS_NV12TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        NV12ToARGBRow = NV12ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_NV12TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToARGBRow = NV12ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
			
 
				-    NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
			
 
				-                  dst_argb + dst_stride_argb, width);
			
 
				-    dst_argb += dst_stride_argb * 2;
			
 
				-    src_m420 += src_stride_m420 * 3;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert YUY2 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_yuy2 || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
			
 
				-    src_stride_yuy2 = -src_stride_yuy2;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_yuy2 == width * 2 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_yuy2 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
			
 
				-      YUY2ToARGBRow_C;
			
 
				-#if defined(HAS_YUY2TOARGBROW_SSSE3)
			
 
				-  // Posix is 16, Windows is 8.
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_YUY2TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      YUY2ToARGBRow = YUY2ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    YUY2ToARGBRow(src_yuy2, dst_argb, width);
			
 
				-    src_yuy2 += src_stride_yuy2;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert UYVY to ARGB.
			
 
				-LIBYUV_API
			
 
				-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_uyvy || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
			
 
				-    src_stride_uyvy = -src_stride_uyvy;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_uyvy == width * 2 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_uyvy = dst_stride_argb = 0;
			
 
				-  }
			
 
				-  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
			
 
				-      UYVYToARGBRow_C;
			
 
				-#if defined(HAS_UYVYTOARGBROW_SSSE3)
			
 
				-  // Posix is 16, Windows is 8.
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        UYVYToARGBRow = UYVYToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_UYVYTOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      UYVYToARGBRow = UYVYToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    UYVYToARGBRow(src_uyvy, dst_argb, width);
			
 
				-    src_uyvy += src_stride_uyvy;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_from.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_from.cc
@@ -1,1196 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert_from.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/convert.h"  // For I420Copy
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/rotate.h"
			
 
				-#include "libyuv/scale.h"  // For ScalePlane()
			
 
				-#include "libyuv/video_common.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
			
 
				-static __inline int Abs(int v) {
			
 
				-  return v >= 0 ? v : -v;
			
 
				-}
			
 
				-
			
 
				-// I420 To any I4xx YUV format with mirroring.
			
 
				-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
			
 
				-                      const uint8* src_u, int src_stride_u,
			
 
				-                      const uint8* src_v, int src_stride_v,
			
 
				-                      uint8* dst_y, int dst_stride_y,
			
 
				-                      uint8* dst_u, int dst_stride_u,
			
 
				-                      uint8* dst_v, int dst_stride_v,
			
 
				-                      int src_y_width, int src_y_height,
			
 
				-                      int dst_uv_width, int dst_uv_height) {
			
 
				-  if (src_y_width == 0 || src_y_height == 0 ||
			
 
				-      dst_uv_width <= 0 || dst_uv_height <= 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  const int dst_y_width = Abs(src_y_width);
			
 
				-  const int dst_y_height = Abs(src_y_height);
			
 
				-  const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
			
 
				-  const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
			
 
				-  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
			
 
				-             dst_y, dst_stride_y, dst_y_width, dst_y_height,
			
 
				-             kFilterBilinear);
			
 
				-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
			
 
				-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
			
 
				-             kFilterBilinear);
			
 
				-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
			
 
				-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
			
 
				-             kFilterBilinear);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-// 422 chroma is 1/2 width, 1x height
			
 
				-LIBYUV_API
			
 
				-int I420ToI422(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  const int dst_uv_width = (Abs(width) + 1) >> 1;
			
 
				-  const int dst_uv_height = Abs(height);
			
 
				-  return I420ToI4xx(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    dst_uv_width, dst_uv_height);
			
 
				-}
			
 
				-
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-// 444 chroma is 1x width, 1x height
			
 
				-LIBYUV_API
			
 
				-int I420ToI444(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  const int dst_uv_width = Abs(width);
			
 
				-  const int dst_uv_height = Abs(height);
			
 
				-  return I420ToI4xx(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    dst_uv_width, dst_uv_height);
			
 
				-}
			
 
				-
			
 
				-// 420 chroma is 1/2 width, 1/2 height
			
 
				-// 411 chroma is 1/4 width, 1x height
			
 
				-LIBYUV_API
			
 
				-int I420ToI411(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  const int dst_uv_width = (Abs(width) + 3) >> 2;
			
 
				-  const int dst_uv_height = Abs(height);
			
 
				-  return I420ToI4xx(src_y, src_stride_y,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    width, height,
			
 
				-                    dst_uv_width, dst_uv_height);
			
 
				-}
			
 
				-
			
 
				-// Copy to I400. Source can be I420,422,444,400,NV12,NV21
			
 
				-LIBYUV_API
			
 
				-int I400Copy(const uint8* src_y, int src_stride_y,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             int width, int height) {
			
 
				-  if (!src_y || !dst_y ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I422ToYUY2(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_yuy2, int dst_stride_yuy2,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
			
 
				-    dst_stride_yuy2 = -dst_stride_yuy2;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_yuy2 == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
			
 
				-  }
			
 
				-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_yuy2, int width) =
			
 
				-      I422ToYUY2Row_C;
			
 
				-#if defined(HAS_I422TOYUY2ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOYUY2ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-    dst_yuy2 += dst_stride_yuy2;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToYUY2(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_yuy2, int dst_stride_yuy2,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
			
 
				-    dst_stride_yuy2 = -dst_stride_yuy2;
			
 
				-  }
			
 
				-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_yuy2, int width) =
			
 
				-      I422ToYUY2Row_C;
			
 
				-#if defined(HAS_I422TOYUY2ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOYUY2ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
			
 
				-    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
			
 
				-                  dst_yuy2 + dst_stride_yuy2, width);
			
 
				-    src_y += src_stride_y * 2;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-    dst_yuy2 += dst_stride_yuy2 * 2;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I422ToUYVY(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_uyvy, int dst_stride_uyvy,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
			
 
				-    dst_stride_uyvy = -dst_stride_uyvy;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_uyvy == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
			
 
				-  }
			
 
				-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_uyvy, int width) =
			
 
				-      I422ToUYVYRow_C;
			
 
				-#if defined(HAS_I422TOUYVYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOUYVYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-    dst_uyvy += dst_stride_uyvy;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToUYVY(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_uyvy, int dst_stride_uyvy,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
			
 
				-    dst_stride_uyvy = -dst_stride_uyvy;
			
 
				-  }
			
 
				-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_uyvy, int width) =
			
 
				-      I422ToUYVYRow_C;
			
 
				-#if defined(HAS_I422TOUYVYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOUYVYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
			
 
				-    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
			
 
				-                  dst_uyvy + dst_stride_uyvy, width);
			
 
				-    src_y += src_stride_y * 2;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-    dst_uyvy += dst_stride_uyvy * 2;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToNV12(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_uv, int dst_stride_uv,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    int halfheight = (height + 1) >> 1;
			
 
				-    dst_y = dst_y + (height - 1) * dst_stride_y;
			
 
				-    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
			
 
				-    dst_stride_y = -dst_stride_y;
			
 
				-    dst_stride_uv = -dst_stride_uv;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  if (src_stride_y == width &&
			
 
				-      dst_stride_y == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = dst_stride_y = 0;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_u == halfwidth &&
			
 
				-      src_stride_v == halfwidth &&
			
 
				-      dst_stride_uv == halfwidth * 2) {
			
 
				-    halfwidth *= halfheight;
			
 
				-    halfheight = 1;
			
 
				-    src_stride_u = src_stride_v = dst_stride_uv = 0;
			
 
				-  }
			
 
				-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                      int width) = MergeUVRow_C;
			
 
				-#if defined(HAS_MERGEUVROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
			
 
				-          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
			
 
				-          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
			
 
				-        MergeUVRow_ = MergeUVRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(halfwidth, 32)) {
			
 
				-      MergeUVRow_ = MergeUVRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  for (int y = 0; y < halfheight; ++y) {
			
 
				-    // Merge a row of U and V into a row of UV.
			
 
				-    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-    dst_uv += dst_stride_uv;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420ToNV21(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_vu, int dst_stride_vu,
			
 
				-               int width, int height) {
			
 
				-  return I420ToNV12(src_y, src_stride_y,
			
 
				-                    src_v, src_stride_v,
			
 
				-                    src_u, src_stride_u,
			
 
				-                    dst_y, src_stride_y,
			
 
				-                    dst_vu, dst_stride_vu,
			
 
				-                    width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to ARGB.
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  void (*I422ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToARGBRow_C;
			
 
				-#if defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I422ToARGBRow = I422ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to BGRA.
			
 
				-LIBYUV_API
			
 
				-int I420ToBGRA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_bgra, int dst_stride_bgra,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_bgra ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
			
 
				-    dst_stride_bgra = -dst_stride_bgra;
			
 
				-  }
			
 
				-  void (*I422ToBGRARow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToBGRARow_C;
			
 
				-#if defined(HAS_I422TOBGRAROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
			
 
				-        I422ToBGRARow = I422ToBGRARow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOBGRAROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToBGRARow = I422ToBGRARow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
			
 
				-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
			
 
				-    dst_bgra += dst_stride_bgra;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to ABGR.
			
 
				-LIBYUV_API
			
 
				-int I420ToABGR(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_abgr, int dst_stride_abgr,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_abgr ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
			
 
				-    dst_stride_abgr = -dst_stride_abgr;
			
 
				-  }
			
 
				-  void (*I422ToABGRRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToABGRRow_C;
			
 
				-#if defined(HAS_I422TOABGRROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
			
 
				-        I422ToABGRRow = I422ToABGRRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOABGRROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToABGRRow = I422ToABGRRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
			
 
				-    dst_abgr += dst_stride_abgr;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to RGBA.
			
 
				-LIBYUV_API
			
 
				-int I420ToRGBA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_rgba, int dst_stride_rgba,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_rgba ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
			
 
				-    dst_stride_rgba = -dst_stride_rgba;
			
 
				-  }
			
 
				-  void (*I422ToRGBARow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToRGBARow_C;
			
 
				-#if defined(HAS_I422TORGBAROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
			
 
				-        I422ToRGBARow = I422ToRGBARow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TORGBAROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGBARow = I422ToRGBARow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
			
 
				-    dst_rgba += dst_stride_rgba;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to RGB24.
			
 
				-LIBYUV_API
			
 
				-int I420ToRGB24(const uint8* src_y, int src_stride_y,
			
 
				-                const uint8* src_u, int src_stride_u,
			
 
				-                const uint8* src_v, int src_stride_v,
			
 
				-                uint8* dst_rgb24, int dst_stride_rgb24,
			
 
				-                int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
			
 
				-    dst_stride_rgb24 = -dst_stride_rgb24;
			
 
				-  }
			
 
				-  void (*I422ToRGB24Row)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToRGB24Row_C;
			
 
				-#if defined(HAS_I422TORGB24ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGB24Row = I422ToRGB24Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TORGB24ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGB24Row = I422ToRGB24Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
			
 
				-    dst_rgb24 += dst_stride_rgb24;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to RAW.
			
 
				-LIBYUV_API
			
 
				-int I420ToRAW(const uint8* src_y, int src_stride_y,
			
 
				-                const uint8* src_u, int src_stride_u,
			
 
				-                const uint8* src_v, int src_stride_v,
			
 
				-                uint8* dst_raw, int dst_stride_raw,
			
 
				-                int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_raw ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
			
 
				-    dst_stride_raw = -dst_stride_raw;
			
 
				-  }
			
 
				-  void (*I422ToRAWRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToRAWRow_C;
			
 
				-#if defined(HAS_I422TORAWROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRAWRow = I422ToRAWRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TORAWROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToRAWRow = I422ToRAWRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRAWRow = I422ToRAWRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
			
 
				-    dst_raw += dst_stride_raw;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to ARGB1555.
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
			
 
				-                   const uint8* src_u, int src_stride_u,
			
 
				-                   const uint8* src_v, int src_stride_v,
			
 
				-                   uint8* dst_argb1555, int dst_stride_argb1555,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
			
 
				-    dst_stride_argb1555 = -dst_stride_argb1555;
			
 
				-  }
			
 
				-  void (*I422ToARGB1555Row)(const uint8* y_buf,
			
 
				-                          const uint8* u_buf,
			
 
				-                          const uint8* v_buf,
			
 
				-                          uint8* rgb_buf,
			
 
				-                          int width) = I422ToARGB1555Row_C;
			
 
				-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOARGB1555ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGB1555Row = I422ToARGB1555Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
			
 
				-    dst_argb1555 += dst_stride_argb1555;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-// Convert I420 to ARGB4444.
			
 
				-LIBYUV_API
			
 
				-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
			
 
				-                   const uint8* src_u, int src_stride_u,
			
 
				-                   const uint8* src_v, int src_stride_v,
			
 
				-                   uint8* dst_argb4444, int dst_stride_argb4444,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
			
 
				-    dst_stride_argb4444 = -dst_stride_argb4444;
			
 
				-  }
			
 
				-  void (*I422ToARGB4444Row)(const uint8* y_buf,
			
 
				-                          const uint8* u_buf,
			
 
				-                          const uint8* v_buf,
			
 
				-                          uint8* rgb_buf,
			
 
				-                          int width) = I422ToARGB4444Row_C;
			
 
				-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOARGB4444ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
			
 
				-    dst_argb4444 += dst_stride_argb4444;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to RGB565.
			
 
				-LIBYUV_API
			
 
				-int I420ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_u, int src_stride_u,
			
 
				-                 const uint8* src_v, int src_stride_v,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
			
 
				-    dst_stride_rgb565 = -dst_stride_rgb565;
			
 
				-  }
			
 
				-  void (*I422ToRGB565Row)(const uint8* y_buf,
			
 
				-                          const uint8* u_buf,
			
 
				-                          const uint8* v_buf,
			
 
				-                          uint8* rgb_buf,
			
 
				-                          int width) = I422ToRGB565Row_C;
			
 
				-#if defined(HAS_I422TORGB565ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TORGB565ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGB565Row = I422ToRGB565Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
			
 
				-    dst_rgb565 += dst_stride_rgb565;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_u += src_stride_u;
			
 
				-      src_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to specified format
			
 
				-LIBYUV_API
			
 
				-int ConvertFromI420(const uint8* y, int y_stride,
			
 
				-                    const uint8* u, int u_stride,
			
 
				-                    const uint8* v, int v_stride,
			
 
				-                    uint8* dst_sample, int dst_sample_stride,
			
 
				-                    int width, int height,
			
 
				-                    uint32 fourcc) {
			
 
				-  uint32 format = CanonicalFourCC(fourcc);
			
 
				-  if (!y || !u|| !v || !dst_sample ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  int r = 0;
			
 
				-  switch (format) {
			
 
				-    // Single plane formats
			
 
				-    case FOURCC_YUY2:
			
 
				-      r = I420ToYUY2(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 2,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_UYVY:
			
 
				-      r = I420ToUYVY(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 2,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBP:
			
 
				-      r = I420ToRGB565(y, y_stride,
			
 
				-                       u, u_stride,
			
 
				-                       v, v_stride,
			
 
				-                       dst_sample,
			
 
				-                       dst_sample_stride ? dst_sample_stride : width * 2,
			
 
				-                       width, height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBO:
			
 
				-      r = I420ToARGB1555(y, y_stride,
			
 
				-                         u, u_stride,
			
 
				-                         v, v_stride,
			
 
				-                         dst_sample,
			
 
				-                         dst_sample_stride ? dst_sample_stride : width * 2,
			
 
				-                         width, height);
			
 
				-      break;
			
 
				-    case FOURCC_R444:
			
 
				-      r = I420ToARGB4444(y, y_stride,
			
 
				-                         u, u_stride,
			
 
				-                         v, v_stride,
			
 
				-                         dst_sample,
			
 
				-                         dst_sample_stride ? dst_sample_stride : width * 2,
			
 
				-                         width, height);
			
 
				-      break;
			
 
				-    case FOURCC_24BG:
			
 
				-      r = I420ToRGB24(y, y_stride,
			
 
				-                      u, u_stride,
			
 
				-                      v, v_stride,
			
 
				-                      dst_sample,
			
 
				-                      dst_sample_stride ? dst_sample_stride : width * 3,
			
 
				-                      width, height);
			
 
				-      break;
			
 
				-    case FOURCC_RAW:
			
 
				-      r = I420ToRAW(y, y_stride,
			
 
				-                    u, u_stride,
			
 
				-                    v, v_stride,
			
 
				-                    dst_sample,
			
 
				-                    dst_sample_stride ? dst_sample_stride : width * 3,
			
 
				-                    width, height);
			
 
				-      break;
			
 
				-    case FOURCC_ARGB:
			
 
				-      r = I420ToARGB(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 4,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_BGRA:
			
 
				-      r = I420ToBGRA(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 4,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_ABGR:
			
 
				-      r = I420ToABGR(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 4,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBA:
			
 
				-      r = I420ToRGBA(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width * 4,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    case FOURCC_BGGR:
			
 
				-      r = I420ToBayerBGGR(y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          dst_sample,
			
 
				-                          dst_sample_stride ? dst_sample_stride : width,
			
 
				-                          width, height);
			
 
				-      break;
			
 
				-    case FOURCC_GBRG:
			
 
				-      r = I420ToBayerGBRG(y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          dst_sample,
			
 
				-                          dst_sample_stride ? dst_sample_stride : width,
			
 
				-                          width, height);
			
 
				-      break;
			
 
				-    case FOURCC_GRBG:
			
 
				-      r = I420ToBayerGRBG(y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          dst_sample,
			
 
				-                          dst_sample_stride ? dst_sample_stride : width,
			
 
				-                          width, height);
			
 
				-      break;
			
 
				-    case FOURCC_RGGB:
			
 
				-      r = I420ToBayerRGGB(y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          dst_sample,
			
 
				-                          dst_sample_stride ? dst_sample_stride : width,
			
 
				-                          width, height);
			
 
				-      break;
			
 
				-    case FOURCC_I400:
			
 
				-      r = I400Copy(y, y_stride,
			
 
				-                   dst_sample,
			
 
				-                   dst_sample_stride ? dst_sample_stride : width,
			
 
				-                   width, height);
			
 
				-      break;
			
 
				-    case FOURCC_NV12: {
			
 
				-      uint8* dst_uv = dst_sample + width * height;
			
 
				-      r = I420ToNV12(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width,
			
 
				-                     dst_uv,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_NV21: {
			
 
				-      uint8* dst_vu = dst_sample + width * height;
			
 
				-      r = I420ToNV21(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width,
			
 
				-                     dst_vu,
			
 
				-                     dst_sample_stride ? dst_sample_stride : width,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-    // TODO(fbarchard): Add M420 and Q420.
			
 
				-    // Triplanar formats
			
 
				-    // TODO(fbarchard): halfstride instead of halfwidth
			
 
				-    case FOURCC_I420:
			
 
				-    case FOURCC_YU12:
			
 
				-    case FOURCC_YV12: {
			
 
				-      int halfwidth = (width + 1) / 2;
			
 
				-      int halfheight = (height + 1) / 2;
			
 
				-      uint8* dst_u;
			
 
				-      uint8* dst_v;
			
 
				-      if (format == FOURCC_YV12) {
			
 
				-        dst_v = dst_sample + width * height;
			
 
				-        dst_u = dst_v + halfwidth * halfheight;
			
 
				-      } else {
			
 
				-        dst_u = dst_sample + width * height;
			
 
				-        dst_v = dst_u + halfwidth * halfheight;
			
 
				-      }
			
 
				-      r = I420Copy(y, y_stride,
			
 
				-                   u, u_stride,
			
 
				-                   v, v_stride,
			
 
				-                   dst_sample, width,
			
 
				-                   dst_u, halfwidth,
			
 
				-                   dst_v, halfwidth,
			
 
				-                   width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I422:
			
 
				-    case FOURCC_YV16: {
			
 
				-      int halfwidth = (width + 1) / 2;
			
 
				-      uint8* dst_u;
			
 
				-      uint8* dst_v;
			
 
				-      if (format == FOURCC_YV16) {
			
 
				-        dst_v = dst_sample + width * height;
			
 
				-        dst_u = dst_v + halfwidth * height;
			
 
				-      } else {
			
 
				-        dst_u = dst_sample + width * height;
			
 
				-        dst_v = dst_u + halfwidth * height;
			
 
				-      }
			
 
				-      r = I420ToI422(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample, width,
			
 
				-                     dst_u, halfwidth,
			
 
				-                     dst_v, halfwidth,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I444:
			
 
				-    case FOURCC_YV24: {
			
 
				-      uint8* dst_u;
			
 
				-      uint8* dst_v;
			
 
				-      if (format == FOURCC_YV24) {
			
 
				-        dst_v = dst_sample + width * height;
			
 
				-        dst_u = dst_v + width * height;
			
 
				-      } else {
			
 
				-        dst_u = dst_sample + width * height;
			
 
				-        dst_v = dst_u + width * height;
			
 
				-      }
			
 
				-      r = I420ToI444(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample, width,
			
 
				-                     dst_u, width,
			
 
				-                     dst_v, width,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I411: {
			
 
				-      int quarterwidth = (width + 3) / 4;
			
 
				-      uint8* dst_u = dst_sample + width * height;
			
 
				-      uint8* dst_v = dst_u + quarterwidth * height;
			
 
				-      r = I420ToI411(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     dst_sample, width,
			
 
				-                     dst_u, quarterwidth,
			
 
				-                     dst_v, quarterwidth,
			
 
				-                     width, height);
			
 
				-      break;
			
 
				-    }
			
 
				-
			
 
				-    // Formats not supported - MJPG, biplanar, some rgb formats.
			
 
				-    default:
			
 
				-      return -1;  // unknown fourcc - return failure code.
			
 
				-  }
			
 
				-  return r;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_from_argb.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_from_argb.cc
@@ -1,1096 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert_from_argb.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// ARGB little endian (bgra in memory) to I444
			
 
				-LIBYUV_API
			
 
				-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_y == width &&
			
 
				-      dst_stride_u == width &&
			
 
				-      dst_stride_v == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) = ARGBToUV444Row_C;
			
 
				-#if defined(HAS_ARGBTOUV444ROW_SSSE3)
			
 
				-    if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
			
 
				-        if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-          ARGBToUV444Row = ARGBToUV444Row_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-      ARGBToUV444Row = ARGBToUV444Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// ARGB little endian (bgra in memory) to I422
			
 
				-LIBYUV_API
			
 
				-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_y == width &&
			
 
				-      dst_stride_u * 2 == width &&
			
 
				-      dst_stride_v * 2 == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) = ARGBToUV422Row_C;
			
 
				-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToUV422Row(src_argb, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// ARGB little endian (bgra in memory) to I411
			
 
				-LIBYUV_API
			
 
				-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_y == width &&
			
 
				-      dst_stride_u * 4 == width &&
			
 
				-      dst_stride_v * 4 == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) = ARGBToUV411Row_C;
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      ARGBToYRow = ARGBToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 32) {
			
 
				-      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 32)) {
			
 
				-        ARGBToUV411Row = ARGBToUV411Row_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_uv, int dst_stride_uv,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb ||
			
 
				-      !dst_y || !dst_uv ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                      int width) = MergeUVRow_C;
			
 
				-#if defined(HAS_MERGEUVROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
			
 
				-        MergeUVRow_ = MergeUVRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(halfwidth, 32)) {
			
 
				-      MergeUVRow_ = MergeUVRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Allocate a rows of uv.
			
 
				-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
			
 
				-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
			
 
				-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
			
 
				-    src_argb += src_stride_argb * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_uv += dst_stride_uv;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
			
 
				-    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row_u);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Same as NV12 but U and V swapped.
			
 
				-LIBYUV_API
			
 
				-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_uv, int dst_stride_uv,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb ||
			
 
				-      !dst_y || !dst_uv ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                      int width) = MergeUVRow_C;
			
 
				-#if defined(HAS_MERGEUVROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
			
 
				-        MergeUVRow_ = MergeUVRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(halfwidth, 32)) {
			
 
				-      MergeUVRow_ = MergeUVRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MERGEUVROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
			
 
				-    MergeUVRow_ = MergeUVRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(halfwidth, 16)) {
			
 
				-      MergeUVRow_ = MergeUVRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Allocate a rows of uv.
			
 
				-  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
			
 
				-  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
			
 
				-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
			
 
				-    src_argb += src_stride_argb * 2;
			
 
				-    dst_y += dst_stride_y * 2;
			
 
				-    dst_uv += dst_stride_uv;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
			
 
				-    MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row_u);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to YUY2.
			
 
				-LIBYUV_API
			
 
				-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yuy2, int dst_stride_yuy2,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_yuy2 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
			
 
				-    dst_stride_yuy2 = -dst_stride_yuy2;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_yuy2 == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_yuy2 = 0;
			
 
				-  }
			
 
				-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) = ARGBToUV422Row_C;
			
 
				-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_yuy2, int width) =
			
 
				-      I422ToYUY2Row_C;
			
 
				-#if defined(HAS_I422TOYUY2ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOYUY2ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToYUY2Row = I422ToYUY2Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToUV422Row(src_argb, row_u, row_v, width);
			
 
				-    ARGBToYRow(src_argb, row_y, width);
			
 
				-    I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_yuy2 += dst_stride_yuy2;
			
 
				-  }
			
 
				-
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to UYVY.
			
 
				-LIBYUV_API
			
 
				-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_uyvy, int dst_stride_uyvy,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_uyvy ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
			
 
				-    dst_stride_uyvy = -dst_stride_uyvy;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_uyvy == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_uyvy = 0;
			
 
				-  }
			
 
				-  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) = ARGBToUV422Row_C;
			
 
				-#if defined(HAS_ARGBTOUV422ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUV422Row = ARGBToUV422Row_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
			
 
				-                        const uint8* src_v, uint8* dst_uyvy, int width) =
			
 
				-      I422ToUYVYRow_C;
			
 
				-#if defined(HAS_I422TOUYVYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOUYVYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
			
 
				-    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToUYVYRow = I422ToUYVYRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToUV422Row(src_argb, row_u, row_v, width);
			
 
				-    ARGBToYRow(src_argb, row_y, width);
			
 
				-    I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_uyvy += dst_stride_uyvy;
			
 
				-  }
			
 
				-
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to I400.
			
 
				-LIBYUV_API
			
 
				-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_y || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_y == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_y = 0;
			
 
				-  }
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      ARGBToYRow = ARGBToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToYRow(src_argb, dst_y, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_y += dst_stride_y;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RGBA.
			
 
				-static uvec8 kShuffleMaskARGBToRGBA = {
			
 
				-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
			
 
				-};
			
 
				-
			
 
				-// Convert ARGB to RGBA.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_rgba, int dst_stride_rgba,
			
 
				-               int width, int height) {
			
 
				-  return ARGBShuffle(src_argb, src_stride_argb,
			
 
				-                     dst_rgba, dst_stride_rgba,
			
 
				-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
			
 
				-                     width, height);
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB To RGB24.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_rgb24, int dst_stride_rgb24,
			
 
				-                int width, int height) {
			
 
				-  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_rgb24 == width * 3) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_rgb24 = 0;
			
 
				-  }
			
 
				-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
			
 
				-      ARGBToRGB24Row_C;
			
 
				-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTORGB24ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToRGB24Row(src_argb, dst_rgb24, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_rgb24 += dst_stride_rgb24;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB To RAW.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_raw, int dst_stride_raw,
			
 
				-              int width, int height) {
			
 
				-  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_raw == width * 3) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_raw = 0;
			
 
				-  }
			
 
				-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
			
 
				-      ARGBToRAWRow_C;
			
 
				-#if defined(HAS_ARGBTORAWROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTORAWROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToRAWRow = ARGBToRAWRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToRAWRow(src_argb, dst_raw, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_raw += dst_stride_raw;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB To RGB565.
			
 
				-LIBYUV_API
			
 
				-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height) {
			
 
				-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_rgb565 == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_rgb565 = 0;
			
 
				-  }
			
 
				-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
			
 
				-      ARGBToRGB565Row_C;
			
 
				-#if defined(HAS_ARGBTORGB565ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTORGB565ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToRGB565Row = ARGBToRGB565Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToRGB565Row(src_argb, dst_rgb565, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_rgb565 += dst_stride_rgb565;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB To ARGB1555.
			
 
				-LIBYUV_API
			
 
				-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb1555, int dst_stride_argb1555,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb1555 == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb1555 = 0;
			
 
				-  }
			
 
				-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
			
 
				-      ARGBToARGB1555Row_C;
			
 
				-#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb1555 += dst_stride_argb1555;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB To ARGB4444.
			
 
				-LIBYUV_API
			
 
				-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb4444, int dst_stride_argb4444,
			
 
				-                   int width, int height) {
			
 
				-  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb4444 == width * 2) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb4444 = 0;
			
 
				-  }
			
 
				-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
			
 
				-      ARGBToARGB4444Row_C;
			
 
				-#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb4444 += dst_stride_argb4444;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to J420. (JPeg full range I420).
			
 
				-LIBYUV_API
			
 
				-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yj, int dst_stride_yj,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb ||
			
 
				-      !dst_yj || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
			
 
				-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
			
 
				-      ARGBToYJRow_C;
			
 
				-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
			
 
				-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-        ARGBToUVJRow = ARGBToUVJRow_SSSE3;
			
 
				-        if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
			
 
				-          ARGBToYJRow = ARGBToYJRow_SSSE3;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      ARGBToYJRow = ARGBToYJRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYJROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYJRow = ARGBToYJRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUVJRow = ARGBToUVJRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height - 1; y += 2) {
			
 
				-    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
			
 
				-    ARGBToYJRow(src_argb, dst_yj, width);
			
 
				-    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
			
 
				-    src_argb += src_stride_argb * 2;
			
 
				-    dst_yj += dst_stride_yj * 2;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
			
 
				-    ARGBToYJRow(src_argb, dst_yj, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to J400.
			
 
				-LIBYUV_API
			
 
				-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_yj, int dst_stride_yj,
			
 
				-               int width, int height) {
			
 
				-  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_yj == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_yj = 0;
			
 
				-  }
			
 
				-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
			
 
				-      ARGBToYJRow_C;
			
 
				-#if defined(HAS_ARGBTOYJROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-          IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
			
 
				-        ARGBToYJRow = ARGBToYJRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYJROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      ARGBToYJRow = ARGBToYJRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOYJROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYJRow = ARGBToYJRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    ARGBToYJRow(src_argb, dst_yj, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_yj += dst_stride_yj;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_jpeg.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_jpeg.cc
@@ -1,392 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert.h"
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-#endif
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-struct I420Buffers {
			
 
				-  uint8* y;
			
 
				-  int y_stride;
			
 
				-  uint8* u;
			
 
				-  int u_stride;
			
 
				-  uint8* v;
			
 
				-  int v_stride;
			
 
				-  int w;
			
 
				-  int h;
			
 
				-};
			
 
				-
			
 
				-static void JpegCopyI420(void* opaque,
			
 
				-                         const uint8* const* data,
			
 
				-                         const int* strides,
			
 
				-                         int rows) {
			
 
				-  I420Buffers* dest = (I420Buffers*)(opaque);
			
 
				-  I420Copy(data[0], strides[0],
			
 
				-           data[1], strides[1],
			
 
				-           data[2], strides[2],
			
 
				-           dest->y, dest->y_stride,
			
 
				-           dest->u, dest->u_stride,
			
 
				-           dest->v, dest->v_stride,
			
 
				-           dest->w, rows);
			
 
				-  dest->y += rows * dest->y_stride;
			
 
				-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
			
 
				-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI422ToI420(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  I420Buffers* dest = (I420Buffers*)(opaque);
			
 
				-  I422ToI420(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->y, dest->y_stride,
			
 
				-             dest->u, dest->u_stride,
			
 
				-             dest->v, dest->v_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->y += rows * dest->y_stride;
			
 
				-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
			
 
				-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI444ToI420(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  I420Buffers* dest = (I420Buffers*)(opaque);
			
 
				-  I444ToI420(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->y, dest->y_stride,
			
 
				-             dest->u, dest->u_stride,
			
 
				-             dest->v, dest->v_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->y += rows * dest->y_stride;
			
 
				-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
			
 
				-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI411ToI420(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  I420Buffers* dest = (I420Buffers*)(opaque);
			
 
				-  I411ToI420(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->y, dest->y_stride,
			
 
				-             dest->u, dest->u_stride,
			
 
				-             dest->v, dest->v_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->y += rows * dest->y_stride;
			
 
				-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
			
 
				-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI400ToI420(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  I420Buffers* dest = (I420Buffers*)(opaque);
			
 
				-  I400ToI420(data[0], strides[0],
			
 
				-             dest->y, dest->y_stride,
			
 
				-             dest->u, dest->u_stride,
			
 
				-             dest->v, dest->v_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->y += rows * dest->y_stride;
			
 
				-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
			
 
				-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-// Query size of MJPG in pixels.
			
 
				-LIBYUV_API
			
 
				-int MJPGSize(const uint8* sample, size_t sample_size,
			
 
				-             int* width, int* height) {
			
 
				-  MJpegDecoder mjpeg_decoder;
			
 
				-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
			
 
				-  if (ret) {
			
 
				-    *width = mjpeg_decoder.GetWidth();
			
 
				-    *height = mjpeg_decoder.GetHeight();
			
 
				-  }
			
 
				-  mjpeg_decoder.UnloadFrame();
			
 
				-  return ret ? 0 : -1;  // -1 for runtime failure.
			
 
				-}
			
 
				-
			
 
				-// MJPG (Motion JPeg) to I420
			
 
				-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
			
 
				-LIBYUV_API
			
 
				-int MJPGToI420(const uint8* sample,
			
 
				-               size_t sample_size,
			
 
				-               uint8* y, int y_stride,
			
 
				-               uint8* u, int u_stride,
			
 
				-               uint8* v, int v_stride,
			
 
				-               int w, int h,
			
 
				-               int dw, int dh) {
			
 
				-  if (sample_size == kUnknownDataSize) {
			
 
				-    // ERROR: MJPEG frame size unknown
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // TODO(fbarchard): Port MJpeg to C.
			
 
				-  MJpegDecoder mjpeg_decoder;
			
 
				-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
			
 
				-  if (ret && (mjpeg_decoder.GetWidth() != w ||
			
 
				-              mjpeg_decoder.GetHeight() != h)) {
			
 
				-    // ERROR: MJPEG frame has unexpected dimensions
			
 
				-    mjpeg_decoder.UnloadFrame();
			
 
				-    return 1;  // runtime failure
			
 
				-  }
			
 
				-  if (ret) {
			
 
				-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
			
 
				-    // YUV420
			
 
				-    if (mjpeg_decoder.GetColorSpace() ==
			
 
				-            MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-        mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
			
 
				-    // YUV422
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
			
 
				-    // YUV444
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
			
 
				-    // YUV411
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
			
 
				-    // YUV400
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceGrayscale &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
			
 
				-    } else {
			
 
				-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
			
 
				-      // factors that occur in practice. 411 is supported by libjpeg
			
 
				-      // ERROR: Unable to convert MJPEG frame because format is not supported
			
 
				-      mjpeg_decoder.UnloadFrame();
			
 
				-      return 1;
			
 
				-    }
			
 
				-  }
			
 
				-  return ret ? 0 : 1;
			
 
				-}
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-struct ARGBBuffers {
			
 
				-  uint8* argb;
			
 
				-  int argb_stride;
			
 
				-  int w;
			
 
				-  int h;
			
 
				-};
			
 
				-
			
 
				-static void JpegI420ToARGB(void* opaque,
			
 
				-                         const uint8* const* data,
			
 
				-                         const int* strides,
			
 
				-                         int rows) {
			
 
				-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
			
 
				-  I420ToARGB(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->argb, dest->argb_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->argb += rows * dest->argb_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI422ToARGB(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
			
 
				-  I422ToARGB(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->argb, dest->argb_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->argb += rows * dest->argb_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI444ToARGB(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
			
 
				-  I444ToARGB(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->argb, dest->argb_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->argb += rows * dest->argb_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI411ToARGB(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
			
 
				-  I411ToARGB(data[0], strides[0],
			
 
				-             data[1], strides[1],
			
 
				-             data[2], strides[2],
			
 
				-             dest->argb, dest->argb_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->argb += rows * dest->argb_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-static void JpegI400ToARGB(void* opaque,
			
 
				-                           const uint8* const* data,
			
 
				-                           const int* strides,
			
 
				-                           int rows) {
			
 
				-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
			
 
				-  I400ToARGB(data[0], strides[0],
			
 
				-             dest->argb, dest->argb_stride,
			
 
				-             dest->w, rows);
			
 
				-  dest->argb += rows * dest->argb_stride;
			
 
				-  dest->h -= rows;
			
 
				-}
			
 
				-
			
 
				-// MJPG (Motion JPeg) to ARGB
			
 
				-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
			
 
				-LIBYUV_API
			
 
				-int MJPGToARGB(const uint8* sample,
			
 
				-               size_t sample_size,
			
 
				-               uint8* argb, int argb_stride,
			
 
				-               int w, int h,
			
 
				-               int dw, int dh) {
			
 
				-  if (sample_size == kUnknownDataSize) {
			
 
				-    // ERROR: MJPEG frame size unknown
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // TODO(fbarchard): Port MJpeg to C.
			
 
				-  MJpegDecoder mjpeg_decoder;
			
 
				-  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
			
 
				-  if (ret && (mjpeg_decoder.GetWidth() != w ||
			
 
				-              mjpeg_decoder.GetHeight() != h)) {
			
 
				-    // ERROR: MJPEG frame has unexpected dimensions
			
 
				-    mjpeg_decoder.UnloadFrame();
			
 
				-    return 1;  // runtime failure
			
 
				-  }
			
 
				-  if (ret) {
			
 
				-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
			
 
				-    // YUV420
			
 
				-    if (mjpeg_decoder.GetColorSpace() ==
			
 
				-            MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-        mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
			
 
				-    // YUV422
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
			
 
				-    // YUV444
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
			
 
				-    // YUV411
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceYCbCr &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 3 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
			
 
				-    // YUV400
			
 
				-    } else if (mjpeg_decoder.GetColorSpace() ==
			
 
				-                   MJpegDecoder::kColorSpaceGrayscale &&
			
 
				-               mjpeg_decoder.GetNumComponents() == 1 &&
			
 
				-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
			
 
				-               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
			
 
				-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
			
 
				-    } else {
			
 
				-      // TODO(fbarchard): Implement conversion for any other colorspace/sample
			
 
				-      // factors that occur in practice. 411 is supported by libjpeg
			
 
				-      // ERROR: Unable to convert MJPEG frame because format is not supported
			
 
				-      mjpeg_decoder.UnloadFrame();
			
 
				-      return 1;
			
 
				-    }
			
 
				-  }
			
 
				-  return ret ? 0 : 1;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_to_argb.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_to_argb.cc
@@ -1,327 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/convert_argb.h"
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#ifdef HAVE_JPEG
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-#endif
			
 
				-#include "libyuv/rotate_argb.h"
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/video_common.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Convert camera sample to I420 with cropping, rotation and vertical flip.
			
 
				-// src_width is used for source stride computation
			
 
				-// src_height is used to compute location of planes, and indicate inversion
			
 
				-// sample_size is measured in bytes and is the size of the frame.
			
 
				-//   With MJPEG it is the compressed size of the frame.
			
 
				-LIBYUV_API
			
 
				-int ConvertToARGB(const uint8* sample, size_t sample_size,
			
 
				-                  uint8* crop_argb, int argb_stride,
			
 
				-                  int crop_x, int crop_y,
			
 
				-                  int src_width, int src_height,
			
 
				-                  int crop_width, int crop_height,
			
 
				-                  enum RotationMode rotation,
			
 
				-                  uint32 fourcc) {
			
 
				-  uint32 format = CanonicalFourCC(fourcc);
			
 
				-  int aligned_src_width = (src_width + 1) & ~1;
			
 
				-  const uint8* src;
			
 
				-  const uint8* src_uv;
			
 
				-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
			
 
				-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
			
 
				-  int r = 0;
			
 
				-
			
 
				-  // One pass rotation is available for some formats. For the rest, convert
			
 
				-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
			
 
				-  // and then rotate the I420 to the final destination buffer.
			
 
				-  // For in-place conversion, if destination crop_argb is same as source sample,
			
 
				-  // also enable temporary buffer.
			
 
				-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
			
 
				-      crop_argb == sample;
			
 
				-  uint8* tmp_argb = crop_argb;
			
 
				-  int tmp_argb_stride = argb_stride;
			
 
				-  uint8* rotate_buffer = NULL;
			
 
				-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
			
 
				-
			
 
				-  if (crop_argb == NULL || sample == NULL ||
			
 
				-      src_width <= 0 || crop_width <= 0 ||
			
 
				-      src_height == 0 || crop_height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (src_height < 0) {
			
 
				-    inv_crop_height = -inv_crop_height;
			
 
				-  }
			
 
				-
			
 
				-  if (need_buf) {
			
 
				-    int argb_size = crop_width * abs_crop_height * 4;
			
 
				-    rotate_buffer = (uint8*)malloc(argb_size);
			
 
				-    if (!rotate_buffer) {
			
 
				-      return 1;  // Out of memory runtime error.
			
 
				-    }
			
 
				-    crop_argb = rotate_buffer;
			
 
				-    argb_stride = crop_width;
			
 
				-  }
			
 
				-
			
 
				-  switch (format) {
			
 
				-    // Single plane formats
			
 
				-    case FOURCC_YUY2:
			
 
				-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
			
 
				-      r = YUY2ToARGB(src, aligned_src_width * 2,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_UYVY:
			
 
				-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
			
 
				-      r = UYVYToARGB(src, aligned_src_width * 2,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_24BG:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 3;
			
 
				-      r = RGB24ToARGB(src, src_width * 3,
			
 
				-                      crop_argb, argb_stride,
			
 
				-                      crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RAW:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 3;
			
 
				-      r = RAWToARGB(src, src_width * 3,
			
 
				-                    crop_argb, argb_stride,
			
 
				-                    crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_ARGB:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = ARGBToARGB(src, src_width * 4,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_BGRA:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = BGRAToARGB(src, src_width * 4,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_ABGR:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = ABGRToARGB(src, src_width * 4,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBA:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = RGBAToARGB(src, src_width * 4,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBP:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = RGB565ToARGB(src, src_width * 2,
			
 
				-                       crop_argb, argb_stride,
			
 
				-                       crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBO:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = ARGB1555ToARGB(src, src_width * 2,
			
 
				-                         crop_argb, argb_stride,
			
 
				-                         crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_R444:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = ARGB4444ToARGB(src, src_width * 2,
			
 
				-                         crop_argb, argb_stride,
			
 
				-                         crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    // TODO(fbarchard): Support cropping Bayer by odd numbers
			
 
				-    // by adjusting fourcc.
			
 
				-    case FOURCC_BGGR:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerBGGRToARGB(src, src_width,
			
 
				-                          crop_argb, argb_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-
			
 
				-    case FOURCC_GBRG:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerGBRGToARGB(src, src_width,
			
 
				-                          crop_argb, argb_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-
			
 
				-    case FOURCC_GRBG:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerGRBGToARGB(src, src_width,
			
 
				-                          crop_argb, argb_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-
			
 
				-    case FOURCC_RGGB:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerRGGBToARGB(src, src_width,
			
 
				-                          crop_argb, argb_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-
			
 
				-    case FOURCC_I400:
			
 
				-      src = sample + src_width * crop_y + crop_x;
			
 
				-      r = I400ToARGB(src, src_width,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-
			
 
				-    // Biplanar formats
			
 
				-    case FOURCC_NV12:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
			
 
				-      r = NV12ToARGB(src, src_width,
			
 
				-                     src_uv, aligned_src_width,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_NV21:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
			
 
				-      // Call NV12 but with u and v parameters swapped.
			
 
				-      r = NV21ToARGB(src, src_width,
			
 
				-                     src_uv, aligned_src_width,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_M420:
			
 
				-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
			
 
				-      r = M420ToARGB(src, src_width,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-//    case FOURCC_Q420:
			
 
				-//      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
			
 
				-//      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
			
 
				-//               src_width + crop_x * 2;
			
 
				-//      r = Q420ToARGB(src, src_width * 3,
			
 
				-//                    src_uv, src_width * 3,
			
 
				-//                    crop_argb, argb_stride,
			
 
				-//                    crop_width, inv_crop_height);
			
 
				-//      break;
			
 
				-    // Triplanar formats
			
 
				-    case FOURCC_I420:
			
 
				-    case FOURCC_YU12:
			
 
				-    case FOURCC_YV12: {
			
 
				-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      int halfwidth = (src_width + 1) / 2;
			
 
				-      int halfheight = (abs_src_height + 1) / 2;
			
 
				-      if (format == FOURCC_YV12) {
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            (halfwidth * crop_y + crop_x) / 2;
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            (halfwidth * crop_y + crop_x) / 2;
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
			
 
				-      }
			
 
				-      r = I420ToARGB(src_y, src_width,
			
 
				-                     src_u, halfwidth,
			
 
				-                     src_v, halfwidth,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I422:
			
 
				-    case FOURCC_YV16: {
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      int halfwidth = (src_width + 1) / 2;
			
 
				-      if (format == FOURCC_YV16) {
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * crop_y + crop_x / 2;
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * crop_y + crop_x / 2;
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
			
 
				-      }
			
 
				-      r = I422ToARGB(src_y, src_width,
			
 
				-                     src_u, halfwidth,
			
 
				-                     src_v, halfwidth,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I444:
			
 
				-    case FOURCC_YV24: {
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      if (format == FOURCC_YV24) {
			
 
				-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
			
 
				-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
			
 
				-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
			
 
				-      }
			
 
				-      r = I444ToARGB(src_y, src_width,
			
 
				-                     src_u, src_width,
			
 
				-                     src_v, src_width,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I411: {
			
 
				-      int quarterwidth = (src_width + 3) / 4;
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u = sample + src_width * abs_src_height +
			
 
				-          quarterwidth * crop_y + crop_x / 4;
			
 
				-      const uint8* src_v = sample + src_width * abs_src_height +
			
 
				-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
			
 
				-      r = I411ToARGB(src_y, src_width,
			
 
				-                     src_u, quarterwidth,
			
 
				-                     src_v, quarterwidth,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-#ifdef HAVE_JPEG
			
 
				-    case FOURCC_MJPG:
			
 
				-      r = MJPGToARGB(sample, sample_size,
			
 
				-                     crop_argb, argb_stride,
			
 
				-                     src_width, abs_src_height, crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-#endif
			
 
				-    default:
			
 
				-      r = -1;  // unknown fourcc - return failure code.
			
 
				-  }
			
 
				-
			
 
				-  if (need_buf) {
			
 
				-    if (!r) {
			
 
				-      r = ARGBRotate(crop_argb, argb_stride,
			
 
				-                     tmp_argb, tmp_argb_stride,
			
 
				-                     crop_width, abs_crop_height, rotation);
			
 
				-    }
			
 
				-    free(rotate_buffer);
			
 
				-  }
			
 
				-
			
 
				-  return r;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/convert_to_i420.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/convert_to_i420.cc
@@ -1,383 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include <stdlib.h>
			
 
				-
			
 
				-#include "libyuv/convert.h"
			
 
				-
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-#include "libyuv/video_common.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Convert camera sample to I420 with cropping, rotation and vertical flip.
			
 
				-// src_width is used for source stride computation
			
 
				-// src_height is used to compute location of planes, and indicate inversion
			
 
				-// sample_size is measured in bytes and is the size of the frame.
			
 
				-//   With MJPEG it is the compressed size of the frame.
			
 
				-LIBYUV_API
			
 
				-int ConvertToI420(const uint8* sample,
			
 
				-                  size_t sample_size,
			
 
				-                  uint8* y, int y_stride,
			
 
				-                  uint8* u, int u_stride,
			
 
				-                  uint8* v, int v_stride,
			
 
				-                  int crop_x, int crop_y,
			
 
				-                  int src_width, int src_height,
			
 
				-                  int crop_width, int crop_height,
			
 
				-                  enum RotationMode rotation,
			
 
				-                  uint32 fourcc) {
			
 
				-  uint32 format = CanonicalFourCC(fourcc);
			
 
				-  int aligned_src_width = (src_width + 1) & ~1;
			
 
				-  const uint8* src;
			
 
				-  const uint8* src_uv;
			
 
				-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
			
 
				-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
			
 
				-  int r = 0;
			
 
				-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
			
 
				-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
			
 
				-      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
			
 
				-  uint8* tmp_y = y;
			
 
				-  uint8* tmp_u = u;
			
 
				-  uint8* tmp_v = v;
			
 
				-  int tmp_y_stride = y_stride;
			
 
				-  int tmp_u_stride = u_stride;
			
 
				-  int tmp_v_stride = v_stride;
			
 
				-  uint8* rotate_buffer = NULL;
			
 
				-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
			
 
				-
			
 
				-  if (!y || !u || !v || !sample ||
			
 
				-      src_width <= 0 || crop_width <= 0  ||
			
 
				-      src_height == 0 || crop_height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (src_height < 0) {
			
 
				-    inv_crop_height = -inv_crop_height;
			
 
				-  }
			
 
				-
			
 
				-  // One pass rotation is available for some formats. For the rest, convert
			
 
				-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
			
 
				-  // and then rotate the I420 to the final destination buffer.
			
 
				-  // For in-place conversion, if destination y is same as source sample,
			
 
				-  // also enable temporary buffer.
			
 
				-  if (need_buf) {
			
 
				-    int y_size = crop_width * abs_crop_height;
			
 
				-    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
			
 
				-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
			
 
				-    if (!rotate_buffer) {
			
 
				-      return 1;  // Out of memory runtime error.
			
 
				-    }
			
 
				-    y = rotate_buffer;
			
 
				-    u = y + y_size;
			
 
				-    v = u + uv_size;
			
 
				-    y_stride = crop_width;
			
 
				-    u_stride = v_stride = ((crop_width + 1) / 2);
			
 
				-  }
			
 
				-
			
 
				-  switch (format) {
			
 
				-    // Single plane formats
			
 
				-    case FOURCC_YUY2:
			
 
				-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
			
 
				-      r = YUY2ToI420(src, aligned_src_width * 2,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_UYVY:
			
 
				-      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
			
 
				-      r = UYVYToI420(src, aligned_src_width * 2,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBP:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = RGB565ToI420(src, src_width * 2,
			
 
				-                       y, y_stride,
			
 
				-                       u, u_stride,
			
 
				-                       v, v_stride,
			
 
				-                       crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBO:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = ARGB1555ToI420(src, src_width * 2,
			
 
				-                         y, y_stride,
			
 
				-                         u, u_stride,
			
 
				-                         v, v_stride,
			
 
				-                         crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_R444:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 2;
			
 
				-      r = ARGB4444ToI420(src, src_width * 2,
			
 
				-                         y, y_stride,
			
 
				-                         u, u_stride,
			
 
				-                         v, v_stride,
			
 
				-                         crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_24BG:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 3;
			
 
				-      r = RGB24ToI420(src, src_width * 3,
			
 
				-                      y, y_stride,
			
 
				-                      u, u_stride,
			
 
				-                      v, v_stride,
			
 
				-                      crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RAW:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 3;
			
 
				-      r = RAWToI420(src, src_width * 3,
			
 
				-                    y, y_stride,
			
 
				-                    u, u_stride,
			
 
				-                    v, v_stride,
			
 
				-                    crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_ARGB:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = ARGBToI420(src, src_width * 4,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_BGRA:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = BGRAToI420(src, src_width * 4,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_ABGR:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = ABGRToI420(src, src_width * 4,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGBA:
			
 
				-      src = sample + (src_width * crop_y + crop_x) * 4;
			
 
				-      r = RGBAToI420(src, src_width * 4,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    // TODO(fbarchard): Support cropping Bayer by odd numbers
			
 
				-    // by adjusting fourcc.
			
 
				-    case FOURCC_BGGR:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerBGGRToI420(src, src_width,
			
 
				-                          y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_GBRG:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerGBRGToI420(src, src_width,
			
 
				-                          y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_GRBG:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerGRBGToI420(src, src_width,
			
 
				-                          y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_RGGB:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      r = BayerRGGBToI420(src, src_width,
			
 
				-                          y, y_stride,
			
 
				-                          u, u_stride,
			
 
				-                          v, v_stride,
			
 
				-                          crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_I400:
			
 
				-      src = sample + src_width * crop_y + crop_x;
			
 
				-      r = I400ToI420(src, src_width,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    // Biplanar formats
			
 
				-    case FOURCC_NV12:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
			
 
				-      r = NV12ToI420Rotate(src, src_width,
			
 
				-                           src_uv, aligned_src_width,
			
 
				-                           y, y_stride,
			
 
				-                           u, u_stride,
			
 
				-                           v, v_stride,
			
 
				-                           crop_width, inv_crop_height, rotation);
			
 
				-      break;
			
 
				-    case FOURCC_NV21:
			
 
				-      src = sample + (src_width * crop_y + crop_x);
			
 
				-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
			
 
				-      // Call NV12 but with u and v parameters swapped.
			
 
				-      r = NV12ToI420Rotate(src, src_width,
			
 
				-                           src_uv, aligned_src_width,
			
 
				-                           y, y_stride,
			
 
				-                           v, v_stride,
			
 
				-                           u, u_stride,
			
 
				-                           crop_width, inv_crop_height, rotation);
			
 
				-      break;
			
 
				-    case FOURCC_M420:
			
 
				-      src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
			
 
				-      r = M420ToI420(src, src_width,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    case FOURCC_Q420:
			
 
				-      src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
			
 
				-      src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
			
 
				-               src_width + crop_x * 2;
			
 
				-      r = Q420ToI420(src, src_width * 3,
			
 
				-                    src_uv, src_width * 3,
			
 
				-                    y, y_stride,
			
 
				-                    u, u_stride,
			
 
				-                    v, v_stride,
			
 
				-                    crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    // Triplanar formats
			
 
				-    case FOURCC_I420:
			
 
				-    case FOURCC_YU12:
			
 
				-    case FOURCC_YV12: {
			
 
				-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      int halfwidth = (src_width + 1) / 2;
			
 
				-      int halfheight = (abs_src_height + 1) / 2;
			
 
				-      if (format == FOURCC_YV12) {
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            (halfwidth * crop_y + crop_x) / 2;
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            (halfwidth * crop_y + crop_x) / 2;
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
			
 
				-      }
			
 
				-      r = I420Rotate(src_y, src_width,
			
 
				-                     src_u, halfwidth,
			
 
				-                     src_v, halfwidth,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height, rotation);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I422:
			
 
				-    case FOURCC_YV16: {
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      int halfwidth = (src_width + 1) / 2;
			
 
				-      if (format == FOURCC_YV16) {
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * crop_y + crop_x / 2;
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * abs_src_height +
			
 
				-            halfwidth * crop_y + crop_x / 2;
			
 
				-        src_v = sample + src_width * abs_src_height +
			
 
				-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
			
 
				-      }
			
 
				-      r = I422ToI420(src_y, src_width,
			
 
				-                     src_u, halfwidth,
			
 
				-                     src_v, halfwidth,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I444:
			
 
				-    case FOURCC_YV24: {
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u;
			
 
				-      const uint8* src_v;
			
 
				-      if (format == FOURCC_YV24) {
			
 
				-        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
			
 
				-        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
			
 
				-      } else {
			
 
				-        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
			
 
				-        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
			
 
				-      }
			
 
				-      r = I444ToI420(src_y, src_width,
			
 
				-                     src_u, src_width,
			
 
				-                     src_v, src_width,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-    case FOURCC_I411: {
			
 
				-      int quarterwidth = (src_width + 3) / 4;
			
 
				-      const uint8* src_y = sample + src_width * crop_y + crop_x;
			
 
				-      const uint8* src_u = sample + src_width * abs_src_height +
			
 
				-          quarterwidth * crop_y + crop_x / 4;
			
 
				-      const uint8* src_v = sample + src_width * abs_src_height +
			
 
				-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
			
 
				-      r = I411ToI420(src_y, src_width,
			
 
				-                     src_u, quarterwidth,
			
 
				-                     src_v, quarterwidth,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-    }
			
 
				-#ifdef HAVE_JPEG
			
 
				-    case FOURCC_MJPG:
			
 
				-      r = MJPGToI420(sample, sample_size,
			
 
				-                     y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     src_width, abs_src_height, crop_width, inv_crop_height);
			
 
				-      break;
			
 
				-#endif
			
 
				-    default:
			
 
				-      r = -1;  // unknown fourcc - return failure code.
			
 
				-  }
			
 
				-
			
 
				-  if (need_buf) {
			
 
				-    if (!r) {
			
 
				-      r = I420Rotate(y, y_stride,
			
 
				-                     u, u_stride,
			
 
				-                     v, v_stride,
			
 
				-                     tmp_y, tmp_y_stride,
			
 
				-                     tmp_u, tmp_u_stride,
			
 
				-                     tmp_v, tmp_v_stride,
			
 
				-                     crop_width, abs_crop_height, rotation);
			
 
				-    }
			
 
				-    free(rotate_buffer);
			
 
				-  }
			
 
				-
			
 
				-  return r;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/cpu_id.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/cpu_id.cc
@@ -1,300 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-
			
 
				-#ifdef _ANDROID //libtheoraplayer addition for cpu feature detection
			
 
				-#include "cpu-features.h"
			
 
				-#endif
			
 
				-
			
 
				-#ifdef _MSC_VER
			
 
				-#include <intrin.h>  // For __cpuidex()
			
 
				-#endif
			
 
				-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
			
 
				-    !defined(__native_client__) && defined(_M_X64) && \
			
 
				-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
			
 
				-#include <immintrin.h>  // For _xgetbv()
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(__native_client__)
			
 
				-#include <stdlib.h>  // For getenv()
			
 
				-#endif
			
 
				-
			
 
				-// For ArmCpuCaps() but unittested on all platforms
			
 
				-#include <stdio.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-#include "libyuv/basic_types.h"  // For CPU_X86
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// For functions that use the stack and have runtime checks for overflow,
			
 
				-// use SAFEBUFFERS to avoid additional check.
			
 
				-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
			
 
				-#define SAFEBUFFERS __declspec(safebuffers)
			
 
				-#else
			
 
				-#define SAFEBUFFERS
			
 
				-#endif
			
 
				-
			
 
				-// Low level cpuid for X86. Returns zeros on other CPUs.
			
 
				-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
			
 
				-    (defined(_M_IX86) || defined(_M_X64) || \
			
 
				-    defined(__i386__) || defined(__x86_64__))
			
 
				-LIBYUV_API
			
 
				-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
			
 
				-#if defined(_MSC_VER)
			
 
				-#if (_MSC_FULL_VER >= 160040219)
			
 
				-  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
			
 
				-#elif defined(_M_IX86)
			
 
				-  __asm {
			
 
				-    mov        eax, info_eax
			
 
				-    mov        ecx, info_ecx
			
 
				-    mov        edi, cpu_info
			
 
				-    cpuid
			
 
				-    mov        [edi], eax
			
 
				-    mov        [edi + 4], ebx
			
 
				-    mov        [edi + 8], ecx
			
 
				-    mov        [edi + 12], edx
			
 
				-  }
			
 
				-#else
			
 
				-  if (info_ecx == 0) {
			
 
				-    __cpuid((int*)(cpu_info), info_eax);
			
 
				-  } else {
			
 
				-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
			
 
				-  }
			
 
				-#endif
			
 
				-#else  // defined(_MSC_VER)
			
 
				-  uint32 info_ebx, info_edx;
			
 
				-  asm volatile (  // NOLINT
			
 
				-#if defined( __i386__) && defined(__PIC__)
			
 
				-    // Preserve ebx for fpic 32 bit.
			
 
				-    "mov %%ebx, %%edi                          \n"
			
 
				-    "cpuid                                     \n"
			
 
				-    "xchg %%edi, %%ebx                         \n"
			
 
				-    : "=D" (info_ebx),
			
 
				-#else
			
 
				-    "cpuid                                     \n"
			
 
				-    : "=b" (info_ebx),
			
 
				-#endif  //  defined( __i386__) && defined(__PIC__)
			
 
				-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
			
 
				-  cpu_info[0] = info_eax;
			
 
				-  cpu_info[1] = info_ebx;
			
 
				-  cpu_info[2] = info_ecx;
			
 
				-  cpu_info[3] = info_edx;
			
 
				-#endif  // defined(_MSC_VER)
			
 
				-}
			
 
				-
			
 
				-#if !defined(__native_client__)
			
 
				-#define HAS_XGETBV
			
 
				-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
			
 
				-int TestOsSaveYmm() {
			
 
				-  uint32 xcr0 = 0u;
			
 
				-#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
			
 
				-  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
			
 
				-#elif defined(_M_IX86)
			
 
				-  __asm {
			
 
				-    xor        ecx, ecx    // xcr 0
			
 
				-    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
			
 
				-    mov        xcr0, eax
			
 
				-  }
			
 
				-#elif defined(__i386__) || defined(__x86_64__)
			
 
				-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
			
 
				-#endif  // defined(_MSC_VER)
			
 
				-  return((xcr0 & 6) == 6);  // Is ymm saved?
			
 
				-}
			
 
				-#endif  // !defined(__native_client__)
			
 
				-#else
			
 
				-LIBYUV_API
			
 
				-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
			
 
				-  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-// based on libvpx arm_cpudetect.c
			
 
				-// For Arm, but public to allow testing on any CPU
			
 
				-LIBYUV_API SAFEBUFFERS
			
 
				-int ArmCpuCaps(const char* cpuinfo_name) {
			
 
				-  char cpuinfo_line[512];
			
 
				-  FILE* f = fopen(cpuinfo_name, "r");
			
 
				-  if (!f) {
			
 
				-    // Assume Neon if /proc/cpuinfo is unavailable.
			
 
				-    // This will occur for Chrome sandbox for Pepper or Render process.
			
 
				-    return kCpuHasNEON;
			
 
				-  }
			
 
				-  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
			
 
				-    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
			
 
				-      char* p = strstr(cpuinfo_line, " neon");
			
 
				-      if (p && (p[5] == ' ' || p[5] == '\n')) {
			
 
				-        fclose(f);
			
 
				-        return kCpuHasNEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-  fclose(f);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#if defined(__mips__) && defined(__linux__)
			
 
				-static int MipsCpuCaps(const char* search_string) {
			
 
				-  char cpuinfo_line[512];
			
 
				-  const char* file_name = "/proc/cpuinfo";
			
 
				-  FILE* f = fopen(file_name, "r");
			
 
				-  if (!f) {
			
 
				-    // Assume DSP if /proc/cpuinfo is unavailable.
			
 
				-    // This will occur for Chrome sandbox for Pepper or Render process.
			
 
				-    return kCpuHasMIPS_DSP;
			
 
				-  }
			
 
				-  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
			
 
				-    if (strstr(cpuinfo_line, search_string) != NULL) {
			
 
				-      fclose(f);
			
 
				-      return kCpuHasMIPS_DSP;
			
 
				-    }
			
 
				-  }
			
 
				-  fclose(f);
			
 
				-  return 0;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-// CPU detect function for SIMD instruction sets.
			
 
				-LIBYUV_API
			
 
				-int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
			
 
				-
			
 
				-// Test environment variable for disabling CPU features. Any non-zero value
			
 
				-// to disable. Zero ignored to make it easy to set the variable on/off.
			
 
				-#if !defined(__native_client__) && !defined(_M_ARM)
			
 
				-
			
 
				-static LIBYUV_BOOL TestEnv(const char* name) {
			
 
				-#if !defined(_WINRT) && !defined(ORBIS_ENABLED)
			
 
				-  const char* var = getenv(name);
			
 
				-  if (var) {
			
 
				-    if (var[0] != '0') {
			
 
				-      return LIBYUV_TRUE;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  return LIBYUV_FALSE;
			
 
				-}
			
 
				-#else  // nacl does not support getenv().
			
 
				-static LIBYUV_BOOL TestEnv(const char*) {
			
 
				-  return LIBYUV_FALSE;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-LIBYUV_API SAFEBUFFERS
			
 
				-int InitCpuFlags(void) {
			
 
				-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
			
 
				-
			
 
				-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
			
 
				-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
			
 
				-  CpuId(1, 0, cpu_info1);
			
 
				-  CpuId(7, 0, cpu_info7);
			
 
				-  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
			
 
				-              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
			
 
				-              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
			
 
				-              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
			
 
				-              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
			
 
				-              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
			
 
				-              kCpuHasX86;
			
 
				-#ifdef HAS_XGETBV
			
 
				-  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
			
 
				-      TestOsSaveYmm()) {  // Saves YMM.
			
 
				-    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
			
 
				-                 kCpuHasAVX;
			
 
				-  }
			
 
				-#endif
			
 
				-  // Environment variable overrides for testing.
			
 
				-  if (TestEnv("LIBYUV_DISABLE_X86")) {
			
 
				-    cpu_info_ &= ~kCpuHasX86;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
			
 
				-    cpu_info_ &= ~kCpuHasSSE2;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
			
 
				-    cpu_info_ &= ~kCpuHasSSSE3;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
			
 
				-    cpu_info_ &= ~kCpuHasSSE41;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
			
 
				-    cpu_info_ &= ~kCpuHasSSE42;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_AVX")) {
			
 
				-    cpu_info_ &= ~kCpuHasAVX;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
			
 
				-    cpu_info_ &= ~kCpuHasAVX2;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
			
 
				-    cpu_info_ &= ~kCpuHasERMS;
			
 
				-  }
			
 
				-  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
			
 
				-    cpu_info_ &= ~kCpuHasFMA3;
			
 
				-  }
			
 
				-#elif defined(__mips__) && defined(__linux__)
			
 
				-  // Linux mips parse text file for dsp detect.
			
 
				-  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
			
 
				-#if defined(__mips_dspr2)
			
 
				-  cpu_info_ |= kCpuHasMIPS_DSPR2;
			
 
				-#endif
			
 
				-  cpu_info_ |= kCpuHasMIPS;
			
 
				-
			
 
				-  if (getenv("LIBYUV_DISABLE_MIPS")) {
			
 
				-    cpu_info_ &= ~kCpuHasMIPS;
			
 
				-  }
			
 
				-  if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
			
 
				-    cpu_info_ &= ~kCpuHasMIPS_DSP;
			
 
				-  }
			
 
				-  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
			
 
				-    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
			
 
				-  }
			
 
				-#elif defined(__arm__)
			
 
				-// gcc -mfpu=neon defines __ARM_NEON__
			
 
				-// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
			
 
				-// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
			
 
				-#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
			
 
				-#ifdef _ANDROID
			
 
				-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); // libtheoraplayer #ifdef addition, just in case, android gave us troubles
			
 
				-#else
			
 
				-  cpu_info_ = kCpuHasNEON;
			
 
				-#endif
			
 
				-#else
			
 
				-  // Linux arm parse text file for neon detect.
			
 
				-  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
			
 
				-#endif
			
 
				-  cpu_info_ |= kCpuHasARM;
			
 
				-  if (TestEnv("LIBYUV_DISABLE_NEON")) {
			
 
				-    cpu_info_ &= ~kCpuHasNEON;
			
 
				-  }
			
 
				-#ifdef _ANDROID
			
 
				-  // libtheoraplayer addition to disable NEON support on android devices that don't support it, once again, just in case	
			
 
				-  if ((android_getCpuFeaturesExt() & ANDROID_CPU_ARM_FEATURE_NEON) == 0)
			
 
				-  {
			
 
				- 	cpu_info_ = kCpuHasARM;
			
 
				-  }
			
 
				-#endif
			
 
				-#endif  // __arm__
			
 
				-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
			
 
				-    cpu_info_ = 0;
			
 
				-  }
			
 
				-  return cpu_info_;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void MaskCpuFlags(int enable_flags) {
			
 
				-  cpu_info_ = InitCpuFlags() & enable_flags;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/format_conversion.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/format_conversion.cc
@@ -1,552 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/format_conversion.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/video_common.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// generate a selector mask useful for pshufb
			
 
				-static uint32 GenerateSelector(int select0, int select1) {
			
 
				-  return (uint32)(select0) |
			
 
				-         (uint32)((select1 + 4) << 8) |
			
 
				-         (uint32)((select0 + 8) << 16) |
			
 
				-         (uint32)((select1 + 12) << 24);
			
 
				-}
			
 
				-
			
 
				-static int MakeSelectors(const int blue_index,
			
 
				-                         const int green_index,
			
 
				-                         const int red_index,
			
 
				-                         uint32 dst_fourcc_bayer,
			
 
				-                         uint32* index_map) {
			
 
				-  // Now build a lookup table containing the indices for the four pixels in each
			
 
				-  // 2x2 Bayer grid.
			
 
				-  switch (dst_fourcc_bayer) {
			
 
				-    case FOURCC_BGGR:
			
 
				-      index_map[0] = GenerateSelector(blue_index, green_index);
			
 
				-      index_map[1] = GenerateSelector(green_index, red_index);
			
 
				-      break;
			
 
				-    case FOURCC_GBRG:
			
 
				-      index_map[0] = GenerateSelector(green_index, blue_index);
			
 
				-      index_map[1] = GenerateSelector(red_index, green_index);
			
 
				-      break;
			
 
				-    case FOURCC_RGGB:
			
 
				-      index_map[0] = GenerateSelector(red_index, green_index);
			
 
				-      index_map[1] = GenerateSelector(green_index, blue_index);
			
 
				-      break;
			
 
				-    case FOURCC_GRBG:
			
 
				-      index_map[0] = GenerateSelector(green_index, red_index);
			
 
				-      index_map[1] = GenerateSelector(blue_index, green_index);
			
 
				-      break;
			
 
				-    default:
			
 
				-      return -1;  // Bad FourCC
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Converts 32 bit ARGB to Bayer RGB formats.
			
 
				-LIBYUV_API
			
 
				-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                int width, int height,
			
 
				-                uint32 dst_fourcc_bayer) {
			
 
				-  int y;
			
 
				-  const int blue_index = 0;  // Offsets for ARGB format
			
 
				-  const int green_index = 1;
			
 
				-  const int red_index = 2;
			
 
				-  uint32 index_map[2];
			
 
				-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                         uint32 selector, int pix) = ARGBToBayerRow_C;
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOBAYERROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  if (MakeSelectors(blue_index, green_index, red_index,
			
 
				-                    dst_fourcc_bayer, index_map)) {
			
 
				-    return -1;  // Bad FourCC
			
 
				-  }
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_bayer += dst_stride_bayer;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#define AVG(a, b) (((a) + (b)) >> 1)
			
 
				-
			
 
				-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
			
 
				-                       uint8* dst_argb, int pix) {
			
 
				-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
			
 
				-  uint8 g = src_bayer0[1];
			
 
				-  uint8 r = src_bayer1[1];
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 2; x += 2) {
			
 
				-    dst_argb[0] = src_bayer0[0];
			
 
				-    dst_argb[1] = AVG(g, src_bayer0[1]);
			
 
				-    dst_argb[2] = AVG(r, src_bayer1[1]);
			
 
				-    dst_argb[3] = 255U;
			
 
				-    dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
			
 
				-    dst_argb[5] = src_bayer0[1];
			
 
				-    dst_argb[6] = src_bayer1[1];
			
 
				-    dst_argb[7] = 255U;
			
 
				-    g = src_bayer0[1];
			
 
				-    r = src_bayer1[1];
			
 
				-    src_bayer0 += 2;
			
 
				-    src_bayer1 += 2;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-  dst_argb[0] = src_bayer0[0];
			
 
				-  dst_argb[1] = AVG(g, src_bayer0[1]);
			
 
				-  dst_argb[2] = AVG(r, src_bayer1[1]);
			
 
				-  dst_argb[3] = 255U;
			
 
				-  if (!(pix & 1)) {
			
 
				-    dst_argb[4] = src_bayer0[0];
			
 
				-    dst_argb[5] = src_bayer0[1];
			
 
				-    dst_argb[6] = src_bayer1[1];
			
 
				-    dst_argb[7] = 255U;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
			
 
				-                       uint8* dst_argb, int pix) {
			
 
				-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
			
 
				-  uint8 g = src_bayer0[1];
			
 
				-  uint8 b = src_bayer1[1];
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 2; x += 2) {
			
 
				-    dst_argb[0] = AVG(b, src_bayer1[1]);
			
 
				-    dst_argb[1] = AVG(g, src_bayer0[1]);
			
 
				-    dst_argb[2] = src_bayer0[0];
			
 
				-    dst_argb[3] = 255U;
			
 
				-    dst_argb[4] = src_bayer1[1];
			
 
				-    dst_argb[5] = src_bayer0[1];
			
 
				-    dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
			
 
				-    dst_argb[7] = 255U;
			
 
				-    g = src_bayer0[1];
			
 
				-    b = src_bayer1[1];
			
 
				-    src_bayer0 += 2;
			
 
				-    src_bayer1 += 2;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-  dst_argb[0] = AVG(b, src_bayer1[1]);
			
 
				-  dst_argb[1] = AVG(g, src_bayer0[1]);
			
 
				-  dst_argb[2] = src_bayer0[0];
			
 
				-  dst_argb[3] = 255U;
			
 
				-  if (!(pix & 1)) {
			
 
				-    dst_argb[4] = src_bayer1[1];
			
 
				-    dst_argb[5] = src_bayer0[1];
			
 
				-    dst_argb[6] = src_bayer0[0];
			
 
				-    dst_argb[7] = 255U;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
			
 
				-                       uint8* dst_argb, int pix) {
			
 
				-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
			
 
				-  uint8 b = src_bayer0[1];
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 2; x += 2) {
			
 
				-    dst_argb[0] = AVG(b, src_bayer0[1]);
			
 
				-    dst_argb[1] = src_bayer0[0];
			
 
				-    dst_argb[2] = src_bayer1[0];
			
 
				-    dst_argb[3] = 255U;
			
 
				-    dst_argb[4] = src_bayer0[1];
			
 
				-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
			
 
				-    dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
			
 
				-    dst_argb[7] = 255U;
			
 
				-    b = src_bayer0[1];
			
 
				-    src_bayer0 += 2;
			
 
				-    src_bayer1 += 2;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-  dst_argb[0] = AVG(b, src_bayer0[1]);
			
 
				-  dst_argb[1] = src_bayer0[0];
			
 
				-  dst_argb[2] = src_bayer1[0];
			
 
				-  dst_argb[3] = 255U;
			
 
				-  if (!(pix & 1)) {
			
 
				-    dst_argb[4] = src_bayer0[1];
			
 
				-    dst_argb[5] = src_bayer0[0];
			
 
				-    dst_argb[6] = src_bayer1[0];
			
 
				-    dst_argb[7] = 255U;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
			
 
				-                       uint8* dst_argb, int pix) {
			
 
				-  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
			
 
				-  uint8 r = src_bayer0[1];
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 2; x += 2) {
			
 
				-    dst_argb[0] = src_bayer1[0];
			
 
				-    dst_argb[1] = src_bayer0[0];
			
 
				-    dst_argb[2] = AVG(r, src_bayer0[1]);
			
 
				-    dst_argb[3] = 255U;
			
 
				-    dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
			
 
				-    dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
			
 
				-    dst_argb[6] = src_bayer0[1];
			
 
				-    dst_argb[7] = 255U;
			
 
				-    r = src_bayer0[1];
			
 
				-    src_bayer0 += 2;
			
 
				-    src_bayer1 += 2;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-  dst_argb[0] = src_bayer1[0];
			
 
				-  dst_argb[1] = src_bayer0[0];
			
 
				-  dst_argb[2] = AVG(r, src_bayer0[1]);
			
 
				-  dst_argb[3] = 255U;
			
 
				-  if (!(pix & 1)) {
			
 
				-    dst_argb[4] = src_bayer1[0];
			
 
				-    dst_argb[5] = src_bayer0[0];
			
 
				-    dst_argb[6] = src_bayer0[1];
			
 
				-    dst_argb[7] = 255U;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Converts any Bayer RGB format to ARGB.
			
 
				-LIBYUV_API
			
 
				-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height,
			
 
				-                uint32 src_fourcc_bayer) {
			
 
				-  int y;
			
 
				-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int pix);
			
 
				-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int pix);
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  switch (src_fourcc_bayer) {
			
 
				-    case FOURCC_BGGR:
			
 
				-      BayerRow0 = BayerRowBG;
			
 
				-      BayerRow1 = BayerRowGR;
			
 
				-      break;
			
 
				-    case FOURCC_GBRG:
			
 
				-      BayerRow0 = BayerRowGB;
			
 
				-      BayerRow1 = BayerRowRG;
			
 
				-      break;
			
 
				-    case FOURCC_GRBG:
			
 
				-      BayerRow0 = BayerRowGR;
			
 
				-      BayerRow1 = BayerRowBG;
			
 
				-      break;
			
 
				-    case FOURCC_RGGB:
			
 
				-      BayerRow0 = BayerRowRG;
			
 
				-      BayerRow1 = BayerRowGB;
			
 
				-      break;
			
 
				-    default:
			
 
				-      return -1;    // Bad FourCC
			
 
				-  }
			
 
				-
			
 
				-  for (y = 0; y < height - 1; y += 2) {
			
 
				-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
			
 
				-    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
			
 
				-              dst_argb + dst_stride_argb, width);
			
 
				-    src_bayer += src_stride_bayer * 2;
			
 
				-    dst_argb += dst_stride_argb * 2;
			
 
				-  }
			
 
				-  if (height & 1) {
			
 
				-    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Converts any Bayer RGB format to ARGB.
			
 
				-LIBYUV_API
			
 
				-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                uint8* dst_y, int dst_stride_y,
			
 
				-                uint8* dst_u, int dst_stride_u,
			
 
				-                uint8* dst_v, int dst_stride_v,
			
 
				-                int width, int height,
			
 
				-                uint32 src_fourcc_bayer) {
			
 
				-  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int pix);
			
 
				-  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
			
 
				-                    uint8* dst_argb, int pix);
			
 
				-
			
 
				-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
			
 
				-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
			
 
				-      ARGBToYRow_C;
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    int halfheight;
			
 
				-    height = -height;
			
 
				-    halfheight = (height + 1) >> 1;
			
 
				-    dst_y = dst_y + (height - 1) * dst_stride_y;
			
 
				-    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
			
 
				-    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
			
 
				-    dst_stride_y = -dst_stride_y;
			
 
				-    dst_stride_u = -dst_stride_u;
			
 
				-    dst_stride_v = -dst_stride_v;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
			
 
				-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
			
 
				-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
			
 
				-      ARGBToUVRow = ARGBToUVRow_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-        ARGBToYRow = ARGBToYRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToYRow = ARGBToYRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToYRow = ARGBToYRow_NEON;
			
 
				-    }
			
 
				-    if (width >= 16) {
			
 
				-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
			
 
				-      if (IS_ALIGNED(width, 16)) {
			
 
				-        ARGBToUVRow = ARGBToUVRow_NEON;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  switch (src_fourcc_bayer) {
			
 
				-    case FOURCC_BGGR:
			
 
				-      BayerRow0 = BayerRowBG;
			
 
				-      BayerRow1 = BayerRowGR;
			
 
				-      break;
			
 
				-    case FOURCC_GBRG:
			
 
				-      BayerRow0 = BayerRowGB;
			
 
				-      BayerRow1 = BayerRowRG;
			
 
				-      break;
			
 
				-    case FOURCC_GRBG:
			
 
				-      BayerRow0 = BayerRowGR;
			
 
				-      BayerRow1 = BayerRowBG;
			
 
				-      break;
			
 
				-    case FOURCC_RGGB:
			
 
				-      BayerRow0 = BayerRowRG;
			
 
				-      BayerRow1 = BayerRowGB;
			
 
				-      break;
			
 
				-    default:
			
 
				-      return -1;  // Bad FourCC
			
 
				-  }
			
 
				-
			
 
				-  {
			
 
				-    // Allocate 2 rows of ARGB.
			
 
				-    const int kRowSize = (width * 4 + 15) & ~15;
			
 
				-    align_buffer_64(row, kRowSize * 2);
			
 
				-    int y;
			
 
				-    for (y = 0; y < height - 1; y += 2) {
			
 
				-      BayerRow0(src_bayer, src_stride_bayer, row, width);
			
 
				-      BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
			
 
				-                row + kRowSize, width);
			
 
				-      ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
			
 
				-      ARGBToYRow(row, dst_y, width);
			
 
				-      ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
			
 
				-      src_bayer += src_stride_bayer * 2;
			
 
				-      dst_y += dst_stride_y * 2;
			
 
				-      dst_u += dst_stride_u;
			
 
				-      dst_v += dst_stride_v;
			
 
				-    }
			
 
				-    if (height & 1) {
			
 
				-      BayerRow0(src_bayer, src_stride_bayer, row, width);
			
 
				-      ARGBToUVRow(row, 0, dst_u, dst_v, width);
			
 
				-      ARGBToYRow(row, dst_y, width);
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to Bayer.
			
 
				-LIBYUV_API
			
 
				-int I420ToBayer(const uint8* src_y, int src_stride_y,
			
 
				-                const uint8* src_u, int src_stride_u,
			
 
				-                const uint8* src_v, int src_stride_v,
			
 
				-                uint8* dst_bayer, int dst_stride_bayer,
			
 
				-                int width, int height,
			
 
				-                uint32 dst_fourcc_bayer) {
			
 
				-  void (*I422ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToARGBRow_C;
			
 
				-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                         uint32 selector, int pix) = ARGBToBayerRow_C;
			
 
				-  const int blue_index = 0;  // Offsets for ARGB format
			
 
				-  const int green_index = 1;
			
 
				-  const int red_index = 2;
			
 
				-  uint32 index_map[2];
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    int halfheight;
			
 
				-    height = -height;
			
 
				-    halfheight = (height + 1) >> 1;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (halfheight - 1) * src_stride_u;
			
 
				-    src_v = src_v + (halfheight - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-#if defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBTOBAYERROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (MakeSelectors(blue_index, green_index, red_index,
			
 
				-                    dst_fourcc_bayer, index_map)) {
			
 
				-    return -1;  // Bad FourCC
			
 
				-  }
			
 
				-  {
			
 
				-    // Allocate a row of ARGB.
			
 
				-    align_buffer_64(row, width * 4);
			
 
				-    int y;
			
 
				-    for (y = 0; y < height; ++y) {
			
 
				-      I422ToARGBRow(src_y, src_u, src_v, row, width);
			
 
				-      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
			
 
				-      dst_bayer += dst_stride_bayer;
			
 
				-      src_y += src_stride_y;
			
 
				-      if (y & 1) {
			
 
				-        src_u += src_stride_u;
			
 
				-        src_v += src_stride_v;
			
 
				-      }
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#define MAKEBAYERFOURCC(BAYER)                                                 \
			
 
				-LIBYUV_API                                                                     \
			
 
				-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer,         \
			
 
				-                         uint8* dst_y, int dst_stride_y,                       \
			
 
				-                         uint8* dst_u, int dst_stride_u,                       \
			
 
				-                         uint8* dst_v, int dst_stride_v,                       \
			
 
				-                         int width, int height) {                              \
			
 
				-  return BayerToI420(src_bayer, src_stride_bayer,                              \
			
 
				-                     dst_y, dst_stride_y,                                      \
			
 
				-                     dst_u, dst_stride_u,                                      \
			
 
				-                     dst_v, dst_stride_v,                                      \
			
 
				-                     width, height,                                            \
			
 
				-                     FOURCC_##BAYER);                                          \
			
 
				-}                                                                              \
			
 
				-                                                                               \
			
 
				-LIBYUV_API                                                                     \
			
 
				-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y,                   \
			
 
				-                       const uint8* src_u, int src_stride_u,                   \
			
 
				-                       const uint8* src_v, int src_stride_v,                   \
			
 
				-                       uint8* dst_bayer, int dst_stride_bayer,                 \
			
 
				-                       int width, int height) {                                \
			
 
				-  return I420ToBayer(src_y, src_stride_y,                                      \
			
 
				-                     src_u, src_stride_u,                                      \
			
 
				-                     src_v, src_stride_v,                                      \
			
 
				-                     dst_bayer, dst_stride_bayer,                              \
			
 
				-                     width, height,                                            \
			
 
				-                     FOURCC_##BAYER);                                          \
			
 
				-}                                                                              \
			
 
				-                                                                               \
			
 
				-LIBYUV_API                                                                     \
			
 
				-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb,             \
			
 
				-                       uint8* dst_bayer, int dst_stride_bayer,                 \
			
 
				-                       int width, int height) {                                \
			
 
				-  return ARGBToBayer(src_argb, src_stride_argb,                                \
			
 
				-                     dst_bayer, dst_stride_bayer,                              \
			
 
				-                     width, height,                                            \
			
 
				-                     FOURCC_##BAYER);                                          \
			
 
				-}                                                                              \
			
 
				-                                                                               \
			
 
				-LIBYUV_API                                                                     \
			
 
				-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer,         \
			
 
				-                         uint8* dst_argb, int dst_stride_argb,                 \
			
 
				-                         int width, int height) {                              \
			
 
				-  return BayerToARGB(src_bayer, src_stride_bayer,                              \
			
 
				-                     dst_argb, dst_stride_argb,                                \
			
 
				-                     width, height,                                            \
			
 
				-                     FOURCC_##BAYER);                                          \
			
 
				-}
			
 
				-
			
 
				-MAKEBAYERFOURCC(BGGR)
			
 
				-MAKEBAYERFOURCC(GBRG)
			
 
				-MAKEBAYERFOURCC(GRBG)
			
 
				-MAKEBAYERFOURCC(RGGB)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/mjpeg_decoder.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/mjpeg_decoder.cc
@@ -1,558 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-
			
 
				-#ifdef HAVE_JPEG
			
 
				-#include <assert.h>
			
 
				-
			
 
				-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
			
 
				-    !defined(TARGET_IPHONE_SIMULATOR)
			
 
				-// Must be included before jpeglib.
			
 
				-#include <setjmp.h>
			
 
				-#define HAVE_SETJMP
			
 
				-#endif
			
 
				-struct FILE;  // For jpeglib.h.
			
 
				-
			
 
				-// C++ build requires extern C for jpeg internals.
			
 
				-#ifdef __cplusplus
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#include <jpeglib.h>
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-#endif
			
 
				-
			
 
				-#include "libyuv/planar_functions.h"  // For CopyPlane().
			
 
				-
			
 
				-namespace libyuv {
			
 
				-
			
 
				-#ifdef HAVE_SETJMP
			
 
				-struct SetJmpErrorMgr {
			
 
				-  jpeg_error_mgr base;  // Must be at the top
			
 
				-  jmp_buf setjmp_buffer;
			
 
				-};
			
 
				-#endif
			
 
				-
			
 
				-const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
			
 
				-const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
			
 
				-const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
			
 
				-const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
			
 
				-const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
			
 
				-const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
			
 
				-
			
 
				-MJpegDecoder::MJpegDecoder()
			
 
				-    : has_scanline_padding_(LIBYUV_FALSE),
			
 
				-      num_outbufs_(0),
			
 
				-      scanlines_(NULL),
			
 
				-      scanlines_sizes_(NULL),
			
 
				-      databuf_(NULL),
			
 
				-      databuf_strides_(NULL) {
			
 
				-  decompress_struct_ = new jpeg_decompress_struct;
			
 
				-  source_mgr_ = new jpeg_source_mgr;
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  error_mgr_ = new SetJmpErrorMgr;
			
 
				-  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
			
 
				-  // Override standard exit()-based error handler.
			
 
				-  error_mgr_->base.error_exit = &ErrorHandler;
			
 
				-#endif
			
 
				-  decompress_struct_->client_data = NULL;
			
 
				-  source_mgr_->init_source = &init_source;
			
 
				-  source_mgr_->fill_input_buffer = &fill_input_buffer;
			
 
				-  source_mgr_->skip_input_data = &skip_input_data;
			
 
				-  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
			
 
				-  source_mgr_->term_source = &term_source;
			
 
				-  jpeg_create_decompress(decompress_struct_);
			
 
				-  decompress_struct_->src = source_mgr_;
			
 
				-  buf_vec_.buffers = &buf_;
			
 
				-  buf_vec_.len = 1;
			
 
				-}
			
 
				-
			
 
				-MJpegDecoder::~MJpegDecoder() {
			
 
				-  jpeg_destroy_decompress(decompress_struct_);
			
 
				-  delete decompress_struct_;
			
 
				-  delete source_mgr_;
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  delete error_mgr_;
			
 
				-#endif
			
 
				-  DestroyOutputBuffers();
			
 
				-}
			
 
				-
			
 
				-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
			
 
				-  if (!ValidateJpeg(src, src_len)) {
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-
			
 
				-  buf_.data = src;
			
 
				-  buf_.len = (int)(src_len);
			
 
				-  buf_vec_.pos = 0;
			
 
				-  decompress_struct_->client_data = &buf_vec_;
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  if (setjmp(error_mgr_->setjmp_buffer)) {
			
 
				-    // We called jpeg_read_header, it experienced an error, and we called
			
 
				-    // longjmp() and rewound the stack to here. Return error.
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
			
 
				-    // ERROR: Bad MJPEG header
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  AllocOutputBuffers(GetNumComponents());
			
 
				-  for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
			
 
				-    if (scanlines_sizes_[i] != scanlines_size) {
			
 
				-      if (scanlines_[i]) {
			
 
				-        delete scanlines_[i];
			
 
				-      }
			
 
				-      scanlines_[i] = new uint8* [scanlines_size];
			
 
				-      scanlines_sizes_[i] = scanlines_size;
			
 
				-    }
			
 
				-
			
 
				-    // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
			
 
				-    // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
			
 
				-    // the preceding scanlines, the padding is not needed/wanted because the
			
 
				-    // following addresses will already be valid (they are the initial bytes of
			
 
				-    // the next scanline) and will be overwritten when jpeglib writes out that
			
 
				-    // next scanline.
			
 
				-    int databuf_stride = GetComponentStride(i);
			
 
				-    int databuf_size = scanlines_size * databuf_stride;
			
 
				-    if (databuf_strides_[i] != databuf_stride) {
			
 
				-      if (databuf_[i]) {
			
 
				-        delete databuf_[i];
			
 
				-      }
			
 
				-      databuf_[i] = new uint8[databuf_size];
			
 
				-      databuf_strides_[i] = databuf_stride;
			
 
				-    }
			
 
				-
			
 
				-    if (GetComponentStride(i) != GetComponentWidth(i)) {
			
 
				-      has_scanline_padding_ = LIBYUV_TRUE;
			
 
				-    }
			
 
				-  }
			
 
				-  return LIBYUV_TRUE;
			
 
				-}
			
 
				-
			
 
				-static int DivideAndRoundUp(int numerator, int denominator) {
			
 
				-  return (numerator + denominator - 1) / denominator;
			
 
				-}
			
 
				-
			
 
				-static int DivideAndRoundDown(int numerator, int denominator) {
			
 
				-  return numerator / denominator;
			
 
				-}
			
 
				-
			
 
				-// Returns width of the last loaded frame.
			
 
				-int MJpegDecoder::GetWidth() {
			
 
				-  return decompress_struct_->image_width;
			
 
				-}
			
 
				-
			
 
				-// Returns height of the last loaded frame.
			
 
				-int MJpegDecoder::GetHeight() {
			
 
				-  return decompress_struct_->image_height;
			
 
				-}
			
 
				-
			
 
				-// Returns format of the last loaded frame. The return value is one of the
			
 
				-// kColorSpace* constants.
			
 
				-int MJpegDecoder::GetColorSpace() {
			
 
				-  return decompress_struct_->jpeg_color_space;
			
 
				-}
			
 
				-
			
 
				-// Number of color components in the color space.
			
 
				-int MJpegDecoder::GetNumComponents() {
			
 
				-  return decompress_struct_->num_components;
			
 
				-}
			
 
				-
			
 
				-// Sample factors of the n-th component.
			
 
				-int MJpegDecoder::GetHorizSampFactor(int component) {
			
 
				-  return decompress_struct_->comp_info[component].h_samp_factor;
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetVertSampFactor(int component) {
			
 
				-  return decompress_struct_->comp_info[component].v_samp_factor;
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetHorizSubSampFactor(int component) {
			
 
				-  return decompress_struct_->max_h_samp_factor /
			
 
				-      GetHorizSampFactor(component);
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetVertSubSampFactor(int component) {
			
 
				-  return decompress_struct_->max_v_samp_factor /
			
 
				-      GetVertSampFactor(component);
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetImageScanlinesPerImcuRow() {
			
 
				-  return decompress_struct_->max_v_samp_factor * DCTSIZE;
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
			
 
				-  int vs = GetVertSubSampFactor(component);
			
 
				-  return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetComponentWidth(int component) {
			
 
				-  int hs = GetHorizSubSampFactor(component);
			
 
				-  return DivideAndRoundUp(GetWidth(), hs);
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetComponentHeight(int component) {
			
 
				-  int vs = GetVertSubSampFactor(component);
			
 
				-  return DivideAndRoundUp(GetHeight(), vs);
			
 
				-}
			
 
				-
			
 
				-// Get width in bytes padded out to a multiple of DCTSIZE
			
 
				-int MJpegDecoder::GetComponentStride(int component) {
			
 
				-  return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
			
 
				-}
			
 
				-
			
 
				-int MJpegDecoder::GetComponentSize(int component) {
			
 
				-  return GetComponentWidth(component) * GetComponentHeight(component);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  if (setjmp(error_mgr_->setjmp_buffer)) {
			
 
				-    // We called jpeg_abort_decompress, it experienced an error, and we called
			
 
				-    // longjmp() and rewound the stack to here. Return error.
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#endif
			
 
				-  jpeg_abort_decompress(decompress_struct_);
			
 
				-  return LIBYUV_TRUE;
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
			
 
				-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
			
 
				-    uint8** planes, int dst_width, int dst_height) {
			
 
				-  if (dst_width != GetWidth() ||
			
 
				-      dst_height > GetHeight()) {
			
 
				-    // ERROR: Bad dimensions
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  if (setjmp(error_mgr_->setjmp_buffer)) {
			
 
				-    // We called into jpeglib, it experienced an error sometime during this
			
 
				-    // function call, and we called longjmp() and rewound the stack to here.
			
 
				-    // Return error.
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (!StartDecode()) {
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  SetScanlinePointers(databuf_);
			
 
				-  int lines_left = dst_height;
			
 
				-  // Compute amount of lines to skip to implement vertical crop.
			
 
				-  // TODO(fbarchard): Ensure skip is a multiple of maximum component
			
 
				-  // subsample. ie 2
			
 
				-  int skip = (GetHeight() - dst_height) / 2;
			
 
				-  if (skip > 0) {
			
 
				-    // There is no API to skip lines in the output data, so we read them
			
 
				-    // into the temp buffer.
			
 
				-    while (skip >= GetImageScanlinesPerImcuRow()) {
			
 
				-      if (!DecodeImcuRow()) {
			
 
				-        FinishDecode();
			
 
				-        return LIBYUV_FALSE;
			
 
				-      }
			
 
				-      skip -= GetImageScanlinesPerImcuRow();
			
 
				-    }
			
 
				-    if (skip > 0) {
			
 
				-      // Have a partial iMCU row left over to skip. Must read it and then
			
 
				-      // copy the parts we want into the destination.
			
 
				-      if (!DecodeImcuRow()) {
			
 
				-        FinishDecode();
			
 
				-        return LIBYUV_FALSE;
			
 
				-      }
			
 
				-      for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-        // TODO(fbarchard): Compute skip to avoid this
			
 
				-        assert(skip % GetVertSubSampFactor(i) == 0);
			
 
				-        int rows_to_skip =
			
 
				-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
			
 
				-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
			
 
				-                                rows_to_skip;
			
 
				-        int data_to_skip = rows_to_skip * GetComponentStride(i);
			
 
				-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
			
 
				-                  planes[i], GetComponentWidth(i),
			
 
				-                  GetComponentWidth(i), scanlines_to_copy);
			
 
				-        planes[i] += scanlines_to_copy * GetComponentWidth(i);
			
 
				-      }
			
 
				-      lines_left -= (GetImageScanlinesPerImcuRow() - skip);
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  // Read full MCUs but cropped horizontally
			
 
				-  for (; lines_left > GetImageScanlinesPerImcuRow();
			
 
				-         lines_left -= GetImageScanlinesPerImcuRow()) {
			
 
				-    if (!DecodeImcuRow()) {
			
 
				-      FinishDecode();
			
 
				-      return LIBYUV_FALSE;
			
 
				-    }
			
 
				-    for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-      int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
			
 
				-      CopyPlane(databuf_[i], GetComponentStride(i),
			
 
				-                planes[i], GetComponentWidth(i),
			
 
				-                GetComponentWidth(i), scanlines_to_copy);
			
 
				-      planes[i] += scanlines_to_copy * GetComponentWidth(i);
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  if (lines_left > 0) {
			
 
				-    // Have a partial iMCU row left over to decode.
			
 
				-    if (!DecodeImcuRow()) {
			
 
				-      FinishDecode();
			
 
				-      return LIBYUV_FALSE;
			
 
				-    }
			
 
				-    for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-      int scanlines_to_copy =
			
 
				-          DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
			
 
				-      CopyPlane(databuf_[i], GetComponentStride(i),
			
 
				-                planes[i], GetComponentWidth(i),
			
 
				-                GetComponentWidth(i), scanlines_to_copy);
			
 
				-      planes[i] += scanlines_to_copy * GetComponentWidth(i);
			
 
				-    }
			
 
				-  }
			
 
				-  return FinishDecode();
			
 
				-}
			
 
				-
			
 
				-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
			
 
				-    int dst_width, int dst_height) {
			
 
				-  if (dst_width != GetWidth() ||
			
 
				-      dst_height > GetHeight()) {
			
 
				-    // ERROR: Bad dimensions
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#ifdef HAVE_SETJMP
			
 
				-  if (setjmp(error_mgr_->setjmp_buffer)) {
			
 
				-    // We called into jpeglib, it experienced an error sometime during this
			
 
				-    // function call, and we called longjmp() and rewound the stack to here.
			
 
				-    // Return error.
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (!StartDecode()) {
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  SetScanlinePointers(databuf_);
			
 
				-  int lines_left = dst_height;
			
 
				-  // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
			
 
				-  int skip = (GetHeight() - dst_height) / 2;
			
 
				-  if (skip > 0) {
			
 
				-    while (skip >= GetImageScanlinesPerImcuRow()) {
			
 
				-      if (!DecodeImcuRow()) {
			
 
				-        FinishDecode();
			
 
				-        return LIBYUV_FALSE;
			
 
				-      }
			
 
				-      skip -= GetImageScanlinesPerImcuRow();
			
 
				-    }
			
 
				-    if (skip > 0) {
			
 
				-      // Have a partial iMCU row left over to skip.
			
 
				-      if (!DecodeImcuRow()) {
			
 
				-        FinishDecode();
			
 
				-        return LIBYUV_FALSE;
			
 
				-      }
			
 
				-      for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-        // TODO(fbarchard): Compute skip to avoid this
			
 
				-        assert(skip % GetVertSubSampFactor(i) == 0);
			
 
				-        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
			
 
				-        int data_to_skip = rows_to_skip * GetComponentStride(i);
			
 
				-        // Change our own data buffer pointers so we can pass them to the
			
 
				-        // callback.
			
 
				-        databuf_[i] += data_to_skip;
			
 
				-      }
			
 
				-      int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
			
 
				-      (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
			
 
				-      // Now change them back.
			
 
				-      for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
			
 
				-        int data_to_skip = rows_to_skip * GetComponentStride(i);
			
 
				-        databuf_[i] -= data_to_skip;
			
 
				-      }
			
 
				-      lines_left -= scanlines_to_copy;
			
 
				-    }
			
 
				-  }
			
 
				-  // Read full MCUs until we get to the crop point.
			
 
				-  for (; lines_left >= GetImageScanlinesPerImcuRow();
			
 
				-         lines_left -= GetImageScanlinesPerImcuRow()) {
			
 
				-    if (!DecodeImcuRow()) {
			
 
				-      FinishDecode();
			
 
				-      return LIBYUV_FALSE;
			
 
				-    }
			
 
				-    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
			
 
				-  }
			
 
				-  if (lines_left > 0) {
			
 
				-    // Have a partial iMCU row left over to decode.
			
 
				-    if (!DecodeImcuRow()) {
			
 
				-      FinishDecode();
			
 
				-      return LIBYUV_FALSE;
			
 
				-    }
			
 
				-    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
			
 
				-  }
			
 
				-  return FinishDecode();
			
 
				-}
			
 
				-
			
 
				-void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
			
 
				-  fill_input_buffer(cinfo);
			
 
				-}
			
 
				-
			
 
				-boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
			
 
				-  BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
			
 
				-  if (buf_vec->pos >= buf_vec->len) {
			
 
				-    assert(0 && "No more data");
			
 
				-    // ERROR: No more data
			
 
				-    return FALSE;
			
 
				-  }
			
 
				-  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
			
 
				-  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
			
 
				-  ++buf_vec->pos;
			
 
				-  return TRUE;
			
 
				-}
			
 
				-
			
 
				-void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
			
 
				-                                   long num_bytes) {  // NOLINT
			
 
				-  cinfo->src->next_input_byte += num_bytes;
			
 
				-}
			
 
				-
			
 
				-void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
			
 
				-  // Nothing to do.
			
 
				-}
			
 
				-
			
 
				-#ifdef HAVE_SETJMP
			
 
				-void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
			
 
				-  // This is called when a jpeglib command experiences an error. Unfortunately
			
 
				-  // jpeglib's error handling model is not very flexible, because it expects the
			
 
				-  // error handler to not return--i.e., it wants the program to terminate. To
			
 
				-  // recover from errors we use setjmp() as shown in their example. setjmp() is
			
 
				-  // C's implementation for the "call with current continuation" functionality
			
 
				-  // seen in some functional programming languages.
			
 
				-  // A formatted message can be output, but is unsafe for release.
			
 
				-#ifdef DEBUG
			
 
				-  char buf[JMSG_LENGTH_MAX];
			
 
				-  (*cinfo->err->format_message)(cinfo, buf);
			
 
				-  // ERROR: Error in jpeglib: buf
			
 
				-#endif
			
 
				-
			
 
				-  SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
			
 
				-  // This rewinds the call stack to the point of the corresponding setjmp()
			
 
				-  // and causes it to return (for a second time) with value 1.
			
 
				-  longjmp(mgr->setjmp_buffer, 1);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
			
 
				-  if (num_outbufs != num_outbufs_) {
			
 
				-    // We could perhaps optimize this case to resize the output buffers without
			
 
				-    // necessarily having to delete and recreate each one, but it's not worth
			
 
				-    // it.
			
 
				-    DestroyOutputBuffers();
			
 
				-
			
 
				-    scanlines_ = new uint8** [num_outbufs];
			
 
				-    scanlines_sizes_ = new int[num_outbufs];
			
 
				-    databuf_ = new uint8* [num_outbufs];
			
 
				-    databuf_strides_ = new int[num_outbufs];
			
 
				-
			
 
				-    for (int i = 0; i < num_outbufs; ++i) {
			
 
				-      scanlines_[i] = NULL;
			
 
				-      scanlines_sizes_[i] = 0;
			
 
				-      databuf_[i] = NULL;
			
 
				-      databuf_strides_[i] = 0;
			
 
				-    }
			
 
				-
			
 
				-    num_outbufs_ = num_outbufs;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void MJpegDecoder::DestroyOutputBuffers() {
			
 
				-  for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-    delete [] scanlines_[i];
			
 
				-    delete [] databuf_[i];
			
 
				-  }
			
 
				-  delete [] scanlines_;
			
 
				-  delete [] databuf_;
			
 
				-  delete [] scanlines_sizes_;
			
 
				-  delete [] databuf_strides_;
			
 
				-  scanlines_ = NULL;
			
 
				-  databuf_ = NULL;
			
 
				-  scanlines_sizes_ = NULL;
			
 
				-  databuf_strides_ = NULL;
			
 
				-  num_outbufs_ = 0;
			
 
				-}
			
 
				-
			
 
				-// JDCT_IFAST and do_block_smoothing improve performance substantially.
			
 
				-LIBYUV_BOOL MJpegDecoder::StartDecode() {
			
 
				-  decompress_struct_->raw_data_out = TRUE;
			
 
				-  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
			
 
				-  decompress_struct_->dither_mode = JDITHER_NONE;
			
 
				-  // Not applicable to 'raw':
			
 
				-  decompress_struct_->do_fancy_upsampling = LIBYUV_FALSE;
			
 
				-  // Only for buffered mode:
			
 
				-  decompress_struct_->enable_2pass_quant = LIBYUV_FALSE;
			
 
				-  // Blocky but fast:
			
 
				-  decompress_struct_->do_block_smoothing = LIBYUV_FALSE;
			
 
				-
			
 
				-  if (!jpeg_start_decompress(decompress_struct_)) {
			
 
				-    // ERROR: Couldn't start JPEG decompressor";
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  return LIBYUV_TRUE;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_BOOL MJpegDecoder::FinishDecode() {
			
 
				-  // jpeglib considers it an error if we finish without decoding the whole
			
 
				-  // image, so we call "abort" rather than "finish".
			
 
				-  jpeg_abort_decompress(decompress_struct_);
			
 
				-  return LIBYUV_TRUE;
			
 
				-}
			
 
				-
			
 
				-void MJpegDecoder::SetScanlinePointers(uint8** data) {
			
 
				-  for (int i = 0; i < num_outbufs_; ++i) {
			
 
				-    uint8* data_i = data[i];
			
 
				-    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
			
 
				-      scanlines_[i][j] = data_i;
			
 
				-      data_i += GetComponentStride(i);
			
 
				-    }
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
			
 
				-  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
			
 
				-      jpeg_read_raw_data(decompress_struct_,
			
 
				-                         scanlines_,
			
 
				-                         GetImageScanlinesPerImcuRow());
			
 
				-}
			
 
				-
			
 
				-// The helper function which recognizes the jpeg sub-sampling type.
			
 
				-JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
			
 
				-    int* subsample_x, int* subsample_y, int number_of_components) {
			
 
				-  if (number_of_components == 3) {  // Color images.
			
 
				-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
			
 
				-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
			
 
				-        subsample_x[2] == 2 && subsample_y[2] == 2) {
			
 
				-      return kJpegYuv420;
			
 
				-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
			
 
				-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
			
 
				-        subsample_x[2] == 2 && subsample_y[2] == 1) {
			
 
				-      return kJpegYuv422;
			
 
				-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
			
 
				-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
			
 
				-        subsample_x[2] == 1 && subsample_y[2] == 1) {
			
 
				-      return kJpegYuv444;
			
 
				-    }
			
 
				-  } else if (number_of_components == 1) {  // Grey-scale images.
			
 
				-    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
			
 
				-      return kJpegYuv400;
			
 
				-    }
			
 
				-  }
			
 
				-  return kJpegUnknown;
			
 
				-}
			
 
				-
			
 
				-}  // namespace libyuv
			
 
				-#endif  // HAVE_JPEG
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/mjpeg_validate.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/mjpeg_validate.cc
@@ -1,47 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Helper function to validate the jpeg appears intact.
			
 
				-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
			
 
				-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
			
 
				-  size_t i;
			
 
				-  if (sample_size < 64) {
			
 
				-    // ERROR: Invalid jpeg size: sample_size
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
			
 
				-    // ERROR: Invalid jpeg initial start code
			
 
				-    return LIBYUV_FALSE;
			
 
				-  }
			
 
				-  for (i = sample_size - 2; i > 1;) {
			
 
				-    if (sample[i] != 0xd9) {
			
 
				-      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
			
 
				-        return LIBYUV_TRUE;  // Success: Valid jpeg.
			
 
				-      }
			
 
				-      --i;
			
 
				-    }
			
 
				-    --i;
			
 
				-  }
			
 
				-  // ERROR: Invalid jpeg end code not found. Size sample_size
			
 
				-  return LIBYUV_FALSE;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/planar_functions.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/planar_functions.cc
@@ -1,2238 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-
			
 
				-#include <string.h>  // for memset()
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#ifdef HAVE_JPEG
			
 
				-#include "libyuv/mjpeg_decoder.h"
			
 
				-#endif
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Copy a plane of data
			
 
				-LIBYUV_API
			
 
				-void CopyPlane(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      dst_stride_y == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = dst_stride_y = 0;
			
 
				-  }
			
 
				-#if defined(HAS_COPYROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
			
 
				-    CopyRow = CopyRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
			
 
				-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
			
 
				-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-    CopyRow = CopyRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_ERMS)
			
 
				-  if (TestCpuFlag(kCpuHasERMS)) {
			
 
				-    CopyRow = CopyRow_ERMS;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
			
 
				-    CopyRow = CopyRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_MIPS)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS)) {
			
 
				-    CopyRow = CopyRow_MIPS;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Copy plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    CopyRow(src_y, dst_y, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    dst_y += dst_stride_y;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy I422.
			
 
				-LIBYUV_API
			
 
				-int I422Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height) {
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (height - 1) * src_stride_u;
			
 
				-    src_v = src_v + (height - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
			
 
				-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Copy I444.
			
 
				-LIBYUV_API
			
 
				-int I444Copy(const uint8* src_y, int src_stride_y,
			
 
				-             const uint8* src_u, int src_stride_u,
			
 
				-             const uint8* src_v, int src_stride_v,
			
 
				-             uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int width, int height) {
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (height - 1) * src_stride_u;
			
 
				-    src_v = src_v + (height - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
			
 
				-  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Copy I400.
			
 
				-LIBYUV_API
			
 
				-int I400ToI400(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !dst_y || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I420 to I400.
			
 
				-LIBYUV_API
			
 
				-int I420ToI400(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !dst_y || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Mirror a plane of data.
			
 
				-void MirrorPlane(const uint8* src_y, int src_stride_y,
			
 
				-                 uint8* dst_y, int dst_stride_y,
			
 
				-                 int width, int height) {
			
 
				-  int y;
			
 
				-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-#if defined(HAS_MIRRORROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
			
 
				-    MirrorRow = MirrorRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
			
 
				-    MirrorRow = MirrorRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
			
 
				-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-    MirrorRow = MirrorRow_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
			
 
				-    MirrorRow = MirrorRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Mirror plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    MirrorRow(src_y, dst_y, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    dst_y += dst_stride_y;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Convert YUY2 to I422.
			
 
				-LIBYUV_API
			
 
				-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) =
			
 
				-      YUY2ToUV422Row_C;
			
 
				-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
			
 
				-      YUY2ToYRow_C;
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
			
 
				-    src_stride_yuy2 = -src_stride_yuy2;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_yuy2 == width * 2 &&
			
 
				-      dst_stride_y == width &&
			
 
				-      dst_stride_u * 2 == width &&
			
 
				-      dst_stride_v * 2 == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-#if defined(HAS_YUY2TOYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
			
 
				-        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          YUY2ToYRow = YUY2ToYRow_SSE2;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
			
 
				-      YUY2ToYRow = YUY2ToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_YUY2TOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    YUY2ToYRow = YUY2ToYRow_Any_NEON;
			
 
				-    if (width >= 16) {
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
			
 
				-    }
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      YUY2ToYRow = YUY2ToYRow_NEON;
			
 
				-      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
			
 
				-    YUY2ToYRow(src_yuy2, dst_y, width);
			
 
				-    src_yuy2 += src_stride_yuy2;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert UYVY to I422.
			
 
				-LIBYUV_API
			
 
				-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) =
			
 
				-      UYVYToUV422Row_C;
			
 
				-  void (*UYVYToYRow)(const uint8* src_uyvy,
			
 
				-                     uint8* dst_y, int pix) = UYVYToYRow_C;
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
			
 
				-    src_stride_uyvy = -src_stride_uyvy;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_uyvy == width * 2 &&
			
 
				-      dst_stride_y == width &&
			
 
				-      dst_stride_u * 2 == width &&
			
 
				-      dst_stride_v * 2 == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
			
 
				-  }
			
 
				-#if defined(HAS_UYVYTOYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
			
 
				-    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
			
 
				-    UYVYToYRow = UYVYToYRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
			
 
				-      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
			
 
				-        UYVYToUV422Row = UYVYToUV422Row_SSE2;
			
 
				-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-          UYVYToYRow = UYVYToYRow_SSE2;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_UYVYTOYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
			
 
				-    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
			
 
				-    UYVYToYRow = UYVYToYRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 32)) {
			
 
				-      UYVYToUV422Row = UYVYToUV422Row_AVX2;
			
 
				-      UYVYToYRow = UYVYToYRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_UYVYTOYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    UYVYToYRow = UYVYToYRow_Any_NEON;
			
 
				-    if (width >= 16) {
			
 
				-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
			
 
				-    }
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      UYVYToYRow = UYVYToYRow_NEON;
			
 
				-      UYVYToUV422Row = UYVYToUV422Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
			
 
				-    UYVYToYRow(src_uyvy, dst_y, width);
			
 
				-    src_uyvy += src_stride_uyvy;
			
 
				-    dst_y += dst_stride_y;
			
 
				-    dst_u += dst_stride_u;
			
 
				-    dst_v += dst_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Mirror I400 with optional flipping
			
 
				-LIBYUV_API
			
 
				-int I400Mirror(const uint8* src_y, int src_stride_y,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               int width, int height) {
			
 
				-  if (!src_y || !dst_y ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-
			
 
				-  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Mirror I420 with optional flipping
			
 
				-LIBYUV_API
			
 
				-int I420Mirror(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height) {
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    halfheight = (height + 1) >> 1;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (halfheight - 1) * src_stride_u;
			
 
				-    src_v = src_v + (halfheight - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-
			
 
				-  if (dst_y) {
			
 
				-    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
			
 
				-  }
			
 
				-  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
			
 
				-  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// ARGB mirror.
			
 
				-LIBYUV_API
			
 
				-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
			
 
				-      ARGBMirrorRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_ARGBMIRRORROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMIRRORROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMIRRORROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Mirror plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBMirrorRow(src_argb, dst_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Get a blender that optimized for the CPU, alignment and pixel count.
			
 
				-// As there are 6 blenders to choose from, the caller should try to use
			
 
				-// the same blend function for all pixels if possible.
			
 
				-LIBYUV_API
			
 
				-ARGBBlendRow GetARGBBlend() {
			
 
				-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) = ARGBBlendRow_C;
			
 
				-#if defined(HAS_ARGBBLENDROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3)) {
			
 
				-    ARGBBlendRow = ARGBBlendRow_SSSE3;
			
 
				-    return ARGBBlendRow;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBBLENDROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    ARGBBlendRow = ARGBBlendRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBBLENDROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    ARGBBlendRow = ARGBBlendRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  return ARGBBlendRow;
			
 
				-}
			
 
				-
			
 
				-// Alpha Blend 2 ARGB images and store to destination.
			
 
				-LIBYUV_API
			
 
				-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
			
 
				-              const uint8* src_argb1, int src_stride_argb1,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) = GetARGBBlend();
			
 
				-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb0 == width * 4 &&
			
 
				-      src_stride_argb1 == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
			
 
				-    src_argb0 += src_stride_argb0;
			
 
				-    src_argb1 += src_stride_argb1;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Multiply 2 ARGB images and store to destination.
			
 
				-LIBYUV_API
			
 
				-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                 const uint8* src_argb1, int src_stride_argb1,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
			
 
				-                          int width) = ARGBMultiplyRow_C;
			
 
				-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb0 == width * 4 &&
			
 
				-      src_stride_argb1 == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBMULTIPLYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMULTIPLYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBMultiplyRow = ARGBMultiplyRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Multiply plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
			
 
				-    src_argb0 += src_stride_argb0;
			
 
				-    src_argb1 += src_stride_argb1;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Add 2 ARGB images and store to destination.
			
 
				-LIBYUV_API
			
 
				-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
			
 
				-            const uint8* src_argb1, int src_stride_argb1,
			
 
				-            uint8* dst_argb, int dst_stride_argb,
			
 
				-            int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
			
 
				-                     int width) = ARGBAddRow_C;
			
 
				-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb0 == width * 4 &&
			
 
				-      src_stride_argb1 == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    ARGBAddRow = ARGBAddRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    ARGBAddRow = ARGBAddRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBAddRow = ARGBAddRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBADDROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    ARGBAddRow = ARGBAddRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBAddRow = ARGBAddRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBADDROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBAddRow = ARGBAddRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBAddRow = ARGBAddRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Add plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
			
 
				-    src_argb0 += src_stride_argb0;
			
 
				-    src_argb1 += src_stride_argb1;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Subtract 2 ARGB images and store to destination.
			
 
				-LIBYUV_API
			
 
				-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                 const uint8* src_argb1, int src_stride_argb1,
			
 
				-                 uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
			
 
				-                          int width) = ARGBSubtractRow_C;
			
 
				-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb0 == width * 4 &&
			
 
				-      src_stride_argb1 == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBSUBTRACTROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBSubtractRow = ARGBSubtractRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBSubtractRow = ARGBSubtractRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBSUBTRACTROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBSubtractRow = ARGBSubtractRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Subtract plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
			
 
				-    src_argb0 += src_stride_argb0;
			
 
				-    src_argb1 += src_stride_argb1;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I422 to BGRA.
			
 
				-LIBYUV_API
			
 
				-int I422ToBGRA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_bgra, int dst_stride_bgra,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*I422ToBGRARow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToBGRARow_C;
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_bgra ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
			
 
				-    dst_stride_bgra = -dst_stride_bgra;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_bgra == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
			
 
				-  }
			
 
				-#if defined(HAS_I422TOBGRAROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToBGRARow = I422ToBGRARow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOBGRAROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
			
 
				-        I422ToBGRARow = I422ToBGRARow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
			
 
				-      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
			
 
				-    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
			
 
				-    dst_bgra += dst_stride_bgra;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I422 to ABGR.
			
 
				-LIBYUV_API
			
 
				-int I422ToABGR(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_abgr, int dst_stride_abgr,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*I422ToABGRRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToABGRRow_C;
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_abgr ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
			
 
				-    dst_stride_abgr = -dst_stride_abgr;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_abgr == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
			
 
				-  }
			
 
				-#if defined(HAS_I422TOABGRROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    I422ToABGRRow = I422ToABGRRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToABGRRow = I422ToABGRRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TOABGRROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
			
 
				-        I422ToABGRRow = I422ToABGRRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
			
 
				-    dst_abgr += dst_stride_abgr;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert I422 to RGBA.
			
 
				-LIBYUV_API
			
 
				-int I422ToRGBA(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_rgba, int dst_stride_rgba,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*I422ToRGBARow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToRGBARow_C;
			
 
				-  if (!src_y || !src_u || !src_v ||
			
 
				-      !dst_rgba ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
			
 
				-    dst_stride_rgba = -dst_stride_rgba;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      src_stride_u * 2 == width &&
			
 
				-      src_stride_v * 2 == width &&
			
 
				-      dst_stride_rgba == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
			
 
				-  }
			
 
				-#if defined(HAS_I422TORGBAROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    I422ToRGBARow = I422ToRGBARow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      I422ToRGBARow = I422ToRGBARow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_I422TORGBAROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
			
 
				-        I422ToRGBARow = I422ToRGBARow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
			
 
				-    dst_rgba += dst_stride_rgba;
			
 
				-    src_y += src_stride_y;
			
 
				-    src_u += src_stride_u;
			
 
				-    src_v += src_stride_v;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert NV12 to RGB565.
			
 
				-LIBYUV_API
			
 
				-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_uv, int src_stride_uv,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height) {
			
 
				-  int y;
			
 
				-  void (*NV12ToRGB565Row)(const uint8* y_buf,
			
 
				-                          const uint8* uv_buf,
			
 
				-                          uint8* rgb_buf,
			
 
				-                          int width) = NV12ToRGB565Row_C;
			
 
				-  if (!src_y || !src_uv || !dst_rgb565 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
			
 
				-    dst_stride_rgb565 = -dst_stride_rgb565;
			
 
				-  }
			
 
				-#if defined(HAS_NV12TORGB565ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_NV12TORGB565ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
			
 
				-    dst_rgb565 += dst_stride_rgb565;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_uv += src_stride_uv;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert NV21 to RGB565.
			
 
				-LIBYUV_API
			
 
				-int NV21ToRGB565(const uint8* src_y, int src_stride_y,
			
 
				-                 const uint8* src_vu, int src_stride_vu,
			
 
				-                 uint8* dst_rgb565, int dst_stride_rgb565,
			
 
				-                 int width, int height) {
			
 
				-  int y;
			
 
				-  void (*NV21ToRGB565Row)(const uint8* y_buf,
			
 
				-                          const uint8* src_vu,
			
 
				-                          uint8* rgb_buf,
			
 
				-                          int width) = NV21ToRGB565Row_C;
			
 
				-  if (!src_y || !src_vu || !dst_rgb565 ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
			
 
				-    dst_stride_rgb565 = -dst_stride_rgb565;
			
 
				-  }
			
 
				-#if defined(HAS_NV21TORGB565ROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_NV21TORGB565ROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      NV21ToRGB565Row = NV21ToRGB565Row_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
			
 
				-    dst_rgb565 += dst_stride_rgb565;
			
 
				-    src_y += src_stride_y;
			
 
				-    if (y & 1) {
			
 
				-      src_vu += src_stride_vu;
			
 
				-    }
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void SetPlane(uint8* dst_y, int dst_stride_y,
			
 
				-              int width, int height,
			
 
				-              uint32 value) {
			
 
				-  int y;
			
 
				-  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
			
 
				-  void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_y == width) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_y = 0;
			
 
				-  }
			
 
				-#if defined(HAS_SETROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) &&
			
 
				-      IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-    SetRow = SetRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SETROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
			
 
				-    SetRow = SetRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Set plane
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    SetRow(dst_y, v32, width);
			
 
				-    dst_y += dst_stride_y;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Draw a rectangle into I420
			
 
				-LIBYUV_API
			
 
				-int I420Rect(uint8* dst_y, int dst_stride_y,
			
 
				-             uint8* dst_u, int dst_stride_u,
			
 
				-             uint8* dst_v, int dst_stride_v,
			
 
				-             int x, int y,
			
 
				-             int width, int height,
			
 
				-             int value_y, int value_u, int value_v) {
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  uint8* start_y = dst_y + y * dst_stride_y + x;
			
 
				-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
			
 
				-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
			
 
				-  if (!dst_y || !dst_u || !dst_v ||
			
 
				-      width <= 0 || height <= 0 ||
			
 
				-      x < 0 || y < 0 ||
			
 
				-      value_y < 0 || value_y > 255 ||
			
 
				-      value_u < 0 || value_u > 255 ||
			
 
				-      value_v < 0 || value_v > 255) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  SetPlane(start_y, dst_stride_y, width, height, value_y);
			
 
				-  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
			
 
				-  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Draw a rectangle into ARGB
			
 
				-LIBYUV_API
			
 
				-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
			
 
				-             int dst_x, int dst_y,
			
 
				-             int width, int height,
			
 
				-             uint32 value) {
			
 
				-  if (!dst_argb ||
			
 
				-      width <= 0 || height <= 0 ||
			
 
				-      dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_SETROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
			
 
				-    return 0;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SETROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86)) {
			
 
				-    ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
			
 
				-    return 0;
			
 
				-  }
			
 
				-#endif
			
 
				-  ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert unattentuated ARGB to preattenuated ARGB.
			
 
				-// An unattenutated ARGB alpha blend uses the formula
			
 
				-// p = a * f + (1 - a) * b
			
 
				-// where
			
 
				-//   p is output pixel
			
 
				-//   f is foreground pixel
			
 
				-//   b is background pixel
			
 
				-//   a is alpha value from foreground pixel
			
 
				-// An preattenutated ARGB alpha blend uses the formula
			
 
				-// p = f + (1 - a) * b
			
 
				-// where
			
 
				-//   f is foreground pixel premultiplied by alpha
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-                           int width) = ARGBAttenuateRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBATTENUATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBATTENUATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
			
 
				-    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBATTENUATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBATTENUATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBAttenuateRow(src_argb, dst_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert preattentuated ARGB to unattenuated ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             int width) = ARGBUnattenuateRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-// TODO(fbarchard): Neon version.
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBUnattenuateRow(src_argb, dst_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Convert ARGB to Grayed ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-                      int width) = ARGBGrayRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBGRAYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBGrayRow = ARGBGrayRow_SSSE3;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBGRAYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBGrayRow = ARGBGrayRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBGrayRow(src_argb, dst_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Make a rectangle of ARGB gray scale.
			
 
				-LIBYUV_API
			
 
				-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
			
 
				-             int dst_x, int dst_y,
			
 
				-             int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-                      int width) = ARGBGrayRow_C;
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBGRAYROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBGrayRow = ARGBGrayRow_SSSE3;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBGRAYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBGrayRow = ARGBGrayRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBGrayRow(dst, dst, width);
			
 
				-    dst += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Make a rectangle of ARGB Sepia tone.
			
 
				-LIBYUV_API
			
 
				-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
			
 
				-              int dst_x, int dst_y, int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBSEPIAROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBSEPIAROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBSepiaRow = ARGBSepiaRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBSepiaRow(dst, width);
			
 
				-    dst += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Apply a 4x4 matrix to each ARGB pixel.
			
 
				-// Note: Normally for shading, but can be used to swizzle or invert.
			
 
				-LIBYUV_API
			
 
				-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    const int8* matrix_argb,
			
 
				-                    int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
			
 
				-  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Apply a 4x3 matrix to each ARGB pixel.
			
 
				-// Deprecated.
			
 
				-LIBYUV_API
			
 
				-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const int8* matrix_rgb,
			
 
				-                   int dst_x, int dst_y, int width, int height) {
			
 
				-  SIMD_ALIGNED(int8 matrix_argb[16]);
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
			
 
				-      dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
			
 
				-  matrix_argb[0] = matrix_rgb[0] / 2;
			
 
				-  matrix_argb[1] = matrix_rgb[1] / 2;
			
 
				-  matrix_argb[2] = matrix_rgb[2] / 2;
			
 
				-  matrix_argb[3] = matrix_rgb[3] / 2;
			
 
				-  matrix_argb[4] = matrix_rgb[4] / 2;
			
 
				-  matrix_argb[5] = matrix_rgb[5] / 2;
			
 
				-  matrix_argb[6] = matrix_rgb[6] / 2;
			
 
				-  matrix_argb[7] = matrix_rgb[7] / 2;
			
 
				-  matrix_argb[8] = matrix_rgb[8] / 2;
			
 
				-  matrix_argb[9] = matrix_rgb[9] / 2;
			
 
				-  matrix_argb[10] = matrix_rgb[10] / 2;
			
 
				-  matrix_argb[11] = matrix_rgb[11] / 2;
			
 
				-  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
			
 
				-  matrix_argb[15] = 64;  // 1.0
			
 
				-
			
 
				-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
			
 
				-                         dst, dst_stride_argb,
			
 
				-                         &matrix_argb[0], width, height);
			
 
				-}
			
 
				-
			
 
				-// Apply a color table each ARGB pixel.
			
 
				-// Table contains 256 ARGB values.
			
 
				-LIBYUV_API
			
 
				-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const uint8* table_argb,
			
 
				-                   int dst_x, int dst_y, int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
			
 
				-                            int width) = ARGBColorTableRow_C;
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
			
 
				-      dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBCOLORTABLEROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86)) {
			
 
				-    ARGBColorTableRow = ARGBColorTableRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBColorTableRow(dst, table_argb, width);
			
 
				-    dst += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Apply a color table each ARGB pixel but preserve destination alpha.
			
 
				-// Table contains 256 ARGB values.
			
 
				-LIBYUV_API
			
 
				-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
			
 
				-                  const uint8* table_argb,
			
 
				-                  int dst_x, int dst_y, int width, int height) {
			
 
				-  int y;
			
 
				-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
			
 
				-                           int width) = RGBColorTableRow_C;
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
			
 
				-      dst_x < 0 || dst_y < 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_RGBCOLORTABLEROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86)) {
			
 
				-    RGBColorTableRow = RGBColorTableRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    RGBColorTableRow(dst, table_argb, width);
			
 
				-    dst += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// ARGBQuantize is used to posterize art.
			
 
				-// e.g. rgb / qvalue * qvalue + qvalue / 2
			
 
				-// But the low levels implement efficiently with 3 parameters, and could be
			
 
				-// used for other high level operations.
			
 
				-// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
			
 
				-// where scale is 1 / interval_size as a fixed point value.
			
 
				-// The divide is replaces with a multiply by reciprocal fixed point multiply.
			
 
				-// Caveat - although SSE2 saturates, the C function does not and should be used
			
 
				-// with care if doing anything but quantization.
			
 
				-LIBYUV_API
			
 
				-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
			
 
				-                 int scale, int interval_size, int interval_offset,
			
 
				-                 int dst_x, int dst_y, int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width) = ARGBQuantizeRow_C;
			
 
				-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
			
 
				-  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
			
 
				-      interval_size < 1 || interval_size > 255) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBQUANTIZEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBQUANTIZEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
			
 
				-    dst += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Computes table of cumulative sum for image where the value is the sum
			
 
				-// of all values above and to the left of the entry. Used by ARGBBlur.
			
 
				-LIBYUV_API
			
 
				-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
			
 
				-                             int32* dst_cumsum, int dst_stride32_cumsum,
			
 
				-                             int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
			
 
				-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
			
 
				-  int32* previous_cumsum = dst_cumsum;
			
 
				-  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
			
 
				-    previous_cumsum = dst_cumsum;
			
 
				-    dst_cumsum += dst_stride32_cumsum;
			
 
				-    src_argb += src_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Blur ARGB image.
			
 
				-// Caller should allocate CumulativeSum table of width * height * 16 bytes
			
 
				-// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
			
 
				-// as the buffer is treated as circular.
			
 
				-LIBYUV_API
			
 
				-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
			
 
				-             uint8* dst_argb, int dst_stride_argb,
			
 
				-             int32* dst_cumsum, int dst_stride32_cumsum,
			
 
				-             int width, int height, int radius) {
			
 
				-  int y;
			
 
				-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
			
 
				-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
			
 
				-  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
			
 
				-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
			
 
				-  int32* cumsum_bot_row;
			
 
				-  int32* max_cumsum_bot_row;
			
 
				-  int32* cumsum_top_row;
			
 
				-
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  if (radius > height) {
			
 
				-    radius = height;
			
 
				-  }
			
 
				-  if (radius > (width / 2 - 1)) {
			
 
				-    radius = width / 2 - 1;
			
 
				-  }
			
 
				-  if (radius <= 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
			
 
				-    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-  // Compute enough CumulativeSum for first row to be blurred. After this
			
 
				-  // one row of CumulativeSum is updated at a time.
			
 
				-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
			
 
				-                           dst_cumsum, dst_stride32_cumsum,
			
 
				-                           width, radius);
			
 
				-
			
 
				-  src_argb = src_argb + radius * src_stride_argb;
			
 
				-  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
			
 
				-
			
 
				-  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
			
 
				-  cumsum_top_row = &dst_cumsum[0];
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
			
 
				-    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
			
 
				-    int area = radius * (bot_y - top_y);
			
 
				-    int boxwidth = radius * 4;
			
 
				-    int x;
			
 
				-    int n;
			
 
				-
			
 
				-    // Increment cumsum_top_row pointer with circular buffer wrap around.
			
 
				-    if (top_y) {
			
 
				-      cumsum_top_row += dst_stride32_cumsum;
			
 
				-      if (cumsum_top_row >= max_cumsum_bot_row) {
			
 
				-        cumsum_top_row = dst_cumsum;
			
 
				-      }
			
 
				-    }
			
 
				-    // Increment cumsum_bot_row pointer with circular buffer wrap around and
			
 
				-    // then fill in a row of CumulativeSum.
			
 
				-    if ((y + radius) < height) {
			
 
				-      const int32* prev_cumsum_bot_row = cumsum_bot_row;
			
 
				-      cumsum_bot_row += dst_stride32_cumsum;
			
 
				-      if (cumsum_bot_row >= max_cumsum_bot_row) {
			
 
				-        cumsum_bot_row = dst_cumsum;
			
 
				-      }
			
 
				-      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
			
 
				-                              width);
			
 
				-      src_argb += src_stride_argb;
			
 
				-    }
			
 
				-
			
 
				-    // Left clipped.
			
 
				-    for (x = 0; x < radius + 1; ++x) {
			
 
				-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
			
 
				-                                boxwidth, area, &dst_argb[x * 4], 1);
			
 
				-      area += (bot_y - top_y);
			
 
				-      boxwidth += 4;
			
 
				-    }
			
 
				-
			
 
				-    // Middle unclipped.
			
 
				-    n = (width - 1) - radius - x + 1;
			
 
				-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
			
 
				-                              boxwidth, area, &dst_argb[x * 4], n);
			
 
				-
			
 
				-    // Right clipped.
			
 
				-    for (x += n; x <= width - 1; ++x) {
			
 
				-      area -= (bot_y - top_y);
			
 
				-      boxwidth -= 4;
			
 
				-      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
			
 
				-                                cumsum_bot_row + (x - radius - 1) * 4,
			
 
				-                                boxwidth, area, &dst_argb[x * 4], 1);
			
 
				-    }
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Multiply ARGB image by a specified ARGB value.
			
 
				-LIBYUV_API
			
 
				-int ARGBShade(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height, uint32 value) {
			
 
				-  int y;
			
 
				-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-                       int width, uint32 value) = ARGBShadeRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBSHADEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    ARGBShadeRow = ARGBShadeRow_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_ARGBSHADEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBShadeRow = ARGBShadeRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBShadeRow(src_argb, dst_argb, width, value);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Interpolate 2 ARGB images by specified amount (0 to 255).
			
 
				-LIBYUV_API
			
 
				-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
			
 
				-                    const uint8* src_argb1, int src_stride_argb1,
			
 
				-                    uint8* dst_argb, int dst_stride_argb,
			
 
				-                    int width, int height, int interpolation) {
			
 
				-  int y;
			
 
				-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride, int dst_width,
			
 
				-                         int source_y_fraction) = InterpolateRow_C;
			
 
				-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
			
 
				-    dst_stride_argb = -dst_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb0 == width * 4 &&
			
 
				-      src_stride_argb1 == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
			
 
				-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
			
 
				-          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
			
 
				-      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
			
 
				-      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
			
 
				-    ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
			
 
				-                   width * 4, interpolation);
			
 
				-    src_argb0 += src_stride_argb0;
			
 
				-    src_argb1 += src_stride_argb1;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
			
 
				-LIBYUV_API
			
 
				-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                const uint8* shuffler, int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
			
 
				-  if (!src_bgra || !dst_argb ||
			
 
				-      width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
			
 
				-    src_stride_bgra = -src_stride_bgra;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_bgra == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_bgra = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
			
 
				-    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBShuffleRow = ARGBShuffleRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
			
 
				-    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        ARGBShuffleRow = ARGBShuffleRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
			
 
				-    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(width, 16)) {
			
 
				-      ARGBShuffleRow = ARGBShuffleRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBSHUFFLEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
			
 
				-    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 4)) {
			
 
				-      ARGBShuffleRow = ARGBShuffleRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
			
 
				-    src_bgra += src_stride_bgra;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Sobel ARGB effect.
			
 
				-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
			
 
				-                        uint8* dst_argb, int dst_stride_argb,
			
 
				-                        int width, int height,
			
 
				-                        void (*SobelRow)(const uint8* src_sobelx,
			
 
				-                                         const uint8* src_sobely,
			
 
				-                                         uint8* dst, int width)) {
			
 
				-  int y;
			
 
				-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                         uint32 selector, int pix) = ARGBToBayerGGRow_C;
			
 
				-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width) = SobelYRow_C;
			
 
				-  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobely, int width) =
			
 
				-      SobelXRow_C;
			
 
				-  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
			
 
				-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // ARGBToBayer used to select G channel from ARGB.
			
 
				-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
			
 
				-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
			
 
				-    ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(width, 8)) {
			
 
				-      ARGBToBayerRow = ARGBToBayerGGRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    SobelYRow = SobelYRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    SobelYRow = SobelYRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELXROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2)) {
			
 
				-    SobelXRow = SobelXRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELXROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    SobelXRow = SobelXRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  {
			
 
				-    // 3 rows with edges before/after.
			
 
				-    const int kRowSize = (width + kEdge + 15) & ~15;
			
 
				-    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
			
 
				-    uint8* row_sobelx = rows;
			
 
				-    uint8* row_sobely = rows + kRowSize;
			
 
				-    uint8* row_y = rows + kRowSize * 2;
			
 
				-
			
 
				-    // Convert first row.
			
 
				-    uint8* row_y0 = row_y + kEdge;
			
 
				-    uint8* row_y1 = row_y0 + kRowSize;
			
 
				-    uint8* row_y2 = row_y1 + kRowSize;
			
 
				-    ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
			
 
				-    row_y0[-1] = row_y0[0];
			
 
				-    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
			
 
				-    ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
			
 
				-    row_y1[-1] = row_y1[0];
			
 
				-    memset(row_y1 + width, row_y1[width - 1], 16);
			
 
				-    memset(row_y2 + width, 0, 16);
			
 
				-
			
 
				-    for (y = 0; y < height; ++y) {
			
 
				-      // Convert next row of ARGB to Y.
			
 
				-      if (y < (height - 1)) {
			
 
				-        src_argb += src_stride_argb;
			
 
				-      }
			
 
				-      ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
			
 
				-      row_y2[-1] = row_y2[0];
			
 
				-      row_y2[width] = row_y2[width - 1];
			
 
				-
			
 
				-      SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
			
 
				-      SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
			
 
				-      SobelRow(row_sobelx, row_sobely, dst_argb, width);
			
 
				-
			
 
				-      // Cycle thru circular queue of 3 row_y buffers.
			
 
				-      {
			
 
				-        uint8* row_yt = row_y0;
			
 
				-        row_y0 = row_y1;
			
 
				-        row_y1 = row_y2;
			
 
				-        row_y2 = row_yt;
			
 
				-      }
			
 
				-
			
 
				-      dst_argb += dst_stride_argb;
			
 
				-    }
			
 
				-    free_aligned_buffer_64(rows);
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Sobel ARGB effect.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int width, int height) {
			
 
				-  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                   uint8* dst_argb, int width) = SobelRow_C;
			
 
				-#if defined(HAS_SOBELROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    SobelRow = SobelRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    SobelRow = SobelRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
			
 
				-                      width, height, SobelRow);
			
 
				-}
			
 
				-
			
 
				-// Sobel ARGB effect with planar output.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
			
 
				-                     uint8* dst_y, int dst_stride_y,
			
 
				-                     int width, int height) {
			
 
				-  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_, int width) = SobelToPlaneRow_C;
			
 
				-#if defined(HAS_SOBELTOPLANEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
			
 
				-    SobelToPlaneRow = SobelToPlaneRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELTOPLANEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
			
 
				-    SobelToPlaneRow = SobelToPlaneRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
			
 
				-                      width, height, SobelToPlaneRow);
			
 
				-}
			
 
				-
			
 
				-// SobelXY ARGB effect.
			
 
				-// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
			
 
				-LIBYUV_API
			
 
				-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
			
 
				-                uint8* dst_argb, int dst_stride_argb,
			
 
				-                int width, int height) {
			
 
				-  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width) = SobelXYRow_C;
			
 
				-#if defined(HAS_SOBELXYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-    SobelXYRow = SobelXYRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SOBELXYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    SobelXYRow = SobelXYRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
			
 
				-                      width, height, SobelXYRow);
			
 
				-}
			
 
				-
			
 
				-// Apply a 4x4 polynomial to each ARGB pixel.
			
 
				-LIBYUV_API
			
 
				-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
			
 
				-                   uint8* dst_argb, int dst_stride_argb,
			
 
				-                   const float* poly,
			
 
				-                   int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBPolynomialRow)(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width) = ARGBPolynomialRow_C;
			
 
				-  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
			
 
				-    ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
			
 
				-      IS_ALIGNED(width, 2)) {
			
 
				-    ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBPolynomialRow(src_argb, dst_argb, poly, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Apply a lumacolortable to each ARGB pixel.
			
 
				-LIBYUV_API
			
 
				-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
			
 
				-                       uint8* dst_argb, int dst_stride_argb,
			
 
				-                       const uint8* luma,
			
 
				-                       int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
			
 
				-      int width, const uint8* luma, const uint32 lumacoeff) =
			
 
				-      ARGBLumaColorTableRow_C;
			
 
				-  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
			
 
				-    ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Copy Alpha from one ARGB image to another.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
			
 
				-      ARGBCopyAlphaRow_C;
			
 
				-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_argb == width * 4 &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_argb = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(width, 8)) {
			
 
				-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
			
 
				-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBCopyAlphaRow(src_argb, dst_argb, width);
			
 
				-    src_argb += src_stride_argb;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Copy a planar Y channel to the alpha channel of a destination ARGB image.
			
 
				-LIBYUV_API
			
 
				-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
			
 
				-                     uint8* dst_argb, int dst_stride_argb,
			
 
				-                     int width, int height) {
			
 
				-  int y;
			
 
				-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
			
 
				-      ARGBCopyYToAlphaRow_C;
			
 
				-  if (!src_y || !dst_argb || width <= 0 || height == 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-  }
			
 
				-  // Coalesce rows.
			
 
				-  if (src_stride_y == width &&
			
 
				-      dst_stride_argb == width * 4) {
			
 
				-    width *= height;
			
 
				-    height = 1;
			
 
				-    src_stride_y = dst_stride_argb = 0;
			
 
				-  }
			
 
				-#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
			
 
				-      IS_ALIGNED(width, 8)) {
			
 
				-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
			
 
				-    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    ARGBCopyYToAlphaRow(src_y, dst_argb, width);
			
 
				-    src_y += src_stride_y;
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-  }
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/rotate.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/rotate.cc
@@ -1,1301 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/rotate.h"
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/convert.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
			
 
				-#if defined(__APPLE__) && defined(__i386__)
			
 
				-#define DECLARE_FUNCTION(name)                                                 \
			
 
				-    ".text                                     \n"                             \
			
 
				-    ".private_extern _" #name "                \n"                             \
			
 
				-    ".align 4,0x90                             \n"                             \
			
 
				-"_" #name ":                                   \n"
			
 
				-#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
			
 
				-#define DECLARE_FUNCTION(name)                                                 \
			
 
				-    ".text                                     \n"                             \
			
 
				-    ".align 4,0x90                             \n"                             \
			
 
				-"_" #name ":                                   \n"
			
 
				-#else
			
 
				-#define DECLARE_FUNCTION(name)                                                 \
			
 
				-    ".text                                     \n"                             \
			
 
				-    ".align 4,0x90                             \n"                             \
			
 
				-#name ":                                       \n"
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-#define HAS_MIRRORROW_NEON
			
 
				-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
			
 
				-#define HAS_MIRRORROW_UV_NEON
			
 
				-void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
			
 
				-#define HAS_TRANSPOSE_WX8_NEON
			
 
				-void TransposeWx8_NEON(const uint8* src, int src_stride,
			
 
				-                       uint8* dst, int dst_stride, int width);
			
 
				-#define HAS_TRANSPOSE_UVWX8_NEON
			
 
				-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
			
 
				-                         uint8* dst_a, int dst_stride_a,
			
 
				-                         uint8* dst_b, int dst_stride_b,
			
 
				-                         int width);
			
 
				-#endif  // defined(__ARM_NEON__)
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
			
 
				-    defined(__mips__) && \
			
 
				-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
			
 
				-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                             uint8* dst, int dst_stride, int width);
			
 
				-
			
 
				-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                                  uint8* dst, int dst_stride, int width);
			
 
				-#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
			
 
				-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                               uint8* dst_a, int dst_stride_a,
			
 
				-                               uint8* dst_b, int dst_stride_b,
			
 
				-                               int width);
			
 
				-#endif  // defined(__mips__)
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    defined(_M_IX86) && defined(_MSC_VER)
			
 
				-#define HAS_TRANSPOSE_WX8_SSSE3
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
			
 
				-                               uint8* dst, int dst_stride, int width) {
			
 
				-  __asm {
			
 
				-    push      edi
			
 
				-    push      esi
			
 
				-    push      ebp
			
 
				-    mov       eax, [esp + 12 + 4]   // src
			
 
				-    mov       edi, [esp + 12 + 8]   // src_stride
			
 
				-    mov       edx, [esp + 12 + 12]  // dst
			
 
				-    mov       esi, [esp + 12 + 16]  // dst_stride
			
 
				-    mov       ecx, [esp + 12 + 20]  // width
			
 
				-
			
 
				-    // Read in the data from the source pointer.
			
 
				-    // First round of bit swap.
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movq      xmm0, qword ptr [eax]
			
 
				-    lea       ebp, [eax + 8]
			
 
				-    movq      xmm1, qword ptr [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    punpcklbw xmm0, xmm1
			
 
				-    movq      xmm2, qword ptr [eax]
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    palignr   xmm1, xmm1, 8
			
 
				-    movq      xmm3, qword ptr [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    punpcklbw xmm2, xmm3
			
 
				-    movdqa    xmm3, xmm2
			
 
				-    movq      xmm4, qword ptr [eax]
			
 
				-    palignr   xmm3, xmm3, 8
			
 
				-    movq      xmm5, qword ptr [eax + edi]
			
 
				-    punpcklbw xmm4, xmm5
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    movdqa    xmm5, xmm4
			
 
				-    movq      xmm6, qword ptr [eax]
			
 
				-    palignr   xmm5, xmm5, 8
			
 
				-    movq      xmm7, qword ptr [eax + edi]
			
 
				-    punpcklbw xmm6, xmm7
			
 
				-    mov       eax, ebp
			
 
				-    movdqa    xmm7, xmm6
			
 
				-    palignr   xmm7, xmm7, 8
			
 
				-    // Second round of bit swap.
			
 
				-    punpcklwd xmm0, xmm2
			
 
				-    punpcklwd xmm1, xmm3
			
 
				-    movdqa    xmm2, xmm0
			
 
				-    movdqa    xmm3, xmm1
			
 
				-    palignr   xmm2, xmm2, 8
			
 
				-    palignr   xmm3, xmm3, 8
			
 
				-    punpcklwd xmm4, xmm6
			
 
				-    punpcklwd xmm5, xmm7
			
 
				-    movdqa    xmm6, xmm4
			
 
				-    movdqa    xmm7, xmm5
			
 
				-    palignr   xmm6, xmm6, 8
			
 
				-    palignr   xmm7, xmm7, 8
			
 
				-    // Third round of bit swap.
			
 
				-    // Write to the destination pointer.
			
 
				-    punpckldq xmm0, xmm4
			
 
				-    movq      qword ptr [edx], xmm0
			
 
				-    movdqa    xmm4, xmm0
			
 
				-    palignr   xmm4, xmm4, 8
			
 
				-    movq      qword ptr [edx + esi], xmm4
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    punpckldq xmm2, xmm6
			
 
				-    movdqa    xmm6, xmm2
			
 
				-    palignr   xmm6, xmm6, 8
			
 
				-    movq      qword ptr [edx], xmm2
			
 
				-    punpckldq xmm1, xmm5
			
 
				-    movq      qword ptr [edx + esi], xmm6
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movdqa    xmm5, xmm1
			
 
				-    movq      qword ptr [edx], xmm1
			
 
				-    palignr   xmm5, xmm5, 8
			
 
				-    punpckldq xmm3, xmm7
			
 
				-    movq      qword ptr [edx + esi], xmm5
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movq      qword ptr [edx], xmm3
			
 
				-    movdqa    xmm7, xmm3
			
 
				-    palignr   xmm7, xmm7, 8
			
 
				-    sub       ecx, 8
			
 
				-    movq      qword ptr [edx + esi], xmm7
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    jg        convertloop
			
 
				-
			
 
				-    pop       ebp
			
 
				-    pop       esi
			
 
				-    pop       edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#define HAS_TRANSPOSE_UVWX8_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
			
 
				-                                uint8* dst_a, int dst_stride_a,
			
 
				-                                uint8* dst_b, int dst_stride_b,
			
 
				-                                int w) {
			
 
				-  __asm {
			
 
				-    push      ebx
			
 
				-    push      esi
			
 
				-    push      edi
			
 
				-    push      ebp
			
 
				-    mov       eax, [esp + 16 + 4]   // src
			
 
				-    mov       edi, [esp + 16 + 8]   // src_stride
			
 
				-    mov       edx, [esp + 16 + 12]  // dst_a
			
 
				-    mov       esi, [esp + 16 + 16]  // dst_stride_a
			
 
				-    mov       ebx, [esp + 16 + 20]  // dst_b
			
 
				-    mov       ebp, [esp + 16 + 24]  // dst_stride_b
			
 
				-    mov       ecx, esp
			
 
				-    sub       esp, 4 + 16
			
 
				-    and       esp, ~15
			
 
				-    mov       [esp + 16], ecx
			
 
				-    mov       ecx, [ecx + 16 + 28]  // w
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    // Read in the data from the source pointer.
			
 
				-    // First round of bit swap.
			
 
				-    movdqa    xmm0, [eax]
			
 
				-    movdqa    xmm1, [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    movdqa    xmm7, xmm0  // use xmm7 as temp register.
			
 
				-    punpcklbw xmm0, xmm1
			
 
				-    punpckhbw xmm7, xmm1
			
 
				-    movdqa    xmm1, xmm7
			
 
				-    movdqa    xmm2, [eax]
			
 
				-    movdqa    xmm3, [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    movdqa    xmm7, xmm2
			
 
				-    punpcklbw xmm2, xmm3
			
 
				-    punpckhbw xmm7, xmm3
			
 
				-    movdqa    xmm3, xmm7
			
 
				-    movdqa    xmm4, [eax]
			
 
				-    movdqa    xmm5, [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    movdqa    xmm7, xmm4
			
 
				-    punpcklbw xmm4, xmm5
			
 
				-    punpckhbw xmm7, xmm5
			
 
				-    movdqa    xmm5, xmm7
			
 
				-    movdqa    xmm6, [eax]
			
 
				-    movdqa    xmm7, [eax + edi]
			
 
				-    lea       eax, [eax + 2 * edi]
			
 
				-    movdqa    [esp], xmm5  // backup xmm5
			
 
				-    neg       edi
			
 
				-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
			
 
				-    punpcklbw xmm6, xmm7
			
 
				-    punpckhbw xmm5, xmm7
			
 
				-    movdqa    xmm7, xmm5
			
 
				-    lea       eax, [eax + 8 * edi + 16]
			
 
				-    neg       edi
			
 
				-    // Second round of bit swap.
			
 
				-    movdqa    xmm5, xmm0
			
 
				-    punpcklwd xmm0, xmm2
			
 
				-    punpckhwd xmm5, xmm2
			
 
				-    movdqa    xmm2, xmm5
			
 
				-    movdqa    xmm5, xmm1
			
 
				-    punpcklwd xmm1, xmm3
			
 
				-    punpckhwd xmm5, xmm3
			
 
				-    movdqa    xmm3, xmm5
			
 
				-    movdqa    xmm5, xmm4
			
 
				-    punpcklwd xmm4, xmm6
			
 
				-    punpckhwd xmm5, xmm6
			
 
				-    movdqa    xmm6, xmm5
			
 
				-    movdqa    xmm5, [esp]  // restore xmm5
			
 
				-    movdqa    [esp], xmm6  // backup xmm6
			
 
				-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
			
 
				-    punpcklwd xmm5, xmm7
			
 
				-    punpckhwd xmm6, xmm7
			
 
				-    movdqa    xmm7, xmm6
			
 
				-    // Third round of bit swap.
			
 
				-    // Write to the destination pointer.
			
 
				-    movdqa    xmm6, xmm0
			
 
				-    punpckldq xmm0, xmm4
			
 
				-    punpckhdq xmm6, xmm4
			
 
				-    movdqa    xmm4, xmm6
			
 
				-    movdqa    xmm6, [esp]  // restore xmm6
			
 
				-    movlpd    qword ptr [edx], xmm0
			
 
				-    movhpd    qword ptr [ebx], xmm0
			
 
				-    movlpd    qword ptr [edx + esi], xmm4
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movhpd    qword ptr [ebx + ebp], xmm4
			
 
				-    lea       ebx, [ebx + 2 * ebp]
			
 
				-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
			
 
				-    punpckldq xmm2, xmm6
			
 
				-    movlpd    qword ptr [edx], xmm2
			
 
				-    movhpd    qword ptr [ebx], xmm2
			
 
				-    punpckhdq xmm0, xmm6
			
 
				-    movlpd    qword ptr [edx + esi], xmm0
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movhpd    qword ptr [ebx + ebp], xmm0
			
 
				-    lea       ebx, [ebx + 2 * ebp]
			
 
				-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
			
 
				-    punpckldq xmm1, xmm5
			
 
				-    movlpd    qword ptr [edx], xmm1
			
 
				-    movhpd    qword ptr [ebx], xmm1
			
 
				-    punpckhdq xmm0, xmm5
			
 
				-    movlpd    qword ptr [edx + esi], xmm0
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movhpd    qword ptr [ebx + ebp], xmm0
			
 
				-    lea       ebx, [ebx + 2 * ebp]
			
 
				-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
			
 
				-    punpckldq xmm3, xmm7
			
 
				-    movlpd    qword ptr [edx], xmm3
			
 
				-    movhpd    qword ptr [ebx], xmm3
			
 
				-    punpckhdq xmm0, xmm7
			
 
				-    sub       ecx, 8
			
 
				-    movlpd    qword ptr [edx + esi], xmm0
			
 
				-    lea       edx, [edx + 2 * esi]
			
 
				-    movhpd    qword ptr [ebx + ebp], xmm0
			
 
				-    lea       ebx, [ebx + 2 * ebp]
			
 
				-    jg        convertloop
			
 
				-
			
 
				-    mov       esp, [esp + 16]
			
 
				-    pop       ebp
			
 
				-    pop       edi
			
 
				-    pop       esi
			
 
				-    pop       ebx
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#elif !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
			
 
				-#define HAS_TRANSPOSE_WX8_SSSE3
			
 
				-static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
			
 
				-                               uint8* dst, int dst_stride, int width) {
			
 
				-  asm volatile (
			
 
				-    // Read in the data from the source pointer.
			
 
				-    // First round of bit swap.
			
 
				-    ".p2align  2                                 \n"
			
 
				-  "1:                                            \n"
			
 
				-    "movq       (%0),%%xmm0                      \n"
			
 
				-    "movq       (%0,%3),%%xmm1                   \n"
			
 
				-    "lea        (%0,%3,2),%0                     \n"
			
 
				-    "punpcklbw  %%xmm1,%%xmm0                    \n"
			
 
				-    "movq       (%0),%%xmm2                      \n"
			
 
				-    "movdqa     %%xmm0,%%xmm1                    \n"
			
 
				-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
			
 
				-    "movq       (%0,%3),%%xmm3                   \n"
			
 
				-    "lea        (%0,%3,2),%0                     \n"
			
 
				-    "punpcklbw  %%xmm3,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm2,%%xmm3                    \n"
			
 
				-    "movq       (%0),%%xmm4                      \n"
			
 
				-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
			
 
				-    "movq       (%0,%3),%%xmm5                   \n"
			
 
				-    "lea        (%0,%3,2),%0                     \n"
			
 
				-    "punpcklbw  %%xmm5,%%xmm4                    \n"
			
 
				-    "movdqa     %%xmm4,%%xmm5                    \n"
			
 
				-    "movq       (%0),%%xmm6                      \n"
			
 
				-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
			
 
				-    "movq       (%0,%3),%%xmm7                   \n"
			
 
				-    "lea        (%0,%3,2),%0                     \n"
			
 
				-    "punpcklbw  %%xmm7,%%xmm6                    \n"
			
 
				-    "neg        %3                               \n"
			
 
				-    "movdqa     %%xmm6,%%xmm7                    \n"
			
 
				-    "lea        0x8(%0,%3,8),%0                  \n"
			
 
				-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-    "neg        %3                               \n"
			
 
				-     // Second round of bit swap.
			
 
				-    "punpcklwd  %%xmm2,%%xmm0                    \n"
			
 
				-    "punpcklwd  %%xmm3,%%xmm1                    \n"
			
 
				-    "movdqa     %%xmm0,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm1,%%xmm3                    \n"
			
 
				-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
			
 
				-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
			
 
				-    "punpcklwd  %%xmm6,%%xmm4                    \n"
			
 
				-    "punpcklwd  %%xmm7,%%xmm5                    \n"
			
 
				-    "movdqa     %%xmm4,%%xmm6                    \n"
			
 
				-    "movdqa     %%xmm5,%%xmm7                    \n"
			
 
				-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
			
 
				-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-    // Third round of bit swap.
			
 
				-    // Write to the destination pointer.
			
 
				-    "punpckldq  %%xmm4,%%xmm0                    \n"
			
 
				-    "movq       %%xmm0,(%1)                      \n"
			
 
				-    "movdqa     %%xmm0,%%xmm4                    \n"
			
 
				-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
			
 
				-    "movq       %%xmm4,(%1,%4)                   \n"
			
 
				-    "lea        (%1,%4,2),%1                     \n"
			
 
				-    "punpckldq  %%xmm6,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm2,%%xmm6                    \n"
			
 
				-    "movq       %%xmm2,(%1)                      \n"
			
 
				-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
			
 
				-    "punpckldq  %%xmm5,%%xmm1                    \n"
			
 
				-    "movq       %%xmm6,(%1,%4)                   \n"
			
 
				-    "lea        (%1,%4,2),%1                     \n"
			
 
				-    "movdqa     %%xmm1,%%xmm5                    \n"
			
 
				-    "movq       %%xmm1,(%1)                      \n"
			
 
				-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
			
 
				-    "movq       %%xmm5,(%1,%4)                   \n"
			
 
				-    "lea        (%1,%4,2),%1                     \n"
			
 
				-    "punpckldq  %%xmm7,%%xmm3                    \n"
			
 
				-    "movq       %%xmm3,(%1)                      \n"
			
 
				-    "movdqa     %%xmm3,%%xmm7                    \n"
			
 
				-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-    "sub        $0x8,%2                          \n"
			
 
				-    "movq       %%xmm7,(%1,%4)                   \n"
			
 
				-    "lea        (%1,%4,2),%1                     \n"
			
 
				-    "jg         1b                               \n"
			
 
				-    : "+r"(src),    // %0
			
 
				-      "+r"(dst),    // %1
			
 
				-      "+r"(width)   // %2
			
 
				-    : "r"((intptr_t)(src_stride)),  // %3
			
 
				-      "r"((intptr_t)(dst_stride))   // %4
			
 
				-    : "memory", "cc"
			
 
				-  #if defined(__SSE2__)
			
 
				-      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-  #endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
			
 
				-#define HAS_TRANSPOSE_UVWX8_SSE2
			
 
				-extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
			
 
				-                                    uint8* dst_a, int dst_stride_a,
			
 
				-                                    uint8* dst_b, int dst_stride_b,
			
 
				-                                    int w);
			
 
				-  asm (
			
 
				-    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
			
 
				-    "push   %ebx                               \n"
			
 
				-    "push   %esi                               \n"
			
 
				-    "push   %edi                               \n"
			
 
				-    "push   %ebp                               \n"
			
 
				-    "mov    0x14(%esp),%eax                    \n"
			
 
				-    "mov    0x18(%esp),%edi                    \n"
			
 
				-    "mov    0x1c(%esp),%edx                    \n"
			
 
				-    "mov    0x20(%esp),%esi                    \n"
			
 
				-    "mov    0x24(%esp),%ebx                    \n"
			
 
				-    "mov    0x28(%esp),%ebp                    \n"
			
 
				-    "mov    %esp,%ecx                          \n"
			
 
				-    "sub    $0x14,%esp                         \n"
			
 
				-    "and    $0xfffffff0,%esp                   \n"
			
 
				-    "mov    %ecx,0x10(%esp)                    \n"
			
 
				-    "mov    0x2c(%ecx),%ecx                    \n"
			
 
				-
			
 
				-"1:                                            \n"
			
 
				-    "movdqa (%eax),%xmm0                       \n"
			
 
				-    "movdqa (%eax,%edi,1),%xmm1                \n"
			
 
				-    "lea    (%eax,%edi,2),%eax                 \n"
			
 
				-    "movdqa %xmm0,%xmm7                        \n"
			
 
				-    "punpcklbw %xmm1,%xmm0                     \n"
			
 
				-    "punpckhbw %xmm1,%xmm7                     \n"
			
 
				-    "movdqa %xmm7,%xmm1                        \n"
			
 
				-    "movdqa (%eax),%xmm2                       \n"
			
 
				-    "movdqa (%eax,%edi,1),%xmm3                \n"
			
 
				-    "lea    (%eax,%edi,2),%eax                 \n"
			
 
				-    "movdqa %xmm2,%xmm7                        \n"
			
 
				-    "punpcklbw %xmm3,%xmm2                     \n"
			
 
				-    "punpckhbw %xmm3,%xmm7                     \n"
			
 
				-    "movdqa %xmm7,%xmm3                        \n"
			
 
				-    "movdqa (%eax),%xmm4                       \n"
			
 
				-    "movdqa (%eax,%edi,1),%xmm5                \n"
			
 
				-    "lea    (%eax,%edi,2),%eax                 \n"
			
 
				-    "movdqa %xmm4,%xmm7                        \n"
			
 
				-    "punpcklbw %xmm5,%xmm4                     \n"
			
 
				-    "punpckhbw %xmm5,%xmm7                     \n"
			
 
				-    "movdqa %xmm7,%xmm5                        \n"
			
 
				-    "movdqa (%eax),%xmm6                       \n"
			
 
				-    "movdqa (%eax,%edi,1),%xmm7                \n"
			
 
				-    "lea    (%eax,%edi,2),%eax                 \n"
			
 
				-    "movdqa %xmm5,(%esp)                       \n"
			
 
				-    "neg    %edi                               \n"
			
 
				-    "movdqa %xmm6,%xmm5                        \n"
			
 
				-    "punpcklbw %xmm7,%xmm6                     \n"
			
 
				-    "punpckhbw %xmm7,%xmm5                     \n"
			
 
				-    "movdqa %xmm5,%xmm7                        \n"
			
 
				-    "lea    0x10(%eax,%edi,8),%eax             \n"
			
 
				-    "neg    %edi                               \n"
			
 
				-    "movdqa %xmm0,%xmm5                        \n"
			
 
				-    "punpcklwd %xmm2,%xmm0                     \n"
			
 
				-    "punpckhwd %xmm2,%xmm5                     \n"
			
 
				-    "movdqa %xmm5,%xmm2                        \n"
			
 
				-    "movdqa %xmm1,%xmm5                        \n"
			
 
				-    "punpcklwd %xmm3,%xmm1                     \n"
			
 
				-    "punpckhwd %xmm3,%xmm5                     \n"
			
 
				-    "movdqa %xmm5,%xmm3                        \n"
			
 
				-    "movdqa %xmm4,%xmm5                        \n"
			
 
				-    "punpcklwd %xmm6,%xmm4                     \n"
			
 
				-    "punpckhwd %xmm6,%xmm5                     \n"
			
 
				-    "movdqa %xmm5,%xmm6                        \n"
			
 
				-    "movdqa (%esp),%xmm5                       \n"
			
 
				-    "movdqa %xmm6,(%esp)                       \n"
			
 
				-    "movdqa %xmm5,%xmm6                        \n"
			
 
				-    "punpcklwd %xmm7,%xmm5                     \n"
			
 
				-    "punpckhwd %xmm7,%xmm6                     \n"
			
 
				-    "movdqa %xmm6,%xmm7                        \n"
			
 
				-    "movdqa %xmm0,%xmm6                        \n"
			
 
				-    "punpckldq %xmm4,%xmm0                     \n"
			
 
				-    "punpckhdq %xmm4,%xmm6                     \n"
			
 
				-    "movdqa %xmm6,%xmm4                        \n"
			
 
				-    "movdqa (%esp),%xmm6                       \n"
			
 
				-    "movlpd %xmm0,(%edx)                       \n"
			
 
				-    "movhpd %xmm0,(%ebx)                       \n"
			
 
				-    "movlpd %xmm4,(%edx,%esi,1)                \n"
			
 
				-    "lea    (%edx,%esi,2),%edx                 \n"
			
 
				-    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
			
 
				-    "lea    (%ebx,%ebp,2),%ebx                 \n"
			
 
				-    "movdqa %xmm2,%xmm0                        \n"
			
 
				-    "punpckldq %xmm6,%xmm2                     \n"
			
 
				-    "movlpd %xmm2,(%edx)                       \n"
			
 
				-    "movhpd %xmm2,(%ebx)                       \n"
			
 
				-    "punpckhdq %xmm6,%xmm0                     \n"
			
 
				-    "movlpd %xmm0,(%edx,%esi,1)                \n"
			
 
				-    "lea    (%edx,%esi,2),%edx                 \n"
			
 
				-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
			
 
				-    "lea    (%ebx,%ebp,2),%ebx                 \n"
			
 
				-    "movdqa %xmm1,%xmm0                        \n"
			
 
				-    "punpckldq %xmm5,%xmm1                     \n"
			
 
				-    "movlpd %xmm1,(%edx)                       \n"
			
 
				-    "movhpd %xmm1,(%ebx)                       \n"
			
 
				-    "punpckhdq %xmm5,%xmm0                     \n"
			
 
				-    "movlpd %xmm0,(%edx,%esi,1)                \n"
			
 
				-    "lea    (%edx,%esi,2),%edx                 \n"
			
 
				-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
			
 
				-    "lea    (%ebx,%ebp,2),%ebx                 \n"
			
 
				-    "movdqa %xmm3,%xmm0                        \n"
			
 
				-    "punpckldq %xmm7,%xmm3                     \n"
			
 
				-    "movlpd %xmm3,(%edx)                       \n"
			
 
				-    "movhpd %xmm3,(%ebx)                       \n"
			
 
				-    "punpckhdq %xmm7,%xmm0                     \n"
			
 
				-    "sub    $0x8,%ecx                          \n"
			
 
				-    "movlpd %xmm0,(%edx,%esi,1)                \n"
			
 
				-    "lea    (%edx,%esi,2),%edx                 \n"
			
 
				-    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
			
 
				-    "lea    (%ebx,%ebp,2),%ebx                 \n"
			
 
				-    "jg     1b                                 \n"
			
 
				-    "mov    0x10(%esp),%esp                    \n"
			
 
				-    "pop    %ebp                               \n"
			
 
				-    "pop    %edi                               \n"
			
 
				-    "pop    %esi                               \n"
			
 
				-    "pop    %ebx                               \n"
			
 
				-#if defined(__native_client__)
			
 
				-    "pop    %ecx                               \n"
			
 
				-    "and    $0xffffffe0,%ecx                   \n"
			
 
				-    "jmp    *%ecx                              \n"
			
 
				-#else
			
 
				-    "ret                                       \n"
			
 
				-#endif
			
 
				-);
			
 
				-#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
			
 
				-    defined(__x86_64__)
			
 
				-// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
			
 
				-#define HAS_TRANSPOSE_WX8_FAST_SSSE3
			
 
				-static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
			
 
				-                                    uint8* dst, int dst_stride, int width) {
			
 
				-  asm volatile (
			
 
				-  // Read in the data from the source pointer.
			
 
				-  // First round of bit swap.
			
 
				-  ".p2align  2                                 \n"
			
 
				-"1:                                            \n"
			
 
				-  "movdqa     (%0),%%xmm0                      \n"
			
 
				-  "movdqa     (%0,%3),%%xmm1                   \n"
			
 
				-  "lea        (%0,%3,2),%0                     \n"
			
 
				-  "movdqa     %%xmm0,%%xmm8                    \n"
			
 
				-  "punpcklbw  %%xmm1,%%xmm0                    \n"
			
 
				-  "punpckhbw  %%xmm1,%%xmm8                    \n"
			
 
				-  "movdqa     (%0),%%xmm2                      \n"
			
 
				-  "movdqa     %%xmm0,%%xmm1                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm9                    \n"
			
 
				-  "palignr    $0x8,%%xmm1,%%xmm1               \n"
			
 
				-  "palignr    $0x8,%%xmm9,%%xmm9               \n"
			
 
				-  "movdqa     (%0,%3),%%xmm3                   \n"
			
 
				-  "lea        (%0,%3,2),%0                     \n"
			
 
				-  "movdqa     %%xmm2,%%xmm10                   \n"
			
 
				-  "punpcklbw  %%xmm3,%%xmm2                    \n"
			
 
				-  "punpckhbw  %%xmm3,%%xmm10                   \n"
			
 
				-  "movdqa     %%xmm2,%%xmm3                    \n"
			
 
				-  "movdqa     %%xmm10,%%xmm11                  \n"
			
 
				-  "movdqa     (%0),%%xmm4                      \n"
			
 
				-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
			
 
				-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
			
 
				-  "movdqa     (%0,%3),%%xmm5                   \n"
			
 
				-  "lea        (%0,%3,2),%0                     \n"
			
 
				-  "movdqa     %%xmm4,%%xmm12                   \n"
			
 
				-  "punpcklbw  %%xmm5,%%xmm4                    \n"
			
 
				-  "punpckhbw  %%xmm5,%%xmm12                   \n"
			
 
				-  "movdqa     %%xmm4,%%xmm5                    \n"
			
 
				-  "movdqa     %%xmm12,%%xmm13                  \n"
			
 
				-  "movdqa     (%0),%%xmm6                      \n"
			
 
				-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
			
 
				-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
			
 
				-  "movdqa     (%0,%3),%%xmm7                   \n"
			
 
				-  "lea        (%0,%3,2),%0                     \n"
			
 
				-  "movdqa     %%xmm6,%%xmm14                   \n"
			
 
				-  "punpcklbw  %%xmm7,%%xmm6                    \n"
			
 
				-  "punpckhbw  %%xmm7,%%xmm14                   \n"
			
 
				-  "neg        %3                               \n"
			
 
				-  "movdqa     %%xmm6,%%xmm7                    \n"
			
 
				-  "movdqa     %%xmm14,%%xmm15                  \n"
			
 
				-  "lea        0x10(%0,%3,8),%0                 \n"
			
 
				-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
			
 
				-  "neg        %3                               \n"
			
 
				-   // Second round of bit swap.
			
 
				-  "punpcklwd  %%xmm2,%%xmm0                    \n"
			
 
				-  "punpcklwd  %%xmm3,%%xmm1                    \n"
			
 
				-  "movdqa     %%xmm0,%%xmm2                    \n"
			
 
				-  "movdqa     %%xmm1,%%xmm3                    \n"
			
 
				-  "palignr    $0x8,%%xmm2,%%xmm2               \n"
			
 
				-  "palignr    $0x8,%%xmm3,%%xmm3               \n"
			
 
				-  "punpcklwd  %%xmm6,%%xmm4                    \n"
			
 
				-  "punpcklwd  %%xmm7,%%xmm5                    \n"
			
 
				-  "movdqa     %%xmm4,%%xmm6                    \n"
			
 
				-  "movdqa     %%xmm5,%%xmm7                    \n"
			
 
				-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
			
 
				-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-  "punpcklwd  %%xmm10,%%xmm8                   \n"
			
 
				-  "punpcklwd  %%xmm11,%%xmm9                   \n"
			
 
				-  "movdqa     %%xmm8,%%xmm10                   \n"
			
 
				-  "movdqa     %%xmm9,%%xmm11                   \n"
			
 
				-  "palignr    $0x8,%%xmm10,%%xmm10             \n"
			
 
				-  "palignr    $0x8,%%xmm11,%%xmm11             \n"
			
 
				-  "punpcklwd  %%xmm14,%%xmm12                  \n"
			
 
				-  "punpcklwd  %%xmm15,%%xmm13                  \n"
			
 
				-  "movdqa     %%xmm12,%%xmm14                  \n"
			
 
				-  "movdqa     %%xmm13,%%xmm15                  \n"
			
 
				-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
			
 
				-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
			
 
				-  // Third round of bit swap.
			
 
				-  // Write to the destination pointer.
			
 
				-  "punpckldq  %%xmm4,%%xmm0                    \n"
			
 
				-  "movq       %%xmm0,(%1)                      \n"
			
 
				-  "movdqa     %%xmm0,%%xmm4                    \n"
			
 
				-  "palignr    $0x8,%%xmm4,%%xmm4               \n"
			
 
				-  "movq       %%xmm4,(%1,%4)                   \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "punpckldq  %%xmm6,%%xmm2                    \n"
			
 
				-  "movdqa     %%xmm2,%%xmm6                    \n"
			
 
				-  "movq       %%xmm2,(%1)                      \n"
			
 
				-  "palignr    $0x8,%%xmm6,%%xmm6               \n"
			
 
				-  "punpckldq  %%xmm5,%%xmm1                    \n"
			
 
				-  "movq       %%xmm6,(%1,%4)                   \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "movdqa     %%xmm1,%%xmm5                    \n"
			
 
				-  "movq       %%xmm1,(%1)                      \n"
			
 
				-  "palignr    $0x8,%%xmm5,%%xmm5               \n"
			
 
				-  "movq       %%xmm5,(%1,%4)                   \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "punpckldq  %%xmm7,%%xmm3                    \n"
			
 
				-  "movq       %%xmm3,(%1)                      \n"
			
 
				-  "movdqa     %%xmm3,%%xmm7                    \n"
			
 
				-  "palignr    $0x8,%%xmm7,%%xmm7               \n"
			
 
				-  "movq       %%xmm7,(%1,%4)                   \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "punpckldq  %%xmm12,%%xmm8                   \n"
			
 
				-  "movq       %%xmm8,(%1)                      \n"
			
 
				-  "movdqa     %%xmm8,%%xmm12                   \n"
			
 
				-  "palignr    $0x8,%%xmm12,%%xmm12             \n"
			
 
				-  "movq       %%xmm12,(%1,%4)                  \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "punpckldq  %%xmm14,%%xmm10                  \n"
			
 
				-  "movdqa     %%xmm10,%%xmm14                  \n"
			
 
				-  "movq       %%xmm10,(%1)                     \n"
			
 
				-  "palignr    $0x8,%%xmm14,%%xmm14             \n"
			
 
				-  "punpckldq  %%xmm13,%%xmm9                   \n"
			
 
				-  "movq       %%xmm14,(%1,%4)                  \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "movdqa     %%xmm9,%%xmm13                   \n"
			
 
				-  "movq       %%xmm9,(%1)                      \n"
			
 
				-  "palignr    $0x8,%%xmm13,%%xmm13             \n"
			
 
				-  "movq       %%xmm13,(%1,%4)                  \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "punpckldq  %%xmm15,%%xmm11                  \n"
			
 
				-  "movq       %%xmm11,(%1)                     \n"
			
 
				-  "movdqa     %%xmm11,%%xmm15                  \n"
			
 
				-  "palignr    $0x8,%%xmm15,%%xmm15             \n"
			
 
				-  "sub        $0x10,%2                         \n"
			
 
				-  "movq       %%xmm15,(%1,%4)                  \n"
			
 
				-  "lea        (%1,%4,2),%1                     \n"
			
 
				-  "jg         1b                               \n"
			
 
				-  : "+r"(src),    // %0
			
 
				-    "+r"(dst),    // %1
			
 
				-    "+r"(width)   // %2
			
 
				-  : "r"((intptr_t)(src_stride)),  // %3
			
 
				-    "r"((intptr_t)(dst_stride))   // %4
			
 
				-  : "memory", "cc",
			
 
				-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
			
 
				-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
			
 
				-);
			
 
				-}
			
 
				-
			
 
				-#define HAS_TRANSPOSE_UVWX8_SSE2
			
 
				-static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
			
 
				-                                uint8* dst_a, int dst_stride_a,
			
 
				-                                uint8* dst_b, int dst_stride_b,
			
 
				-                                int w) {
			
 
				-  asm volatile (
			
 
				-  // Read in the data from the source pointer.
			
 
				-  // First round of bit swap.
			
 
				-  ".p2align  2                                 \n"
			
 
				-"1:                                            \n"
			
 
				-  "movdqa     (%0),%%xmm0                      \n"
			
 
				-  "movdqa     (%0,%4),%%xmm1                   \n"
			
 
				-  "lea        (%0,%4,2),%0                     \n"
			
 
				-  "movdqa     %%xmm0,%%xmm8                    \n"
			
 
				-  "punpcklbw  %%xmm1,%%xmm0                    \n"
			
 
				-  "punpckhbw  %%xmm1,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm1                    \n"
			
 
				-  "movdqa     (%0),%%xmm2                      \n"
			
 
				-  "movdqa     (%0,%4),%%xmm3                   \n"
			
 
				-  "lea        (%0,%4,2),%0                     \n"
			
 
				-  "movdqa     %%xmm2,%%xmm8                    \n"
			
 
				-  "punpcklbw  %%xmm3,%%xmm2                    \n"
			
 
				-  "punpckhbw  %%xmm3,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm3                    \n"
			
 
				-  "movdqa     (%0),%%xmm4                      \n"
			
 
				-  "movdqa     (%0,%4),%%xmm5                   \n"
			
 
				-  "lea        (%0,%4,2),%0                     \n"
			
 
				-  "movdqa     %%xmm4,%%xmm8                    \n"
			
 
				-  "punpcklbw  %%xmm5,%%xmm4                    \n"
			
 
				-  "punpckhbw  %%xmm5,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm5                    \n"
			
 
				-  "movdqa     (%0),%%xmm6                      \n"
			
 
				-  "movdqa     (%0,%4),%%xmm7                   \n"
			
 
				-  "lea        (%0,%4,2),%0                     \n"
			
 
				-  "movdqa     %%xmm6,%%xmm8                    \n"
			
 
				-  "punpcklbw  %%xmm7,%%xmm6                    \n"
			
 
				-  "neg        %4                               \n"
			
 
				-  "lea        0x10(%0,%4,8),%0                 \n"
			
 
				-  "punpckhbw  %%xmm7,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm7                    \n"
			
 
				-  "neg        %4                               \n"
			
 
				-   // Second round of bit swap.
			
 
				-  "movdqa     %%xmm0,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm1,%%xmm9                    \n"
			
 
				-  "punpckhwd  %%xmm2,%%xmm8                    \n"
			
 
				-  "punpckhwd  %%xmm3,%%xmm9                    \n"
			
 
				-  "punpcklwd  %%xmm2,%%xmm0                    \n"
			
 
				-  "punpcklwd  %%xmm3,%%xmm1                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm2                    \n"
			
 
				-  "movdqa     %%xmm9,%%xmm3                    \n"
			
 
				-  "movdqa     %%xmm4,%%xmm8                    \n"
			
 
				-  "movdqa     %%xmm5,%%xmm9                    \n"
			
 
				-  "punpckhwd  %%xmm6,%%xmm8                    \n"
			
 
				-  "punpckhwd  %%xmm7,%%xmm9                    \n"
			
 
				-  "punpcklwd  %%xmm6,%%xmm4                    \n"
			
 
				-  "punpcklwd  %%xmm7,%%xmm5                    \n"
			
 
				-  "movdqa     %%xmm8,%%xmm6                    \n"
			
 
				-  "movdqa     %%xmm9,%%xmm7                    \n"
			
 
				-  // Third round of bit swap.
			
 
				-  // Write to the destination pointer.
			
 
				-  "movdqa     %%xmm0,%%xmm8                    \n"
			
 
				-  "punpckldq  %%xmm4,%%xmm0                    \n"
			
 
				-  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
			
 
				-  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
			
 
				-  "punpckhdq  %%xmm4,%%xmm8                    \n"
			
 
				-  "movlpd     %%xmm8,(%1,%5)                   \n"
			
 
				-  "lea        (%1,%5,2),%1                     \n"
			
 
				-  "movhpd     %%xmm8,(%2,%6)                   \n"
			
 
				-  "lea        (%2,%6,2),%2                     \n"
			
 
				-  "movdqa     %%xmm2,%%xmm8                    \n"
			
 
				-  "punpckldq  %%xmm6,%%xmm2                    \n"
			
 
				-  "movlpd     %%xmm2,(%1)                      \n"
			
 
				-  "movhpd     %%xmm2,(%2)                      \n"
			
 
				-  "punpckhdq  %%xmm6,%%xmm8                    \n"
			
 
				-  "movlpd     %%xmm8,(%1,%5)                   \n"
			
 
				-  "lea        (%1,%5,2),%1                     \n"
			
 
				-  "movhpd     %%xmm8,(%2,%6)                   \n"
			
 
				-  "lea        (%2,%6,2),%2                     \n"
			
 
				-  "movdqa     %%xmm1,%%xmm8                    \n"
			
 
				-  "punpckldq  %%xmm5,%%xmm1                    \n"
			
 
				-  "movlpd     %%xmm1,(%1)                      \n"
			
 
				-  "movhpd     %%xmm1,(%2)                      \n"
			
 
				-  "punpckhdq  %%xmm5,%%xmm8                    \n"
			
 
				-  "movlpd     %%xmm8,(%1,%5)                   \n"
			
 
				-  "lea        (%1,%5,2),%1                     \n"
			
 
				-  "movhpd     %%xmm8,(%2,%6)                   \n"
			
 
				-  "lea        (%2,%6,2),%2                     \n"
			
 
				-  "movdqa     %%xmm3,%%xmm8                    \n"
			
 
				-  "punpckldq  %%xmm7,%%xmm3                    \n"
			
 
				-  "movlpd     %%xmm3,(%1)                      \n"
			
 
				-  "movhpd     %%xmm3,(%2)                      \n"
			
 
				-  "punpckhdq  %%xmm7,%%xmm8                    \n"
			
 
				-  "sub        $0x8,%3                          \n"
			
 
				-  "movlpd     %%xmm8,(%1,%5)                   \n"
			
 
				-  "lea        (%1,%5,2),%1                     \n"
			
 
				-  "movhpd     %%xmm8,(%2,%6)                   \n"
			
 
				-  "lea        (%2,%6,2),%2                     \n"
			
 
				-  "jg         1b                               \n"
			
 
				-  : "+r"(src),    // %0
			
 
				-    "+r"(dst_a),  // %1
			
 
				-    "+r"(dst_b),  // %2
			
 
				-    "+r"(w)   // %3
			
 
				-  : "r"((intptr_t)(src_stride)),    // %4
			
 
				-    "r"((intptr_t)(dst_stride_a)),  // %5
			
 
				-    "r"((intptr_t)(dst_stride_b))   // %6
			
 
				-  : "memory", "cc",
			
 
				-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
			
 
				-    "xmm8", "xmm9"
			
 
				-);
			
 
				-}
			
 
				-#endif
			
 
				-#endif
			
 
				-
			
 
				-static void TransposeWx8_C(const uint8* src, int src_stride,
			
 
				-                           uint8* dst, int dst_stride,
			
 
				-                           int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    dst[0] = src[0 * src_stride];
			
 
				-    dst[1] = src[1 * src_stride];
			
 
				-    dst[2] = src[2 * src_stride];
			
 
				-    dst[3] = src[3 * src_stride];
			
 
				-    dst[4] = src[4 * src_stride];
			
 
				-    dst[5] = src[5 * src_stride];
			
 
				-    dst[6] = src[6 * src_stride];
			
 
				-    dst[7] = src[7 * src_stride];
			
 
				-    ++src;
			
 
				-    dst += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void TransposeWxH_C(const uint8* src, int src_stride,
			
 
				-                           uint8* dst, int dst_stride,
			
 
				-                           int width, int height) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int j;
			
 
				-    for (j = 0; j < height; ++j) {
			
 
				-      dst[i * dst_stride + j] = src[j * src_stride + i];
			
 
				-    }
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void TransposePlane(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height) {
			
 
				-  int i = height;
			
 
				-  void (*TransposeWx8)(const uint8* src, int src_stride,
			
 
				-                       uint8* dst, int dst_stride,
			
 
				-                       int width) = TransposeWx8_C;
			
 
				-#if defined(HAS_TRANSPOSE_WX8_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    TransposeWx8 = TransposeWx8_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_TRANSPOSE_WX8_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
			
 
				-    TransposeWx8 = TransposeWx8_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) &&
			
 
				-      IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    TransposeWx8 = TransposeWx8_FAST_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
			
 
				-    if (IS_ALIGNED(width, 4) &&
			
 
				-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
			
 
				-      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
			
 
				-    } else {
			
 
				-      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Work across the source in 8x8 tiles
			
 
				-  while (i >= 8) {
			
 
				-    TransposeWx8(src, src_stride, dst, dst_stride, width);
			
 
				-    src += 8 * src_stride;    // Go down 8 rows.
			
 
				-    dst += 8;                 // Move over 8 columns.
			
 
				-    i -= 8;
			
 
				-  }
			
 
				-
			
 
				-  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotatePlane90(const uint8* src, int src_stride,
			
 
				-                   uint8* dst, int dst_stride,
			
 
				-                   int width, int height) {
			
 
				-  // Rotate by 90 is a transpose with the source read
			
 
				-  // from bottom to top. So set the source pointer to the end
			
 
				-  // of the buffer and flip the sign of the source stride.
			
 
				-  src += src_stride * (height - 1);
			
 
				-  src_stride = -src_stride;
			
 
				-  TransposePlane(src, src_stride, dst, dst_stride, width, height);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotatePlane270(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height) {
			
 
				-  // Rotate by 270 is a transpose with the destination written
			
 
				-  // from bottom to top. So set the destination pointer to the end
			
 
				-  // of the buffer and flip the sign of the destination stride.
			
 
				-  dst += dst_stride * (width - 1);
			
 
				-  dst_stride = -dst_stride;
			
 
				-  TransposePlane(src, src_stride, dst, dst_stride, width, height);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotatePlane180(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height) {
			
 
				-  // Swap first and last row and mirror the content. Uses a temporary row.
			
 
				-  align_buffer_64(row, width);
			
 
				-  const uint8* src_bot = src + src_stride * (height - 1);
			
 
				-  uint8* dst_bot = dst + dst_stride * (height - 1);
			
 
				-  int half_height = (height + 1) >> 1;
			
 
				-  int y;
			
 
				-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
			
 
				-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
			
 
				-#if defined(HAS_MIRRORROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
			
 
				-    MirrorRow = MirrorRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    MirrorRow = MirrorRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    MirrorRow = MirrorRow_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
			
 
				-    MirrorRow = MirrorRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_MIRRORROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
			
 
				-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
			
 
				-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    MirrorRow = MirrorRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
			
 
				-    CopyRow = CopyRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
			
 
				-    CopyRow = CopyRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    CopyRow = CopyRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_ERMS)
			
 
				-  if (TestCpuFlag(kCpuHasERMS)) {
			
 
				-    CopyRow = CopyRow_ERMS;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_MIPS)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS)) {
			
 
				-    CopyRow = CopyRow_MIPS;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Odd height will harmlessly mirror the middle row twice.
			
 
				-  for (y = 0; y < half_height; ++y) {
			
 
				-    MirrorRow(src, row, width);  // Mirror first row into a buffer
			
 
				-    src += src_stride;
			
 
				-    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
			
 
				-    dst += dst_stride;
			
 
				-    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
			
 
				-    src_bot -= src_stride;
			
 
				-    dst_bot -= dst_stride;
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-static void TransposeUVWx8_C(const uint8* src, int src_stride,
			
 
				-                             uint8* dst_a, int dst_stride_a,
			
 
				-                             uint8* dst_b, int dst_stride_b,
			
 
				-                             int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    dst_a[0] = src[0 * src_stride + 0];
			
 
				-    dst_b[0] = src[0 * src_stride + 1];
			
 
				-    dst_a[1] = src[1 * src_stride + 0];
			
 
				-    dst_b[1] = src[1 * src_stride + 1];
			
 
				-    dst_a[2] = src[2 * src_stride + 0];
			
 
				-    dst_b[2] = src[2 * src_stride + 1];
			
 
				-    dst_a[3] = src[3 * src_stride + 0];
			
 
				-    dst_b[3] = src[3 * src_stride + 1];
			
 
				-    dst_a[4] = src[4 * src_stride + 0];
			
 
				-    dst_b[4] = src[4 * src_stride + 1];
			
 
				-    dst_a[5] = src[5 * src_stride + 0];
			
 
				-    dst_b[5] = src[5 * src_stride + 1];
			
 
				-    dst_a[6] = src[6 * src_stride + 0];
			
 
				-    dst_b[6] = src[6 * src_stride + 1];
			
 
				-    dst_a[7] = src[7 * src_stride + 0];
			
 
				-    dst_b[7] = src[7 * src_stride + 1];
			
 
				-    src += 2;
			
 
				-    dst_a += dst_stride_a;
			
 
				-    dst_b += dst_stride_b;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void TransposeUVWxH_C(const uint8* src, int src_stride,
			
 
				-                             uint8* dst_a, int dst_stride_a,
			
 
				-                             uint8* dst_b, int dst_stride_b,
			
 
				-                             int width, int height) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width * 2; i += 2) {
			
 
				-    int j;
			
 
				-    for (j = 0; j < height; ++j) {
			
 
				-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
			
 
				-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
			
 
				-    }
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void TransposeUV(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height) {
			
 
				-  int i = height;
			
 
				-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
			
 
				-                         uint8* dst_a, int dst_stride_a,
			
 
				-                         uint8* dst_b, int dst_stride_b,
			
 
				-                         int width) = TransposeUVWx8_C;
			
 
				-#if defined(HAS_TRANSPOSE_UVWX8_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON)) {
			
 
				-    TransposeUVWx8 = TransposeUVWx8_NEON;
			
 
				-  }
			
 
				-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-      IS_ALIGNED(width, 8) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    TransposeUVWx8 = TransposeUVWx8_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
			
 
				-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
			
 
				-    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Work through the source in 8x8 tiles.
			
 
				-  while (i >= 8) {
			
 
				-    TransposeUVWx8(src, src_stride,
			
 
				-                   dst_a, dst_stride_a,
			
 
				-                   dst_b, dst_stride_b,
			
 
				-                   width);
			
 
				-    src += 8 * src_stride;    // Go down 8 rows.
			
 
				-    dst_a += 8;               // Move over 8 columns.
			
 
				-    dst_b += 8;               // Move over 8 columns.
			
 
				-    i -= 8;
			
 
				-  }
			
 
				-
			
 
				-  TransposeUVWxH_C(src, src_stride,
			
 
				-                   dst_a, dst_stride_a,
			
 
				-                   dst_b, dst_stride_b,
			
 
				-                   width, i);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotateUV90(const uint8* src, int src_stride,
			
 
				-                uint8* dst_a, int dst_stride_a,
			
 
				-                uint8* dst_b, int dst_stride_b,
			
 
				-                int width, int height) {
			
 
				-  src += src_stride * (height - 1);
			
 
				-  src_stride = -src_stride;
			
 
				-
			
 
				-  TransposeUV(src, src_stride,
			
 
				-              dst_a, dst_stride_a,
			
 
				-              dst_b, dst_stride_b,
			
 
				-              width, height);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void RotateUV270(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height) {
			
 
				-  dst_a += dst_stride_a * (width - 1);
			
 
				-  dst_b += dst_stride_b * (width - 1);
			
 
				-  dst_stride_a = -dst_stride_a;
			
 
				-  dst_stride_b = -dst_stride_b;
			
 
				-
			
 
				-  TransposeUV(src, src_stride,
			
 
				-              dst_a, dst_stride_a,
			
 
				-              dst_b, dst_stride_b,
			
 
				-              width, height);
			
 
				-}
			
 
				-
			
 
				-// Rotate 180 is a horizontal and vertical flip.
			
 
				-LIBYUV_API
			
 
				-void RotateUV180(const uint8* src, int src_stride,
			
 
				-                 uint8* dst_a, int dst_stride_a,
			
 
				-                 uint8* dst_b, int dst_stride_b,
			
 
				-                 int width, int height) {
			
 
				-  int i;
			
 
				-  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
			
 
				-      MirrorUVRow_C;
			
 
				-#if defined(HAS_MIRRORUVROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
			
 
				-    MirrorRowUV = MirrorUVRow_NEON;
			
 
				-  }
			
 
				-#elif defined(HAS_MIRRORROW_UV_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    MirrorRowUV = MirrorUVRow_SSSE3;
			
 
				-  }
			
 
				-#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
			
 
				-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
			
 
				-    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  dst_a += dst_stride_a * (height - 1);
			
 
				-  dst_b += dst_stride_b * (height - 1);
			
 
				-
			
 
				-  for (i = 0; i < height; ++i) {
			
 
				-    MirrorRowUV(src, dst_a, dst_b, width);
			
 
				-    src += src_stride;
			
 
				-    dst_a -= dst_stride_a;
			
 
				-    dst_b -= dst_stride_b;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int RotatePlane(const uint8* src, int src_stride,
			
 
				-                uint8* dst, int dst_stride,
			
 
				-                int width, int height,
			
 
				-                enum RotationMode mode) {
			
 
				-  if (!src || width <= 0 || height == 0 || !dst) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src = src + (height - 1) * src_stride;
			
 
				-    src_stride = -src_stride;
			
 
				-  }
			
 
				-
			
 
				-  switch (mode) {
			
 
				-    case kRotate0:
			
 
				-      // copy frame
			
 
				-      CopyPlane(src, src_stride,
			
 
				-                dst, dst_stride,
			
 
				-                width, height);
			
 
				-      return 0;
			
 
				-    case kRotate90:
			
 
				-      RotatePlane90(src, src_stride,
			
 
				-                    dst, dst_stride,
			
 
				-                    width, height);
			
 
				-      return 0;
			
 
				-    case kRotate270:
			
 
				-      RotatePlane270(src, src_stride,
			
 
				-                     dst, dst_stride,
			
 
				-                     width, height);
			
 
				-      return 0;
			
 
				-    case kRotate180:
			
 
				-      RotatePlane180(src, src_stride,
			
 
				-                     dst, dst_stride,
			
 
				-                     width, height);
			
 
				-      return 0;
			
 
				-    default:
			
 
				-      break;
			
 
				-  }
			
 
				-  return -1;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420Rotate(const uint8* src_y, int src_stride_y,
			
 
				-               const uint8* src_u, int src_stride_u,
			
 
				-               const uint8* src_v, int src_stride_v,
			
 
				-               uint8* dst_y, int dst_stride_y,
			
 
				-               uint8* dst_u, int dst_stride_u,
			
 
				-               uint8* dst_v, int dst_stride_v,
			
 
				-               int width, int height,
			
 
				-               enum RotationMode mode) {
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
			
 
				-      !dst_y || !dst_u || !dst_v) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    halfheight = (height + 1) >> 1;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_u = src_u + (halfheight - 1) * src_stride_u;
			
 
				-    src_v = src_v + (halfheight - 1) * src_stride_v;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_u = -src_stride_u;
			
 
				-    src_stride_v = -src_stride_v;
			
 
				-  }
			
 
				-
			
 
				-  switch (mode) {
			
 
				-    case kRotate0:
			
 
				-      // copy frame
			
 
				-      return I420Copy(src_y, src_stride_y,
			
 
				-                      src_u, src_stride_u,
			
 
				-                      src_v, src_stride_v,
			
 
				-                      dst_y, dst_stride_y,
			
 
				-                      dst_u, dst_stride_u,
			
 
				-                      dst_v, dst_stride_v,
			
 
				-                      width, height);
			
 
				-    case kRotate90:
			
 
				-      RotatePlane90(src_y, src_stride_y,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    width, height);
			
 
				-      RotatePlane90(src_u, src_stride_u,
			
 
				-                    dst_u, dst_stride_u,
			
 
				-                    halfwidth, halfheight);
			
 
				-      RotatePlane90(src_v, src_stride_v,
			
 
				-                    dst_v, dst_stride_v,
			
 
				-                    halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    case kRotate270:
			
 
				-      RotatePlane270(src_y, src_stride_y,
			
 
				-                     dst_y, dst_stride_y,
			
 
				-                     width, height);
			
 
				-      RotatePlane270(src_u, src_stride_u,
			
 
				-                     dst_u, dst_stride_u,
			
 
				-                     halfwidth, halfheight);
			
 
				-      RotatePlane270(src_v, src_stride_v,
			
 
				-                     dst_v, dst_stride_v,
			
 
				-                     halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    case kRotate180:
			
 
				-      RotatePlane180(src_y, src_stride_y,
			
 
				-                     dst_y, dst_stride_y,
			
 
				-                     width, height);
			
 
				-      RotatePlane180(src_u, src_stride_u,
			
 
				-                     dst_u, dst_stride_u,
			
 
				-                     halfwidth, halfheight);
			
 
				-      RotatePlane180(src_v, src_stride_v,
			
 
				-                     dst_v, dst_stride_v,
			
 
				-                     halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    default:
			
 
				-      break;
			
 
				-  }
			
 
				-  return -1;
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
			
 
				-                     const uint8* src_uv, int src_stride_uv,
			
 
				-                     uint8* dst_y, int dst_stride_y,
			
 
				-                     uint8* dst_u, int dst_stride_u,
			
 
				-                     uint8* dst_v, int dst_stride_v,
			
 
				-                     int width, int height,
			
 
				-                     enum RotationMode mode) {
			
 
				-  int halfwidth = (width + 1) >> 1;
			
 
				-  int halfheight = (height + 1) >> 1;
			
 
				-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
			
 
				-      !dst_y || !dst_u || !dst_v) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    halfheight = (height + 1) >> 1;
			
 
				-    src_y = src_y + (height - 1) * src_stride_y;
			
 
				-    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
			
 
				-    src_stride_y = -src_stride_y;
			
 
				-    src_stride_uv = -src_stride_uv;
			
 
				-  }
			
 
				-
			
 
				-  switch (mode) {
			
 
				-    case kRotate0:
			
 
				-      // copy frame
			
 
				-      return NV12ToI420(src_y, src_stride_y,
			
 
				-                        src_uv, src_stride_uv,
			
 
				-                        dst_y, dst_stride_y,
			
 
				-                        dst_u, dst_stride_u,
			
 
				-                        dst_v, dst_stride_v,
			
 
				-                        width, height);
			
 
				-    case kRotate90:
			
 
				-      RotatePlane90(src_y, src_stride_y,
			
 
				-                    dst_y, dst_stride_y,
			
 
				-                    width, height);
			
 
				-      RotateUV90(src_uv, src_stride_uv,
			
 
				-                 dst_u, dst_stride_u,
			
 
				-                 dst_v, dst_stride_v,
			
 
				-                 halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    case kRotate270:
			
 
				-      RotatePlane270(src_y, src_stride_y,
			
 
				-                     dst_y, dst_stride_y,
			
 
				-                     width, height);
			
 
				-      RotateUV270(src_uv, src_stride_uv,
			
 
				-                  dst_u, dst_stride_u,
			
 
				-                  dst_v, dst_stride_v,
			
 
				-                  halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    case kRotate180:
			
 
				-      RotatePlane180(src_y, src_stride_y,
			
 
				-                     dst_y, dst_stride_y,
			
 
				-                     width, height);
			
 
				-      RotateUV180(src_uv, src_stride_uv,
			
 
				-                  dst_u, dst_stride_u,
			
 
				-                  dst_v, dst_stride_v,
			
 
				-                  halfwidth, halfheight);
			
 
				-      return 0;
			
 
				-    default:
			
 
				-      break;
			
 
				-  }
			
 
				-  return -1;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/rotate_argb.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/rotate_argb.cc
@@ -1,209 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/rotate.h"
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/convert.h"
			
 
				-#include "libyuv/planar_functions.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// ARGBScale has a function to copy pixels to a row, striding each source
			
 
				-// pixel by a constant.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && \
			
 
				-    (defined(_M_IX86) || \
			
 
				-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
			
 
				-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
			
 
				-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-#endif
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-#define HAS_SCALEARGBROWDOWNEVEN_NEON
			
 
				-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_ptr, int dst_width);
			
 
				-#endif
			
 
				-
			
 
				-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
			
 
				-                            int src_stepx,
			
 
				-                            uint8* dst_ptr, int dst_width);
			
 
				-
			
 
				-static void ARGBTranspose(const uint8* src, int src_stride,
			
 
				-                          uint8* dst, int dst_stride,
			
 
				-                          int width, int height) {
			
 
				-  int i;
			
 
				-  int src_pixel_step = src_stride >> 2;
			
 
				-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
			
 
				-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
			
 
				-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
			
 
				-      IS_ALIGNED(src, 4)) {
			
 
				-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (i = 0; i < width; ++i) {  // column of source to row of dest.
			
 
				-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
			
 
				-    dst += dst_stride;
			
 
				-    src += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBRotate90(const uint8* src, int src_stride,
			
 
				-                  uint8* dst, int dst_stride,
			
 
				-                  int width, int height) {
			
 
				-  // Rotate by 90 is a ARGBTranspose with the source read
			
 
				-  // from bottom to top. So set the source pointer to the end
			
 
				-  // of the buffer and flip the sign of the source stride.
			
 
				-  src += src_stride * (height - 1);
			
 
				-  src_stride = -src_stride;
			
 
				-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
			
 
				-}
			
 
				-
			
 
				-void ARGBRotate270(const uint8* src, int src_stride,
			
 
				-                    uint8* dst, int dst_stride,
			
 
				-                    int width, int height) {
			
 
				-  // Rotate by 270 is a ARGBTranspose with the destination written
			
 
				-  // from bottom to top. So set the destination pointer to the end
			
 
				-  // of the buffer and flip the sign of the destination stride.
			
 
				-  dst += dst_stride * (width - 1);
			
 
				-  dst_stride = -dst_stride;
			
 
				-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
			
 
				-}
			
 
				-
			
 
				-void ARGBRotate180(const uint8* src, int src_stride,
			
 
				-                   uint8* dst, int dst_stride,
			
 
				-                   int width, int height) {
			
 
				-  // Swap first and last row and mirror the content. Uses a temporary row.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  const uint8* src_bot = src + src_stride * (height - 1);
			
 
				-  uint8* dst_bot = dst + dst_stride * (height - 1);
			
 
				-  int half_height = (height + 1) >> 1;
			
 
				-  int y;
			
 
				-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
			
 
				-      ARGBMirrorRow_C;
			
 
				-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
			
 
				-#if defined(HAS_ARGBMIRRORROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMIRRORROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_AVX2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBMIRRORROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
			
 
				-    ARGBMirrorRow = ARGBMirrorRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
			
 
				-    CopyRow = CopyRow_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_X86)
			
 
				-  if (TestCpuFlag(kCpuHasX86)) {
			
 
				-    CopyRow = CopyRow_X86;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
			
 
				-      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    CopyRow = CopyRow_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_ERMS)
			
 
				-  if (TestCpuFlag(kCpuHasERMS)) {
			
 
				-    CopyRow = CopyRow_ERMS;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_COPYROW_MIPS)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS)) {
			
 
				-    CopyRow = CopyRow_MIPS;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  // Odd height will harmlessly mirror the middle row twice.
			
 
				-  for (y = 0; y < half_height; ++y) {
			
 
				-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
			
 
				-    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
			
 
				-    CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
			
 
				-    src += src_stride;
			
 
				-    dst += dst_stride;
			
 
				-    src_bot -= src_stride;
			
 
				-    dst_bot -= dst_stride;
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
			
 
				-               uint8* dst_argb, int dst_stride_argb,
			
 
				-               int width, int height,
			
 
				-               enum RotationMode mode) {
			
 
				-  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  // Negative height means invert the image.
			
 
				-  if (height < 0) {
			
 
				-    height = -height;
			
 
				-    src_argb = src_argb + (height - 1) * src_stride_argb;
			
 
				-    src_stride_argb = -src_stride_argb;
			
 
				-  }
			
 
				-
			
 
				-  switch (mode) {
			
 
				-    case kRotate0:
			
 
				-      // copy frame
			
 
				-      return ARGBCopy(src_argb, src_stride_argb,
			
 
				-                      dst_argb, dst_stride_argb,
			
 
				-                      width, height);
			
 
				-    case kRotate90:
			
 
				-      ARGBRotate90(src_argb, src_stride_argb,
			
 
				-                   dst_argb, dst_stride_argb,
			
 
				-                   width, height);
			
 
				-      return 0;
			
 
				-    case kRotate270:
			
 
				-      ARGBRotate270(src_argb, src_stride_argb,
			
 
				-                    dst_argb, dst_stride_argb,
			
 
				-                    width, height);
			
 
				-      return 0;
			
 
				-    case kRotate180:
			
 
				-      ARGBRotate180(src_argb, src_stride_argb,
			
 
				-                    dst_argb, dst_stride_argb,
			
 
				-                    width, height);
			
 
				-      return 0;
			
 
				-    default:
			
 
				-      break;
			
 
				-  }
			
 
				-  return -1;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/rotate_mips.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/rotate_mips.cc
@@ -1,486 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && \
			
 
				-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-
			
 
				-void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                             uint8* dst, int dst_stride,
			
 
				-                             int width) {
			
 
				-   __asm__ __volatile__ (
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
			
 
				-      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
			
 
				-      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
			
 
				-      "addu             $t3, $t2, %[src_stride]          \n"
			
 
				-      "addu             $t5, $t4, %[src_stride]          \n"
			
 
				-      "addu             $t6, $t2, $t4                    \n"
			
 
				-      "andi             $t0, %[dst], 0x3                 \n"
			
 
				-      "andi             $t1, %[dst_stride], 0x3          \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "bnez             $t0, 11f                         \n"
			
 
				-      " subu            $t7, $t9, %[src_stride]          \n"
			
 
				-//dst + dst_stride word aligned
			
 
				-    "1:                                                  \n"
			
 
				-      "lbu              $t0, 0(%[src])                   \n"
			
 
				-      "lbux             $t1, %[src_stride](%[src])       \n"
			
 
				-      "lbux             $t8, $t2(%[src])                 \n"
			
 
				-      "lbux             $t9, $t3(%[src])                 \n"
			
 
				-      "sll              $t1, $t1, 16                     \n"
			
 
				-      "sll              $t9, $t9, 16                     \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "or               $t8, $t8, $t9                    \n"
			
 
				-      "precr.qb.ph      $s0, $t8, $t0                    \n"
			
 
				-      "lbux             $t0, $t4(%[src])                 \n"
			
 
				-      "lbux             $t1, $t5(%[src])                 \n"
			
 
				-      "lbux             $t8, $t6(%[src])                 \n"
			
 
				-      "lbux             $t9, $t7(%[src])                 \n"
			
 
				-      "sll              $t1, $t1, 16                     \n"
			
 
				-      "sll              $t9, $t9, 16                     \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "or               $t8, $t8, $t9                    \n"
			
 
				-      "precr.qb.ph      $s1, $t8, $t0                    \n"
			
 
				-      "sw               $s0, 0(%[dst])                   \n"
			
 
				-      "addiu            %[width], -1                     \n"
			
 
				-      "addiu            %[src], 1                        \n"
			
 
				-      "sw               $s1, 4(%[dst])                   \n"
			
 
				-      "bnez             %[width], 1b                     \n"
			
 
				-      " addu            %[dst], %[dst], %[dst_stride]    \n"
			
 
				-      "b                2f                               \n"
			
 
				-//dst + dst_stride unaligned
			
 
				-   "11:                                                  \n"
			
 
				-      "lbu              $t0, 0(%[src])                   \n"
			
 
				-      "lbux             $t1, %[src_stride](%[src])       \n"
			
 
				-      "lbux             $t8, $t2(%[src])                 \n"
			
 
				-      "lbux             $t9, $t3(%[src])                 \n"
			
 
				-      "sll              $t1, $t1, 16                     \n"
			
 
				-      "sll              $t9, $t9, 16                     \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "or               $t8, $t8, $t9                    \n"
			
 
				-      "precr.qb.ph      $s0, $t8, $t0                    \n"
			
 
				-      "lbux             $t0, $t4(%[src])                 \n"
			
 
				-      "lbux             $t1, $t5(%[src])                 \n"
			
 
				-      "lbux             $t8, $t6(%[src])                 \n"
			
 
				-      "lbux             $t9, $t7(%[src])                 \n"
			
 
				-      "sll              $t1, $t1, 16                     \n"
			
 
				-      "sll              $t9, $t9, 16                     \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "or               $t8, $t8, $t9                    \n"
			
 
				-      "precr.qb.ph      $s1, $t8, $t0                    \n"
			
 
				-      "swr              $s0, 0(%[dst])                   \n"
			
 
				-      "swl              $s0, 3(%[dst])                   \n"
			
 
				-      "addiu            %[width], -1                     \n"
			
 
				-      "addiu            %[src], 1                        \n"
			
 
				-      "swr              $s1, 4(%[dst])                   \n"
			
 
				-      "swl              $s1, 7(%[dst])                   \n"
			
 
				-      "bnez             %[width], 11b                    \n"
			
 
				-       "addu             %[dst], %[dst], %[dst_stride]   \n"
			
 
				-    "2:                                                  \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      :[src] "+r" (src),
			
 
				-       [dst] "+r" (dst),
			
 
				-       [width] "+r" (width)
			
 
				-      :[src_stride] "r" (src_stride),
			
 
				-       [dst_stride] "r" (dst_stride)
			
 
				-      : "t0", "t1",  "t2", "t3", "t4", "t5",
			
 
				-        "t6", "t7", "t8", "t9",
			
 
				-        "s0", "s1"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                                  uint8* dst, int dst_stride,
			
 
				-                                  int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set noat                                         \n"
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-      "beqz             %[width], 2f                     \n"
			
 
				-      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
			
 
				-      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
			
 
				-      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
			
 
				-      "addu             $t3, $t2, %[src_stride]          \n"
			
 
				-      "addu             $t5, $t4, %[src_stride]          \n"
			
 
				-      "addu             $t6, $t2, $t4                    \n"
			
 
				-
			
 
				-      "srl              $AT, %[width], 0x2               \n"
			
 
				-      "andi             $t0, %[dst], 0x3                 \n"
			
 
				-      "andi             $t1, %[dst_stride], 0x3          \n"
			
 
				-      "or               $t0, $t0, $t1                    \n"
			
 
				-      "bnez             $t0, 11f                         \n"
			
 
				-      " subu            $t7, $t9, %[src_stride]          \n"
			
 
				-//dst + dst_stride word aligned
			
 
				-      "1:                                                \n"
			
 
				-      "lw               $t0, 0(%[src])                   \n"
			
 
				-      "lwx              $t1, %[src_stride](%[src])       \n"
			
 
				-      "lwx              $t8, $t2(%[src])                 \n"
			
 
				-      "lwx              $t9, $t3(%[src])                 \n"
			
 
				-
			
 
				-// t0 = | 30 | 20 | 10 | 00 |
			
 
				-// t1 = | 31 | 21 | 11 | 01 |
			
 
				-// t8 = | 32 | 22 | 12 | 02 |
			
 
				-// t9 = | 33 | 23 | 13 | 03 |
			
 
				-
			
 
				-      "precr.qb.ph     $s0, $t1, $t0                     \n"
			
 
				-      "precr.qb.ph     $s1, $t9, $t8                     \n"
			
 
				-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
			
 
				-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
			
 
				-
			
 
				-  // s0 = | 21 | 01 | 20 | 00 |
			
 
				-  // s1 = | 23 | 03 | 22 | 02 |
			
 
				-  // s2 = | 31 | 11 | 30 | 10 |
			
 
				-  // s3 = | 33 | 13 | 32 | 12 |
			
 
				-
			
 
				-      "precr.qb.ph     $s4, $s1, $s0                     \n"
			
 
				-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
			
 
				-      "precr.qb.ph     $s6, $s3, $s2                     \n"
			
 
				-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
			
 
				-
			
 
				-  // s4 = | 03 | 02 | 01 | 00 |
			
 
				-  // s5 = | 23 | 22 | 21 | 20 |
			
 
				-  // s6 = | 13 | 12 | 11 | 10 |
			
 
				-  // s7 = | 33 | 32 | 31 | 30 |
			
 
				-
			
 
				-      "lwx              $t0, $t4(%[src])                 \n"
			
 
				-      "lwx              $t1, $t5(%[src])                 \n"
			
 
				-      "lwx              $t8, $t6(%[src])                 \n"
			
 
				-      "lwx              $t9, $t7(%[src])                 \n"
			
 
				-
			
 
				-// t0 = | 34 | 24 | 14 | 04 |
			
 
				-// t1 = | 35 | 25 | 15 | 05 |
			
 
				-// t8 = | 36 | 26 | 16 | 06 |
			
 
				-// t9 = | 37 | 27 | 17 | 07 |
			
 
				-
			
 
				-      "precr.qb.ph     $s0, $t1, $t0                     \n"
			
 
				-      "precr.qb.ph     $s1, $t9, $t8                     \n"
			
 
				-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
			
 
				-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
			
 
				-
			
 
				-  // s0 = | 25 | 05 | 24 | 04 |
			
 
				-  // s1 = | 27 | 07 | 26 | 06 |
			
 
				-  // s2 = | 35 | 15 | 34 | 14 |
			
 
				-  // s3 = | 37 | 17 | 36 | 16 |
			
 
				-
			
 
				-      "precr.qb.ph     $t0, $s1, $s0                     \n"
			
 
				-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
			
 
				-      "precr.qb.ph     $t8, $s3, $s2                     \n"
			
 
				-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
			
 
				-
			
 
				-  // t0 = | 07 | 06 | 05 | 04 |
			
 
				-  // t1 = | 27 | 26 | 25 | 24 |
			
 
				-  // t8 = | 17 | 16 | 15 | 14 |
			
 
				-  // t9 = | 37 | 36 | 35 | 34 |
			
 
				-
			
 
				-      "addu            $s0, %[dst], %[dst_stride]        \n"
			
 
				-      "addu            $s1, $s0, %[dst_stride]           \n"
			
 
				-      "addu            $s2, $s1, %[dst_stride]           \n"
			
 
				-
			
 
				-      "sw              $s4, 0(%[dst])                    \n"
			
 
				-      "sw              $t0, 4(%[dst])                    \n"
			
 
				-      "sw              $s6, 0($s0)                       \n"
			
 
				-      "sw              $t8, 4($s0)                       \n"
			
 
				-      "sw              $s5, 0($s1)                       \n"
			
 
				-      "sw              $t1, 4($s1)                       \n"
			
 
				-      "sw              $s7, 0($s2)                       \n"
			
 
				-      "sw              $t9, 4($s2)                       \n"
			
 
				-
			
 
				-      "addiu            $AT, -1                          \n"
			
 
				-      "addiu            %[src], 4                        \n"
			
 
				-
			
 
				-      "bnez             $AT, 1b                          \n"
			
 
				-      " addu            %[dst], $s2, %[dst_stride]       \n"
			
 
				-      "b                2f                               \n"
			
 
				-//dst + dst_stride unaligned
			
 
				-      "11:                                               \n"
			
 
				-      "lw               $t0, 0(%[src])                   \n"
			
 
				-      "lwx              $t1, %[src_stride](%[src])       \n"
			
 
				-      "lwx              $t8, $t2(%[src])                 \n"
			
 
				-      "lwx              $t9, $t3(%[src])                 \n"
			
 
				-
			
 
				-// t0 = | 30 | 20 | 10 | 00 |
			
 
				-// t1 = | 31 | 21 | 11 | 01 |
			
 
				-// t8 = | 32 | 22 | 12 | 02 |
			
 
				-// t9 = | 33 | 23 | 13 | 03 |
			
 
				-
			
 
				-      "precr.qb.ph     $s0, $t1, $t0                     \n"
			
 
				-      "precr.qb.ph     $s1, $t9, $t8                     \n"
			
 
				-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
			
 
				-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
			
 
				-
			
 
				-  // s0 = | 21 | 01 | 20 | 00 |
			
 
				-  // s1 = | 23 | 03 | 22 | 02 |
			
 
				-  // s2 = | 31 | 11 | 30 | 10 |
			
 
				-  // s3 = | 33 | 13 | 32 | 12 |
			
 
				-
			
 
				-      "precr.qb.ph     $s4, $s1, $s0                     \n"
			
 
				-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
			
 
				-      "precr.qb.ph     $s6, $s3, $s2                     \n"
			
 
				-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
			
 
				-
			
 
				-  // s4 = | 03 | 02 | 01 | 00 |
			
 
				-  // s5 = | 23 | 22 | 21 | 20 |
			
 
				-  // s6 = | 13 | 12 | 11 | 10 |
			
 
				-  // s7 = | 33 | 32 | 31 | 30 |
			
 
				-
			
 
				-      "lwx              $t0, $t4(%[src])                 \n"
			
 
				-      "lwx              $t1, $t5(%[src])                 \n"
			
 
				-      "lwx              $t8, $t6(%[src])                 \n"
			
 
				-      "lwx              $t9, $t7(%[src])                 \n"
			
 
				-
			
 
				-// t0 = | 34 | 24 | 14 | 04 |
			
 
				-// t1 = | 35 | 25 | 15 | 05 |
			
 
				-// t8 = | 36 | 26 | 16 | 06 |
			
 
				-// t9 = | 37 | 27 | 17 | 07 |
			
 
				-
			
 
				-      "precr.qb.ph     $s0, $t1, $t0                     \n"
			
 
				-      "precr.qb.ph     $s1, $t9, $t8                     \n"
			
 
				-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
			
 
				-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
			
 
				-
			
 
				-  // s0 = | 25 | 05 | 24 | 04 |
			
 
				-  // s1 = | 27 | 07 | 26 | 06 |
			
 
				-  // s2 = | 35 | 15 | 34 | 14 |
			
 
				-  // s3 = | 37 | 17 | 36 | 16 |
			
 
				-
			
 
				-      "precr.qb.ph     $t0, $s1, $s0                     \n"
			
 
				-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
			
 
				-      "precr.qb.ph     $t8, $s3, $s2                     \n"
			
 
				-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
			
 
				-
			
 
				-  // t0 = | 07 | 06 | 05 | 04 |
			
 
				-  // t1 = | 27 | 26 | 25 | 24 |
			
 
				-  // t8 = | 17 | 16 | 15 | 14 |
			
 
				-  // t9 = | 37 | 36 | 35 | 34 |
			
 
				-
			
 
				-      "addu            $s0, %[dst], %[dst_stride]        \n"
			
 
				-      "addu            $s1, $s0, %[dst_stride]           \n"
			
 
				-      "addu            $s2, $s1, %[dst_stride]           \n"
			
 
				-
			
 
				-      "swr              $s4, 0(%[dst])                   \n"
			
 
				-      "swl              $s4, 3(%[dst])                   \n"
			
 
				-      "swr              $t0, 4(%[dst])                   \n"
			
 
				-      "swl              $t0, 7(%[dst])                   \n"
			
 
				-      "swr              $s6, 0($s0)                      \n"
			
 
				-      "swl              $s6, 3($s0)                      \n"
			
 
				-      "swr              $t8, 4($s0)                      \n"
			
 
				-      "swl              $t8, 7($s0)                      \n"
			
 
				-      "swr              $s5, 0($s1)                      \n"
			
 
				-      "swl              $s5, 3($s1)                      \n"
			
 
				-      "swr              $t1, 4($s1)                      \n"
			
 
				-      "swl              $t1, 7($s1)                      \n"
			
 
				-      "swr              $s7, 0($s2)                      \n"
			
 
				-      "swl              $s7, 3($s2)                      \n"
			
 
				-      "swr              $t9, 4($s2)                      \n"
			
 
				-      "swl              $t9, 7($s2)                      \n"
			
 
				-
			
 
				-      "addiu            $AT, -1                          \n"
			
 
				-      "addiu            %[src], 4                        \n"
			
 
				-
			
 
				-      "bnez             $AT, 11b                         \n"
			
 
				-      " addu            %[dst], $s2, %[dst_stride]       \n"
			
 
				-      "2:                                                \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      ".set at                                           \n"
			
 
				-      :[src] "+r" (src),
			
 
				-       [dst] "+r" (dst),
			
 
				-       [width] "+r" (width)
			
 
				-      :[src_stride] "r" (src_stride),
			
 
				-       [dst_stride] "r" (dst_stride)
			
 
				-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
			
 
				-        "t6", "t7", "t8", "t9",
			
 
				-        "s0", "s1", "s2", "s3", "s4",
			
 
				-        "s5", "s6", "s7"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
			
 
				-                               uint8* dst_a, int dst_stride_a,
			
 
				-                               uint8* dst_b, int dst_stride_b,
			
 
				-                               int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-      "beqz            %[width], 2f                      \n"
			
 
				-      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
			
 
				-      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
			
 
				-      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
			
 
				-      "addu            $t3, $t2, %[src_stride]           \n"
			
 
				-      "addu            $t5, $t4, %[src_stride]           \n"
			
 
				-      "addu            $t6, $t2, $t4                     \n"
			
 
				-      "subu            $t7, $t9, %[src_stride]           \n"
			
 
				-      "srl             $t1, %[width], 1                  \n"
			
 
				-
			
 
				-// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
			
 
				-      "andi            $t0, %[dst_a], 0x3                \n"
			
 
				-      "andi            $t8, %[dst_b], 0x3                \n"
			
 
				-      "or              $t0, $t0, $t8                     \n"
			
 
				-      "andi            $t8, %[dst_stride_a], 0x3         \n"
			
 
				-      "andi            $s5, %[dst_stride_b], 0x3         \n"
			
 
				-      "or              $t8, $t8, $s5                     \n"
			
 
				-      "or              $t0, $t0, $t8                     \n"
			
 
				-      "bnez            $t0, 11f                          \n"
			
 
				-      " nop                                              \n"
			
 
				-// dst + dst_stride word aligned (both, a & b dst addresses)
			
 
				-    "1:                                                  \n"
			
 
				-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
			
 
				-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
			
 
				-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
			
 
				-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
			
 
				-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
			
 
				-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
			
 
				-
			
 
				-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
			
 
				-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
			
 
				-
			
 
				-      "sll             $t0, $t0, 16                      \n"
			
 
				-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
			
 
				-      "sll             $t9, $t9, 16                      \n"
			
 
				-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
			
 
				-
			
 
				-      "sw              $s3, 0($s5)                       \n"
			
 
				-      "sw              $s4, 0($s6)                       \n"
			
 
				-
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
			
 
				-
			
 
				-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
			
 
				-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
			
 
				-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
			
 
				-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
			
 
				-      "sw              $s3, 0(%[dst_a])                  \n"
			
 
				-      "sw              $s4, 0(%[dst_b])                  \n"
			
 
				-
			
 
				-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
			
 
				-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
			
 
				-
			
 
				-      "sll             $t0, $t0, 16                      \n"
			
 
				-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
			
 
				-      "sll             $t9, $t9, 16                      \n"
			
 
				-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
			
 
				-      "sw              $s3, 4($s5)                       \n"
			
 
				-      "sw              $s4, 4($s6)                       \n"
			
 
				-
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
			
 
				-
			
 
				-      "addiu           %[src], 4                         \n"
			
 
				-      "addiu           $t1, -1                           \n"
			
 
				-      "sll             $t0, %[dst_stride_a], 1           \n"
			
 
				-      "sll             $t8, %[dst_stride_b], 1           \n"
			
 
				-      "sw              $s3, 4(%[dst_a])                  \n"
			
 
				-      "sw              $s4, 4(%[dst_b])                  \n"
			
 
				-      "addu            %[dst_a], %[dst_a], $t0           \n"
			
 
				-      "bnez            $t1, 1b                           \n"
			
 
				-      " addu           %[dst_b], %[dst_b], $t8           \n"
			
 
				-      "b               2f                                \n"
			
 
				-      " nop                                              \n"
			
 
				-
			
 
				-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
			
 
				-   "11:                                                  \n"
			
 
				-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
			
 
				-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
			
 
				-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
			
 
				-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
			
 
				-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
			
 
				-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
			
 
				-
			
 
				-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
			
 
				-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
			
 
				-
			
 
				-      "sll             $t0, $t0, 16                      \n"
			
 
				-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
			
 
				-      "sll             $t9, $t9, 16                      \n"
			
 
				-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
			
 
				-
			
 
				-      "swr             $s3, 0($s5)                       \n"
			
 
				-      "swl             $s3, 3($s5)                       \n"
			
 
				-      "swr             $s4, 0($s6)                       \n"
			
 
				-      "swl             $s4, 3($s6)                       \n"
			
 
				-
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
			
 
				-
			
 
				-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
			
 
				-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
			
 
				-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
			
 
				-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
			
 
				-      "swr             $s3, 0(%[dst_a])                  \n"
			
 
				-      "swl             $s3, 3(%[dst_a])                  \n"
			
 
				-      "swr             $s4, 0(%[dst_b])                  \n"
			
 
				-      "swl             $s4, 3(%[dst_b])                  \n"
			
 
				-
			
 
				-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
			
 
				-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
			
 
				-
			
 
				-      "sll             $t0, $t0, 16                      \n"
			
 
				-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
			
 
				-      "sll             $t9, $t9, 16                      \n"
			
 
				-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
			
 
				-
			
 
				-      "swr             $s3, 4($s5)                       \n"
			
 
				-      "swl             $s3, 7($s5)                       \n"
			
 
				-      "swr             $s4, 4($s6)                       \n"
			
 
				-      "swl             $s4, 7($s6)                       \n"
			
 
				-
			
 
				-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
			
 
				-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
			
 
				-
			
 
				-      "addiu           %[src], 4                         \n"
			
 
				-      "addiu           $t1, -1                           \n"
			
 
				-      "sll             $t0, %[dst_stride_a], 1           \n"
			
 
				-      "sll             $t8, %[dst_stride_b], 1           \n"
			
 
				-      "swr             $s3, 4(%[dst_a])                  \n"
			
 
				-      "swl             $s3, 7(%[dst_a])                  \n"
			
 
				-      "swr             $s4, 4(%[dst_b])                  \n"
			
 
				-      "swl             $s4, 7(%[dst_b])                  \n"
			
 
				-      "addu            %[dst_a], %[dst_a], $t0           \n"
			
 
				-      "bnez            $t1, 11b                          \n"
			
 
				-      " addu           %[dst_b], %[dst_b], $t8           \n"
			
 
				-
			
 
				-      "2:                                                \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      : [src] "+r" (src),
			
 
				-        [dst_a] "+r" (dst_a),
			
 
				-        [dst_b] "+r" (dst_b),
			
 
				-        [width] "+r" (width),
			
 
				-        [src_stride] "+r" (src_stride)
			
 
				-      : [dst_stride_a] "r" (dst_stride_a),
			
 
				-        [dst_stride_b] "r" (dst_stride_b)
			
 
				-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
			
 
				-        "t6", "t7", "t8", "t9",
			
 
				-        "s0", "s1", "s2", "s3",
			
 
				-        "s4", "s5", "s6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/rotate_neon.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/rotate_neon.cc
@@ -1,412 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
			
 
				-static uvec8 kVTbl4x4Transpose =
			
 
				-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
			
 
				-
			
 
				-void TransposeWx8_NEON(const uint8* src, int src_stride,
			
 
				-                       uint8* dst, int dst_stride,
			
 
				-                       int width) {
			
 
				-  const uint8* src_temp = NULL;
			
 
				-  asm volatile (
			
 
				-    // loops are on blocks of 8. loop will stop when
			
 
				-    // counter gets to or below 0. starting the counter
			
 
				-    // at w-8 allow for this
			
 
				-#ifdef _ANDROID
			
 
				-				".fpu neon\n"
			
 
				-#endif
			
 
				-    "sub         %5, #8                        \n"
			
 
				-
			
 
				-    // handle 8x8 blocks. this should be the majority of the plane
			
 
				-    ".p2align  2                               \n"
			
 
				-    "1:                                        \n"
			
 
				-      "mov         %0, %1                      \n"
			
 
				-
			
 
				-      "vld1.8      {d0}, [%0], %2              \n"
			
 
				-      "vld1.8      {d1}, [%0], %2              \n"
			
 
				-      "vld1.8      {d2}, [%0], %2              \n"
			
 
				-      "vld1.8      {d3}, [%0], %2              \n"
			
 
				-      "vld1.8      {d4}, [%0], %2              \n"
			
 
				-      "vld1.8      {d5}, [%0], %2              \n"
			
 
				-      "vld1.8      {d6}, [%0], %2              \n"
			
 
				-      "vld1.8      {d7}, [%0]                  \n"
			
 
				-
			
 
				-      "vtrn.8      d1, d0                      \n"
			
 
				-      "vtrn.8      d3, d2                      \n"
			
 
				-      "vtrn.8      d5, d4                      \n"
			
 
				-      "vtrn.8      d7, d6                      \n"
			
 
				-
			
 
				-      "vtrn.16     d1, d3                      \n"
			
 
				-      "vtrn.16     d0, d2                      \n"
			
 
				-      "vtrn.16     d5, d7                      \n"
			
 
				-      "vtrn.16     d4, d6                      \n"
			
 
				-
			
 
				-      "vtrn.32     d1, d5                      \n"
			
 
				-      "vtrn.32     d0, d4                      \n"
			
 
				-      "vtrn.32     d3, d7                      \n"
			
 
				-      "vtrn.32     d2, d6                      \n"
			
 
				-
			
 
				-      "vrev16.8    q0, q0                      \n"
			
 
				-      "vrev16.8    q1, q1                      \n"
			
 
				-      "vrev16.8    q2, q2                      \n"
			
 
				-      "vrev16.8    q3, q3                      \n"
			
 
				-
			
 
				-      "mov         %0, %3                      \n"
			
 
				-
			
 
				-      "vst1.8      {d1}, [%0], %4              \n"
			
 
				-      "vst1.8      {d0}, [%0], %4              \n"
			
 
				-      "vst1.8      {d3}, [%0], %4              \n"
			
 
				-      "vst1.8      {d2}, [%0], %4              \n"
			
 
				-      "vst1.8      {d5}, [%0], %4              \n"
			
 
				-      "vst1.8      {d4}, [%0], %4              \n"
			
 
				-      "vst1.8      {d7}, [%0], %4              \n"
			
 
				-      "vst1.8      {d6}, [%0]                  \n"
			
 
				-
			
 
				-      "add         %1, #8                      \n"  // src += 8
			
 
				-      "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
			
 
				-      "subs        %5,  #8                     \n"  // w   -= 8
			
 
				-      "bge         1b                          \n"
			
 
				-
			
 
				-    // add 8 back to counter. if the result is 0 there are
			
 
				-    // no residuals.
			
 
				-    "adds        %5, #8                        \n"
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // some residual, so between 1 and 7 lines left to transpose
			
 
				-    "cmp         %5, #2                        \n"
			
 
				-    "blt         3f                            \n"
			
 
				-
			
 
				-    "cmp         %5, #4                        \n"
			
 
				-    "blt         2f                            \n"
			
 
				-
			
 
				-    // 4x8 block
			
 
				-    "mov         %0, %1                        \n"
			
 
				-    "vld1.32     {d0[0]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d0[1]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d1[0]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d1[1]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d2[0]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d2[1]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d3[0]}, [%0], %2             \n"
			
 
				-    "vld1.32     {d3[1]}, [%0]                 \n"
			
 
				-
			
 
				-    "mov         %0, %3                        \n"
			
 
				-
			
 
				-    "vld1.8      {q3}, [%6]                    \n"
			
 
				-
			
 
				-    "vtbl.8      d4, {d0, d1}, d6              \n"
			
 
				-    "vtbl.8      d5, {d0, d1}, d7              \n"
			
 
				-    "vtbl.8      d0, {d2, d3}, d6              \n"
			
 
				-    "vtbl.8      d1, {d2, d3}, d7              \n"
			
 
				-
			
 
				-    // TODO(frkoenig): Rework shuffle above to
			
 
				-    // write out with 4 instead of 8 writes.
			
 
				-    "vst1.32     {d4[0]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d4[1]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d5[0]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d5[1]}, [%0]                 \n"
			
 
				-
			
 
				-    "add         %0, %3, #4                    \n"
			
 
				-    "vst1.32     {d0[0]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d0[1]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d1[0]}, [%0], %4             \n"
			
 
				-    "vst1.32     {d1[1]}, [%0]                 \n"
			
 
				-
			
 
				-    "add         %1, #4                        \n"  // src += 4
			
 
				-    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
			
 
				-    "subs        %5,  #4                       \n"  // w   -= 4
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // some residual, check to see if it includes a 2x8 block,
			
 
				-    // or less
			
 
				-    "cmp         %5, #2                        \n"
			
 
				-    "blt         3f                            \n"
			
 
				-
			
 
				-    // 2x8 block
			
 
				-    "2:                                        \n"
			
 
				-    "mov         %0, %1                        \n"
			
 
				-    "vld1.16     {d0[0]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d1[0]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d0[1]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d1[1]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d0[2]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d1[2]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d0[3]}, [%0], %2             \n"
			
 
				-    "vld1.16     {d1[3]}, [%0]                 \n"
			
 
				-
			
 
				-    "vtrn.8      d0, d1                        \n"
			
 
				-
			
 
				-    "mov         %0, %3                        \n"
			
 
				-
			
 
				-    "vst1.64     {d0}, [%0], %4                \n"
			
 
				-    "vst1.64     {d1}, [%0]                    \n"
			
 
				-
			
 
				-    "add         %1, #2                        \n"  // src += 2
			
 
				-    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
			
 
				-    "subs        %5,  #2                       \n"  // w   -= 2
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // 1x8 block
			
 
				-    "3:                                        \n"
			
 
				-    "vld1.8      {d0[0]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[1]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[2]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[3]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[4]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[5]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[6]}, [%1], %2             \n"
			
 
				-    "vld1.8      {d0[7]}, [%1]                 \n"
			
 
				-
			
 
				-    "vst1.64     {d0}, [%3]                    \n"
			
 
				-
			
 
				-    "4:                                        \n"
			
 
				-
			
 
				-    : "+r"(src_temp),          // %0
			
 
				-      "+r"(src),               // %1
			
 
				-      "+r"(src_stride),        // %2
			
 
				-      "+r"(dst),               // %3
			
 
				-      "+r"(dst_stride),        // %4
			
 
				-      "+r"(width)              // %5
			
 
				-    : "r"(&kVTbl4x4Transpose)  // %6
			
 
				-    : "memory", "cc", "q0", "q1", "q2", "q3"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-static uvec8 kVTbl4x4TransposeDi =
			
 
				-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
			
 
				-
			
 
				-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
			
 
				-                         uint8* dst_a, int dst_stride_a,
			
 
				-                         uint8* dst_b, int dst_stride_b,
			
 
				-                         int width) {
			
 
				-  const uint8* src_temp = NULL;
			
 
				-  asm volatile (
			
 
				-    // loops are on blocks of 8. loop will stop when
			
 
				-    // counter gets to or below 0. starting the counter
			
 
				-    // at w-8 allow for this
			
 
				-    "sub         %7, #8                        \n"
			
 
				-
			
 
				-    // handle 8x8 blocks. this should be the majority of the plane
			
 
				-    ".p2align  2                               \n"
			
 
				-    "1:                                        \n"
			
 
				-      "mov         %0, %1                      \n"
			
 
				-
			
 
				-      "vld2.8      {d0,  d1},  [%0], %2        \n"
			
 
				-      "vld2.8      {d2,  d3},  [%0], %2        \n"
			
 
				-      "vld2.8      {d4,  d5},  [%0], %2        \n"
			
 
				-      "vld2.8      {d6,  d7},  [%0], %2        \n"
			
 
				-      "vld2.8      {d16, d17}, [%0], %2        \n"
			
 
				-      "vld2.8      {d18, d19}, [%0], %2        \n"
			
 
				-      "vld2.8      {d20, d21}, [%0], %2        \n"
			
 
				-      "vld2.8      {d22, d23}, [%0]            \n"
			
 
				-
			
 
				-      "vtrn.8      q1, q0                      \n"
			
 
				-      "vtrn.8      q3, q2                      \n"
			
 
				-      "vtrn.8      q9, q8                      \n"
			
 
				-      "vtrn.8      q11, q10                    \n"
			
 
				-
			
 
				-      "vtrn.16     q1, q3                      \n"
			
 
				-      "vtrn.16     q0, q2                      \n"
			
 
				-      "vtrn.16     q9, q11                     \n"
			
 
				-      "vtrn.16     q8, q10                     \n"
			
 
				-
			
 
				-      "vtrn.32     q1, q9                      \n"
			
 
				-      "vtrn.32     q0, q8                      \n"
			
 
				-      "vtrn.32     q3, q11                     \n"
			
 
				-      "vtrn.32     q2, q10                     \n"
			
 
				-
			
 
				-      "vrev16.8    q0, q0                      \n"
			
 
				-      "vrev16.8    q1, q1                      \n"
			
 
				-      "vrev16.8    q2, q2                      \n"
			
 
				-      "vrev16.8    q3, q3                      \n"
			
 
				-      "vrev16.8    q8, q8                      \n"
			
 
				-      "vrev16.8    q9, q9                      \n"
			
 
				-      "vrev16.8    q10, q10                    \n"
			
 
				-      "vrev16.8    q11, q11                    \n"
			
 
				-
			
 
				-      "mov         %0, %3                      \n"
			
 
				-
			
 
				-      "vst1.8      {d2},  [%0], %4             \n"
			
 
				-      "vst1.8      {d0},  [%0], %4             \n"
			
 
				-      "vst1.8      {d6},  [%0], %4             \n"
			
 
				-      "vst1.8      {d4},  [%0], %4             \n"
			
 
				-      "vst1.8      {d18}, [%0], %4             \n"
			
 
				-      "vst1.8      {d16}, [%0], %4             \n"
			
 
				-      "vst1.8      {d22}, [%0], %4             \n"
			
 
				-      "vst1.8      {d20}, [%0]                 \n"
			
 
				-
			
 
				-      "mov         %0, %5                      \n"
			
 
				-
			
 
				-      "vst1.8      {d3},  [%0], %6             \n"
			
 
				-      "vst1.8      {d1},  [%0], %6             \n"
			
 
				-      "vst1.8      {d7},  [%0], %6             \n"
			
 
				-      "vst1.8      {d5},  [%0], %6             \n"
			
 
				-      "vst1.8      {d19}, [%0], %6             \n"
			
 
				-      "vst1.8      {d17}, [%0], %6             \n"
			
 
				-      "vst1.8      {d23}, [%0], %6             \n"
			
 
				-      "vst1.8      {d21}, [%0]                 \n"
			
 
				-
			
 
				-      "add         %1, #8*2                    \n"  // src   += 8*2
			
 
				-      "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
			
 
				-      "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
			
 
				-      "subs        %7,  #8                     \n"  // w     -= 8
			
 
				-      "bge         1b                          \n"
			
 
				-
			
 
				-    // add 8 back to counter. if the result is 0 there are
			
 
				-    // no residuals.
			
 
				-    "adds        %7, #8                        \n"
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // some residual, so between 1 and 7 lines left to transpose
			
 
				-    "cmp         %7, #2                        \n"
			
 
				-    "blt         3f                            \n"
			
 
				-
			
 
				-    "cmp         %7, #4                        \n"
			
 
				-    "blt         2f                            \n"
			
 
				-
			
 
				-    //TODO(frkoenig): Clean this up
			
 
				-    // 4x8 block
			
 
				-    "mov         %0, %1                        \n"
			
 
				-    "vld1.64     {d0}, [%0], %2                \n"
			
 
				-    "vld1.64     {d1}, [%0], %2                \n"
			
 
				-    "vld1.64     {d2}, [%0], %2                \n"
			
 
				-    "vld1.64     {d3}, [%0], %2                \n"
			
 
				-    "vld1.64     {d4}, [%0], %2                \n"
			
 
				-    "vld1.64     {d5}, [%0], %2                \n"
			
 
				-    "vld1.64     {d6}, [%0], %2                \n"
			
 
				-    "vld1.64     {d7}, [%0]                    \n"
			
 
				-
			
 
				-    "vld1.8      {q15}, [%8]                   \n"
			
 
				-
			
 
				-    "vtrn.8      q0, q1                        \n"
			
 
				-    "vtrn.8      q2, q3                        \n"
			
 
				-
			
 
				-    "vtbl.8      d16, {d0, d1}, d30            \n"
			
 
				-    "vtbl.8      d17, {d0, d1}, d31            \n"
			
 
				-    "vtbl.8      d18, {d2, d3}, d30            \n"
			
 
				-    "vtbl.8      d19, {d2, d3}, d31            \n"
			
 
				-    "vtbl.8      d20, {d4, d5}, d30            \n"
			
 
				-    "vtbl.8      d21, {d4, d5}, d31            \n"
			
 
				-    "vtbl.8      d22, {d6, d7}, d30            \n"
			
 
				-    "vtbl.8      d23, {d6, d7}, d31            \n"
			
 
				-
			
 
				-    "mov         %0, %3                        \n"
			
 
				-
			
 
				-    "vst1.32     {d16[0]},  [%0], %4           \n"
			
 
				-    "vst1.32     {d16[1]},  [%0], %4           \n"
			
 
				-    "vst1.32     {d17[0]},  [%0], %4           \n"
			
 
				-    "vst1.32     {d17[1]},  [%0], %4           \n"
			
 
				-
			
 
				-    "add         %0, %3, #4                    \n"
			
 
				-    "vst1.32     {d20[0]}, [%0], %4            \n"
			
 
				-    "vst1.32     {d20[1]}, [%0], %4            \n"
			
 
				-    "vst1.32     {d21[0]}, [%0], %4            \n"
			
 
				-    "vst1.32     {d21[1]}, [%0]                \n"
			
 
				-
			
 
				-    "mov         %0, %5                        \n"
			
 
				-
			
 
				-    "vst1.32     {d18[0]}, [%0], %6            \n"
			
 
				-    "vst1.32     {d18[1]}, [%0], %6            \n"
			
 
				-    "vst1.32     {d19[0]}, [%0], %6            \n"
			
 
				-    "vst1.32     {d19[1]}, [%0], %6            \n"
			
 
				-
			
 
				-    "add         %0, %5, #4                    \n"
			
 
				-    "vst1.32     {d22[0]},  [%0], %6           \n"
			
 
				-    "vst1.32     {d22[1]},  [%0], %6           \n"
			
 
				-    "vst1.32     {d23[0]},  [%0], %6           \n"
			
 
				-    "vst1.32     {d23[1]},  [%0]               \n"
			
 
				-
			
 
				-    "add         %1, #4*2                      \n"  // src   += 4 * 2
			
 
				-    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
			
 
				-    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
			
 
				-    "subs        %7,  #4                       \n"  // w     -= 4
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // some residual, check to see if it includes a 2x8 block,
			
 
				-    // or less
			
 
				-    "cmp         %7, #2                        \n"
			
 
				-    "blt         3f                            \n"
			
 
				-
			
 
				-    // 2x8 block
			
 
				-    "2:                                        \n"
			
 
				-    "mov         %0, %1                        \n"
			
 
				-    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
			
 
				-    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
			
 
				-
			
 
				-    "vtrn.8      d0, d1                        \n"
			
 
				-    "vtrn.8      d2, d3                        \n"
			
 
				-
			
 
				-    "mov         %0, %3                        \n"
			
 
				-
			
 
				-    "vst1.64     {d0}, [%0], %4                \n"
			
 
				-    "vst1.64     {d2}, [%0]                    \n"
			
 
				-
			
 
				-    "mov         %0, %5                        \n"
			
 
				-
			
 
				-    "vst1.64     {d1}, [%0], %6                \n"
			
 
				-    "vst1.64     {d3}, [%0]                    \n"
			
 
				-
			
 
				-    "add         %1, #2*2                      \n"  // src   += 2 * 2
			
 
				-    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
			
 
				-    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
			
 
				-    "subs        %7,  #2                       \n"  // w     -= 2
			
 
				-    "beq         4f                            \n"
			
 
				-
			
 
				-    // 1x8 block
			
 
				-    "3:                                        \n"
			
 
				-    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
			
 
				-    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
			
 
				-
			
 
				-    "vst1.64     {d0}, [%3]                    \n"
			
 
				-    "vst1.64     {d1}, [%5]                    \n"
			
 
				-
			
 
				-    "4:                                        \n"
			
 
				-
			
 
				-    : "+r"(src_temp),            // %0
			
 
				-      "+r"(src),                 // %1
			
 
				-      "+r"(src_stride),          // %2
			
 
				-      "+r"(dst_a),               // %3
			
 
				-      "+r"(dst_stride_a),        // %4
			
 
				-      "+r"(dst_b),               // %5
			
 
				-      "+r"(dst_stride_b),        // %6
			
 
				-      "+r"(width)                // %7
			
 
				-    : "r"(&kVTbl4x4TransposeDi)  // %8
			
 
				-    : "memory", "cc",
			
 
				-      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
			
 
				-  );
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_any.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_any.cc
@@ -1,542 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
			
 
				-// TODO(fbarchard): Consider 'any' functions handling odd alignment.
			
 
				-// YUV to RGB does multiple of 8 with SIMD and remainder with C.
			
 
				-#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK)        \
			
 
				-    void NAMEANY(const uint8* y_buf,                                           \
			
 
				-                 const uint8* u_buf,                                           \
			
 
				-                 const uint8* v_buf,                                           \
			
 
				-                 uint8* rgb_buf,                                               \
			
 
				-                 int width) {                                                  \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n);                         \
			
 
				-      I420TORGB_C(y_buf + n,                                                   \
			
 
				-                  u_buf + (n >> UV_SHIFT),                                     \
			
 
				-                  v_buf + (n >> UV_SHIFT),                                     \
			
 
				-                  rgb_buf + n * BPP, width & MASK);                            \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_I422TOARGBROW_SSSE3
			
 
				-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
			
 
				-     0, 4, 7)
			
 
				-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
			
 
				-     1, 4, 7)
			
 
				-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
			
 
				-     2, 4, 7)
			
 
				-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
			
 
				-     1, 4, 7)
			
 
				-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
			
 
				-     1, 4, 7)
			
 
				-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
			
 
				-     1, 4, 7)
			
 
				-// I422ToRGB565Row_SSSE3 is unaligned.
			
 
				-YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
			
 
				-     1, 2, 7)
			
 
				-YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
			
 
				-     1, 2, 7)
			
 
				-YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
			
 
				-     1, 2, 7)
			
 
				-// I422ToRGB24Row_SSSE3 is unaligned.
			
 
				-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
			
 
				-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
			
 
				-YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
			
 
				-YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
			
 
				-#endif  // HAS_I422TOARGBROW_SSSE3
			
 
				-#ifdef HAS_I422TOARGBROW_AVX2
			
 
				-YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
			
 
				-#endif  // HAS_I422TOARGBROW_AVX2
			
 
				-#ifdef HAS_I422TOARGBROW_NEON
			
 
				-YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
			
 
				-YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
			
 
				-YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
			
 
				-YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
			
 
				-YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
			
 
				-YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
			
 
				-YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
			
 
				-YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
			
 
				-YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
			
 
				-     1, 2, 7)
			
 
				-YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
			
 
				-     1, 2, 7)
			
 
				-YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
			
 
				-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
			
 
				-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
			
 
				-#endif  // HAS_I422TOARGBROW_NEON
			
 
				-#undef YANY
			
 
				-
			
 
				-// Wrappers to handle odd width
			
 
				-#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP)             \
			
 
				-    void NAMEANY(const uint8* y_buf,                                           \
			
 
				-                 const uint8* uv_buf,                                          \
			
 
				-                 uint8* rgb_buf,                                               \
			
 
				-                 int width) {                                                  \
			
 
				-      int n = width & ~7;                                                      \
			
 
				-      NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n);                               \
			
 
				-      NV12TORGB_C(y_buf + n,                                                   \
			
 
				-                  uv_buf + (n >> UV_SHIFT),                                    \
			
 
				-                  rgb_buf + n * BPP, width & 7);                               \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_NV12TOARGBROW_SSSE3
			
 
				-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
			
 
				-      0, 4)
			
 
				-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
			
 
				-      0, 4)
			
 
				-#endif  // HAS_NV12TOARGBROW_SSSE3
			
 
				-#ifdef HAS_NV12TOARGBROW_NEON
			
 
				-NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
			
 
				-NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
			
 
				-#endif  // HAS_NV12TOARGBROW_NEON
			
 
				-#ifdef HAS_NV12TORGB565ROW_SSSE3
			
 
				-NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
			
 
				-      0, 2)
			
 
				-NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
			
 
				-      0, 2)
			
 
				-#endif  // HAS_NV12TORGB565ROW_SSSE3
			
 
				-#ifdef HAS_NV12TORGB565ROW_NEON
			
 
				-NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
			
 
				-NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
			
 
				-#endif  // HAS_NV12TORGB565ROW_NEON
			
 
				-#undef NVANY
			
 
				-
			
 
				-#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)          \
			
 
				-    void NAMEANY(const uint8* src,                                             \
			
 
				-                 uint8* dst,                                                   \
			
 
				-                 int width) {                                                  \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ARGBTORGB_SIMD(src, dst, n);                                             \
			
 
				-      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK);                \
			
 
				-    }
			
 
				-
			
 
				-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
			
 
				-RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
			
 
				-       15, 4, 3)
			
 
				-RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
			
 
				-       15, 4, 3)
			
 
				-RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
			
 
				-       3, 4, 2)
			
 
				-RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
			
 
				-       3, 4, 2)
			
 
				-RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
			
 
				-       3, 4, 2)
			
 
				-#endif
			
 
				-#if defined(HAS_I400TOARGBROW_SSE2)
			
 
				-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
			
 
				-       7, 1, 4)
			
 
				-#endif
			
 
				-#if defined(HAS_YTOARGBROW_SSE2)
			
 
				-RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
			
 
				-       7, 1, 4)
			
 
				-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
			
 
				-       15, 2, 4)
			
 
				-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
			
 
				-       15, 2, 4)
			
 
				-// These require alignment on ARGB, so C is used for remainder.
			
 
				-RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
			
 
				-       15, 3, 4)
			
 
				-RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
			
 
				-       15, 3, 4)
			
 
				-RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
			
 
				-       7, 2, 4)
			
 
				-RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
			
 
				-       7, 2, 4)
			
 
				-RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
			
 
				-       7, 2, 4)
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTORGB24ROW_NEON)
			
 
				-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
			
 
				-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
			
 
				-RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
			
 
				-       7, 4, 2)
			
 
				-RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
			
 
				-       7, 4, 2)
			
 
				-RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
			
 
				-       7, 4, 2)
			
 
				-RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
			
 
				-       7, 1, 4)
			
 
				-RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
			
 
				-       7, 1, 4)
			
 
				-RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
			
 
				-       7, 2, 4)
			
 
				-RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
			
 
				-       7, 2, 4)
			
 
				-#endif
			
 
				-#undef RGBANY
			
 
				-
			
 
				-// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
			
 
				-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)        \
			
 
				-    void NAMEANY(const uint8* src,                                             \
			
 
				-                 uint8* dst, uint32 selector,                                  \
			
 
				-                 int width) {                                                  \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ARGBTORGB_SIMD(src, dst, selector, n);                                   \
			
 
				-      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK);      \
			
 
				-    }
			
 
				-
			
 
				-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
			
 
				-BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
			
 
				-         7, 4, 1)
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOBAYERROW_NEON)
			
 
				-BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
			
 
				-         7, 4, 1)
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
			
 
				-BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
			
 
				-         7, 4, 1)
			
 
				-#endif
			
 
				-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
			
 
				-BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
			
 
				-         7, 4, 1)
			
 
				-#endif
			
 
				-
			
 
				-#undef BAYERANY
			
 
				-
			
 
				-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
			
 
				-#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM)                            \
			
 
				-    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
			
 
				-      ARGBTOY_SIMD(src_argb, dst_y, width - NUM);                              \
			
 
				-      ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP,                            \
			
 
				-                   dst_y + (width - NUM) * BPP, NUM);                          \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYROW_AVX2
			
 
				-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
			
 
				-YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
			
 
				-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
			
 
				-YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOYROW_SSSE3
			
 
				-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
			
 
				-#endif
			
 
				-#ifdef HAS_BGRATOYROW_SSSE3
			
 
				-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
			
 
				-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
			
 
				-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
			
 
				-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
			
 
				-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOYJROW_SSSE3
			
 
				-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOYROW_NEON
			
 
				-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
			
 
				-YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
			
 
				-YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
			
 
				-YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
			
 
				-YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
			
 
				-YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
			
 
				-YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
			
 
				-YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
			
 
				-YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
			
 
				-YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
			
 
				-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
			
 
				-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
			
 
				-YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
			
 
				-YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
			
 
				-YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
			
 
				-YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
			
 
				-YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
			
 
				-#endif
			
 
				-#undef YANY
			
 
				-
			
 
				-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
			
 
				-    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ARGBTOY_SIMD(src_argb, dst_y, n);                                        \
			
 
				-      ARGBTOY_C(src_argb + n * SBPP,                                           \
			
 
				-                dst_y  + n * BPP, width & MASK);                               \
			
 
				-    }
			
 
				-
			
 
				-// Attenuate is destructive so last16 method can not be used due to overlap.
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSSE3
			
 
				-YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
			
 
				-     4, 4, 3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSE2
			
 
				-YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
			
 
				-     4, 4, 3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
			
 
				-     4, 4, 3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBATTENUATEROW_AVX2
			
 
				-YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
			
 
				-     4, 4, 7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
			
 
				-YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
			
 
				-     4, 4, 7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBATTENUATEROW_NEON
			
 
				-YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
			
 
				-     4, 4, 7)
			
 
				-#endif
			
 
				-#undef YANY
			
 
				-
			
 
				-// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
			
 
				-#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK)                     \
			
 
				-    void NAMEANY(const uint8* src_argb, int src_stride_argb,                   \
			
 
				-                 uint8* dst_u, uint8* dst_v, int width) {                      \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n);                \
			
 
				-      ANYTOUV_C(src_argb  + n * BPP, src_stride_argb,                          \
			
 
				-                dst_u + (n >> 1),                                              \
			
 
				-                dst_v + (n >> 1),                                              \
			
 
				-                width & MASK);                                                 \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_ARGBTOUVROW_AVX2
			
 
				-UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
			
 
				-UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
			
 
				-UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOUVROW_SSSE3
			
 
				-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
			
 
				-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
			
 
				-      4, 15)
			
 
				-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
			
 
				-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
			
 
				-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
			
 
				-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
			
 
				-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOUVROW_NEON
			
 
				-UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
			
 
				-UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
			
 
				-UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
			
 
				-UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
			
 
				-UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
			
 
				-UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
			
 
				-UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
			
 
				-UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
			
 
				-UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
			
 
				-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
			
 
				-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
			
 
				-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
			
 
				-#endif
			
 
				-#undef UVANY
			
 
				-
			
 
				-#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT)           \
			
 
				-    void NAMEANY(const uint8* src_uv,                                          \
			
 
				-                 uint8* dst_u, uint8* dst_v, int width) {                      \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
			
 
				-      ANYTOUV_C(src_uv  + n * BPP,                                             \
			
 
				-                dst_u + (n >> SHIFT),                                          \
			
 
				-                dst_v + (n >> SHIFT),                                          \
			
 
				-                width & MASK);                                                 \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_ARGBTOUV444ROW_SSSE3
			
 
				-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
			
 
				-         ARGBToUV444Row_C, 4, 15, 0)
			
 
				-#endif
			
 
				-#ifdef HAS_YUY2TOUV422ROW_AVX2
			
 
				-UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
			
 
				-         YUY2ToUV422Row_C, 2, 31, 1)
			
 
				-UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
			
 
				-         UYVYToUV422Row_C, 2, 31, 1)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBTOUVROW_SSSE3
			
 
				-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
			
 
				-         ARGBToUV422Row_C, 4, 15, 1)
			
 
				-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
			
 
				-         YUY2ToUV422Row_C, 2, 15, 1)
			
 
				-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
			
 
				-         UYVYToUV422Row_C, 2, 15, 1)
			
 
				-#endif
			
 
				-#ifdef HAS_YUY2TOUV422ROW_NEON
			
 
				-UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
			
 
				-         ARGBToUV444Row_C, 4, 7, 0)
			
 
				-UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
			
 
				-         ARGBToUV422Row_C, 4, 15, 1)
			
 
				-UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
			
 
				-         ARGBToUV411Row_C, 4, 31, 2)
			
 
				-UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
			
 
				-         YUY2ToUV422Row_C, 2, 15, 1)
			
 
				-UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
			
 
				-         UYVYToUV422Row_C, 2, 15, 1)
			
 
				-#endif
			
 
				-#undef UV422ANY
			
 
				-
			
 
				-#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                  \
			
 
				-    void NAMEANY(const uint8* src_uv,                                          \
			
 
				-                 uint8* dst_u, uint8* dst_v, int width) {                      \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
			
 
				-      ANYTOUV_C(src_uv + n * 2,                                                \
			
 
				-                dst_u + n,                                                     \
			
 
				-                dst_v + n,                                                     \
			
 
				-                width & MASK);                                                 \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_SPLITUVROW_SSE2
			
 
				-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_SPLITUVROW_AVX2
			
 
				-SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
			
 
				-#endif
			
 
				-#ifdef HAS_SPLITUVROW_NEON
			
 
				-SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_SPLITUVROW_MIPS_DSPR2
			
 
				-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
			
 
				-              SplitUVRow_C, 15)
			
 
				-#endif
			
 
				-#undef SPLITUVROWANY
			
 
				-
			
 
				-#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
			
 
				-    void NAMEANY(const uint8* src_u, const uint8* src_v,                       \
			
 
				-                 uint8* dst_uv, int width) {                                   \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ANYTOUV_SIMD(src_u, src_v, dst_uv, n);                                   \
			
 
				-      ANYTOUV_C(src_u + n,                                                     \
			
 
				-                src_v + n,                                                     \
			
 
				-                dst_uv + n * 2,                                                \
			
 
				-                width & MASK);                                                 \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_MERGEUVROW_SSE2
			
 
				-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_MERGEUVROW_AVX2
			
 
				-MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
			
 
				-#endif
			
 
				-#ifdef HAS_MERGEUVROW_NEON
			
 
				-MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
			
 
				-#endif
			
 
				-#undef MERGEUVROW_ANY
			
 
				-
			
 
				-#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK)                  \
			
 
				-    void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,               \
			
 
				-                 uint8* dst_argb, int width) {                                 \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n);                        \
			
 
				-      ARGBMATH_C(src_argb0 + n * 4,                                            \
			
 
				-                 src_argb1 + n * 4,                                            \
			
 
				-                 dst_argb + n * 4,                                             \
			
 
				-                 width & MASK);                                                \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_SSE2
			
 
				-MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
			
 
				-            3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBADDROW_SSE2
			
 
				-MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_SSE2
			
 
				-MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
			
 
				-            3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_AVX2
			
 
				-MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
			
 
				-            7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBADDROW_AVX2
			
 
				-MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_AVX2
			
 
				-MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
			
 
				-            7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_NEON
			
 
				-MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
			
 
				-            7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBADDROW_NEON
			
 
				-MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_NEON
			
 
				-MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
			
 
				-            7)
			
 
				-#endif
			
 
				-#undef MATHROW_ANY
			
 
				-
			
 
				-// Shuffle may want to work in place, so last16 method can not be used.
			
 
				-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
			
 
				-    void NAMEANY(const uint8* src_argb, uint8* dst_argb,                       \
			
 
				-                 const uint8* shuffler, int width) {                           \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n);                           \
			
 
				-      ARGBTOY_C(src_argb + n * SBPP,                                           \
			
 
				-                dst_argb  + n * BPP, shuffler, width & MASK);                  \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_SSE2
			
 
				-YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
			
 
				-     ARGBShuffleRow_C, 4, 4, 3)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
			
 
				-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
			
 
				-     ARGBShuffleRow_C, 4, 4, 7)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_AVX2
			
 
				-YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
			
 
				-     ARGBShuffleRow_C, 4, 4, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_NEON
			
 
				-YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
			
 
				-     ARGBShuffleRow_C, 4, 4, 3)
			
 
				-#endif
			
 
				-#undef YANY
			
 
				-
			
 
				-// Interpolate may want to work in place, so last16 method can not be used.
			
 
				-#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK)                      \
			
 
				-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
			
 
				-                 ptrdiff_t src_stride_ptr, int width,                          \
			
 
				-                 int source_y_fraction) {                                      \
			
 
				-      int n = width & ~MASK;                                                   \
			
 
				-      TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr,                              \
			
 
				-                n, source_y_fraction);                                         \
			
 
				-      TERP_C(dst_ptr + n * BPP,                                                \
			
 
				-             src_ptr + n * SBPP, src_stride_ptr,                               \
			
 
				-             width & MASK, source_y_fraction);                                 \
			
 
				-    }
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_AVX2
			
 
				-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
			
 
				-     InterpolateRow_C, 1, 1, 32)
			
 
				-#endif
			
 
				-#ifdef HAS_INTERPOLATEROW_SSSE3
			
 
				-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
			
 
				-     InterpolateRow_C, 1, 1, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_INTERPOLATEROW_SSE2
			
 
				-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
			
 
				-     InterpolateRow_C, 1, 1, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_INTERPOLATEROW_NEON
			
 
				-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
			
 
				-     InterpolateRow_C, 1, 1, 15)
			
 
				-#endif
			
 
				-#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
			
 
				-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
			
 
				-     InterpolateRow_C, 1, 1, 3)
			
 
				-#endif
			
 
				-#undef NANY
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_common.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_common.cc
@@ -1,2247 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#include <string.h>  // For memcpy and memset.
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// llvm x86 is poor at ternary operator, so use branchless min/max.
			
 
				-
			
 
				-#define USE_BRANCHLESS 1
			
 
				-#if USE_BRANCHLESS
			
 
				-static __inline int32 clamp0(int32 v) {
			
 
				-  return ((-(v) >> 31) & (v));
			
 
				-}
			
 
				-
			
 
				-static __inline int32 clamp255(int32 v) {
			
 
				-  return (((255 - (v)) >> 31) | (v)) & 255;
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 Clamp(int32 val) {
			
 
				-  int v = clamp0(val);
			
 
				-  return (uint32)(clamp255(v));
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 Abs(int32 v) {
			
 
				-  int m = v >> 31;
			
 
				-  return (v + m) ^ m;
			
 
				-}
			
 
				-#else  // USE_BRANCHLESS
			
 
				-static __inline int32 clamp0(int32 v) {
			
 
				-  return (v < 0) ? 0 : v;
			
 
				-}
			
 
				-
			
 
				-static __inline int32 clamp255(int32 v) {
			
 
				-  return (v > 255) ? 255 : v;
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 Clamp(int32 val) {
			
 
				-  int v = clamp0(val);
			
 
				-  return (uint32)(clamp255(v));
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 Abs(int32 v) {
			
 
				-  return (v < 0) ? -v : v;
			
 
				-}
			
 
				-#endif  // USE_BRANCHLESS
			
 
				-
			
 
				-#ifdef LIBYUV_LITTLE_ENDIAN
			
 
				-#define WRITEWORD(p, v) *(uint32*)(p) = v
			
 
				-#else
			
 
				-static inline void WRITEWORD(uint8* p, uint32 v) {
			
 
				-  p[0] = (uint8)(v & 255);
			
 
				-  p[1] = (uint8)((v >> 8) & 255);
			
 
				-  p[2] = (uint8)((v >> 16) & 255);
			
 
				-  p[3] = (uint8)((v >> 24) & 255);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_rgb24[0];
			
 
				-    uint8 g = src_rgb24[1];
			
 
				-    uint8 r = src_rgb24[2];
			
 
				-    dst_argb[0] = b;
			
 
				-    dst_argb[1] = g;
			
 
				-    dst_argb[2] = r;
			
 
				-    dst_argb[3] = 255u;
			
 
				-    dst_argb += 4;
			
 
				-    src_rgb24 += 3;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 r = src_raw[0];
			
 
				-    uint8 g = src_raw[1];
			
 
				-    uint8 b = src_raw[2];
			
 
				-    dst_argb[0] = b;
			
 
				-    dst_argb[1] = g;
			
 
				-    dst_argb[2] = r;
			
 
				-    dst_argb[3] = 255u;
			
 
				-    dst_argb += 4;
			
 
				-    src_raw += 3;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_rgb565[0] & 0x1f;
			
 
				-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r = src_rgb565[1] >> 3;
			
 
				-    dst_argb[0] = (b << 3) | (b >> 2);
			
 
				-    dst_argb[1] = (g << 2) | (g >> 4);
			
 
				-    dst_argb[2] = (r << 3) | (r >> 2);
			
 
				-    dst_argb[3] = 255u;
			
 
				-    dst_argb += 4;
			
 
				-    src_rgb565 += 2;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb1555[0] & 0x1f;
			
 
				-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
			
 
				-    uint8 a = src_argb1555[1] >> 7;
			
 
				-    dst_argb[0] = (b << 3) | (b >> 2);
			
 
				-    dst_argb[1] = (g << 3) | (g >> 2);
			
 
				-    dst_argb[2] = (r << 3) | (r >> 2);
			
 
				-    dst_argb[3] = -a;
			
 
				-    dst_argb += 4;
			
 
				-    src_argb1555 += 2;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb4444[0] & 0x0f;
			
 
				-    uint8 g = src_argb4444[0] >> 4;
			
 
				-    uint8 r = src_argb4444[1] & 0x0f;
			
 
				-    uint8 a = src_argb4444[1] >> 4;
			
 
				-    dst_argb[0] = (b << 4) | b;
			
 
				-    dst_argb[1] = (g << 4) | g;
			
 
				-    dst_argb[2] = (r << 4) | r;
			
 
				-    dst_argb[3] = (a << 4) | a;
			
 
				-    dst_argb += 4;
			
 
				-    src_argb4444 += 2;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb[0];
			
 
				-    uint8 g = src_argb[1];
			
 
				-    uint8 r = src_argb[2];
			
 
				-    dst_rgb[0] = b;
			
 
				-    dst_rgb[1] = g;
			
 
				-    dst_rgb[2] = r;
			
 
				-    dst_rgb += 3;
			
 
				-    src_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb[0];
			
 
				-    uint8 g = src_argb[1];
			
 
				-    uint8 r = src_argb[2];
			
 
				-    dst_rgb[0] = r;
			
 
				-    dst_rgb[1] = g;
			
 
				-    dst_rgb[2] = b;
			
 
				-    dst_rgb += 3;
			
 
				-    src_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_argb[0] >> 3;
			
 
				-    uint8 g0 = src_argb[1] >> 2;
			
 
				-    uint8 r0 = src_argb[2] >> 3;
			
 
				-    uint8 b1 = src_argb[4] >> 3;
			
 
				-    uint8 g1 = src_argb[5] >> 2;
			
 
				-    uint8 r1 = src_argb[6] >> 3;
			
 
				-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
			
 
				-              (b1 << 16) | (g1 << 21) | (r1 << 27));
			
 
				-    dst_rgb += 4;
			
 
				-    src_argb += 8;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_argb[0] >> 3;
			
 
				-    uint8 g0 = src_argb[1] >> 2;
			
 
				-    uint8 r0 = src_argb[2] >> 3;
			
 
				-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_argb[0] >> 3;
			
 
				-    uint8 g0 = src_argb[1] >> 3;
			
 
				-    uint8 r0 = src_argb[2] >> 3;
			
 
				-    uint8 a0 = src_argb[3] >> 7;
			
 
				-    uint8 b1 = src_argb[4] >> 3;
			
 
				-    uint8 g1 = src_argb[5] >> 3;
			
 
				-    uint8 r1 = src_argb[6] >> 3;
			
 
				-    uint8 a1 = src_argb[7] >> 7;
			
 
				-    *(uint32*)(dst_rgb) =
			
 
				-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
			
 
				-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
			
 
				-    dst_rgb += 4;
			
 
				-    src_argb += 8;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_argb[0] >> 3;
			
 
				-    uint8 g0 = src_argb[1] >> 3;
			
 
				-    uint8 r0 = src_argb[2] >> 3;
			
 
				-    uint8 a0 = src_argb[3] >> 7;
			
 
				-    *(uint16*)(dst_rgb) =
			
 
				-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_argb[0] >> 4;
			
 
				-    uint8 g0 = src_argb[1] >> 4;
			
 
				-    uint8 r0 = src_argb[2] >> 4;
			
 
				-    uint8 a0 = src_argb[3] >> 4;
			
 
				-    uint8 b1 = src_argb[4] >> 4;
			
 
				-    uint8 g1 = src_argb[5] >> 4;
			
 
				-    uint8 r1 = src_argb[6] >> 4;
			
 
				-    uint8 a1 = src_argb[7] >> 4;
			
 
				-    *(uint32*)(dst_rgb) =
			
 
				-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
			
 
				-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
			
 
				-    dst_rgb += 4;
			
 
				-    src_argb += 8;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_argb[0] >> 4;
			
 
				-    uint8 g0 = src_argb[1] >> 4;
			
 
				-    uint8 r0 = src_argb[2] >> 4;
			
 
				-    uint8 a0 = src_argb[3] >> 4;
			
 
				-    *(uint16*)(dst_rgb) =
			
 
				-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
			
 
				-}
			
 
				-
			
 
				-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
			
 
				-}
			
 
				-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
			
 
				-}
			
 
				-
			
 
				-#define MAKEROWY(NAME, R, G, B, BPP) \
			
 
				-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
			
 
				-  int x;                                                                       \
			
 
				-  for (x = 0; x < width; ++x) {                                                \
			
 
				-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
			
 
				-    src_argb0 += BPP;                                                          \
			
 
				-    dst_y += 1;                                                                \
			
 
				-  }                                                                            \
			
 
				-}                                                                              \
			
 
				-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {                \
			
 
				-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
			
 
				-  int x;                                                                       \
			
 
				-  for (x = 0; x < width - 1; x += 2) {                                         \
			
 
				-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
			
 
				-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
			
 
				-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
			
 
				-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
			
 
				-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
			
 
				-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
			
 
				-    src_rgb0 += BPP * 2;                                                       \
			
 
				-    src_rgb1 += BPP * 2;                                                       \
			
 
				-    dst_u += 1;                                                                \
			
 
				-    dst_v += 1;                                                                \
			
 
				-  }                                                                            \
			
 
				-  if (width & 1) {                                                             \
			
 
				-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
			
 
				-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
			
 
				-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
			
 
				-  }                                                                            \
			
 
				-}
			
 
				-
			
 
				-MAKEROWY(ARGB, 2, 1, 0, 4)
			
 
				-MAKEROWY(BGRA, 1, 2, 3, 4)
			
 
				-MAKEROWY(ABGR, 0, 1, 2, 4)
			
 
				-MAKEROWY(RGBA, 3, 2, 1, 4)
			
 
				-MAKEROWY(RGB24, 2, 1, 0, 3)
			
 
				-MAKEROWY(RAW, 0, 1, 2, 3)
			
 
				-#undef MAKEROWY
			
 
				-
			
 
				-// JPeg uses a variation on BT.601-1 full range
			
 
				-// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
			
 
				-// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
			
 
				-// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
			
 
				-// BT.601 Mpeg range uses:
			
 
				-// b 0.1016 * 255 = 25.908 = 25
			
 
				-// g 0.5078 * 255 = 129.489 = 129
			
 
				-// r 0.2578 * 255 = 65.739 = 66
			
 
				-// JPeg 8 bit Y (not used):
			
 
				-// b 0.11400 * 256 = 29.184 = 29
			
 
				-// g 0.58700 * 256 = 150.272 = 150
			
 
				-// r 0.29900 * 256 = 76.544 = 77
			
 
				-// JPeg 7 bit Y:
			
 
				-// b 0.11400 * 128 = 14.592 = 15
			
 
				-// g 0.58700 * 128 = 75.136 = 75
			
 
				-// r 0.29900 * 128 = 38.272 = 38
			
 
				-// JPeg 8 bit U:
			
 
				-// b  0.50000 * 255 = 127.5 = 127
			
 
				-// g -0.33126 * 255 = -84.4713 = -84
			
 
				-// r -0.16874 * 255 = -43.0287 = -43
			
 
				-// JPeg 8 bit V:
			
 
				-// b -0.08131 * 255 = -20.73405 = -20
			
 
				-// g -0.41869 * 255 = -106.76595 = -107
			
 
				-// r  0.50000 * 255 = 127.5 = 127
			
 
				-
			
 
				-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
			
 
				-}
			
 
				-
			
 
				-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
			
 
				-}
			
 
				-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
			
 
				-  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
			
 
				-}
			
 
				-
			
 
				-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
			
 
				-
			
 
				-#define MAKEROWYJ(NAME, R, G, B, BPP) \
			
 
				-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
			
 
				-  int x;                                                                       \
			
 
				-  for (x = 0; x < width; ++x) {                                                \
			
 
				-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
			
 
				-    src_argb0 += BPP;                                                          \
			
 
				-    dst_y += 1;                                                                \
			
 
				-  }                                                                            \
			
 
				-}                                                                              \
			
 
				-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
			
 
				-                        uint8* dst_u, uint8* dst_v, int width) {               \
			
 
				-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
			
 
				-  int x;                                                                       \
			
 
				-  for (x = 0; x < width - 1; x += 2) {                                         \
			
 
				-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
			
 
				-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
			
 
				-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
			
 
				-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
			
 
				-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
			
 
				-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
			
 
				-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
			
 
				-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
			
 
				-    src_rgb0 += BPP * 2;                                                       \
			
 
				-    src_rgb1 += BPP * 2;                                                       \
			
 
				-    dst_u += 1;                                                                \
			
 
				-    dst_v += 1;                                                                \
			
 
				-  }                                                                            \
			
 
				-  if (width & 1) {                                                             \
			
 
				-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
			
 
				-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
			
 
				-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
			
 
				-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
			
 
				-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
			
 
				-  }                                                                            \
			
 
				-}
			
 
				-
			
 
				-MAKEROWYJ(ARGB, 2, 1, 0, 4)
			
 
				-#undef MAKEROWYJ
			
 
				-
			
 
				-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_rgb565[0] & 0x1f;
			
 
				-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r = src_rgb565[1] >> 3;
			
 
				-    b = (b << 3) | (b >> 2);
			
 
				-    g = (g << 2) | (g >> 4);
			
 
				-    r = (r << 3) | (r >> 2);
			
 
				-    dst_y[0] = RGBToY(r, g, b);
			
 
				-    src_rgb565 += 2;
			
 
				-    dst_y += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb1555[0] & 0x1f;
			
 
				-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
			
 
				-    b = (b << 3) | (b >> 2);
			
 
				-    g = (g << 3) | (g >> 2);
			
 
				-    r = (r << 3) | (r >> 2);
			
 
				-    dst_y[0] = RGBToY(r, g, b);
			
 
				-    src_argb1555 += 2;
			
 
				-    dst_y += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 b = src_argb4444[0] & 0x0f;
			
 
				-    uint8 g = src_argb4444[0] >> 4;
			
 
				-    uint8 r = src_argb4444[1] & 0x0f;
			
 
				-    b = (b << 4) | b;
			
 
				-    g = (g << 4) | g;
			
 
				-    r = (r << 4) | r;
			
 
				-    dst_y[0] = RGBToY(r, g, b);
			
 
				-    src_argb4444 += 2;
			
 
				-    dst_y += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                     uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_rgb565[0] & 0x1f;
			
 
				-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r0 = src_rgb565[1] >> 3;
			
 
				-    uint8 b1 = src_rgb565[2] & 0x1f;
			
 
				-    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
			
 
				-    uint8 r1 = src_rgb565[3] >> 3;
			
 
				-    uint8 b2 = next_rgb565[0] & 0x1f;
			
 
				-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r2 = next_rgb565[1] >> 3;
			
 
				-    uint8 b3 = next_rgb565[2] & 0x1f;
			
 
				-    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
			
 
				-    uint8 r3 = next_rgb565[3] >> 3;
			
 
				-    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
			
 
				-    uint8 g = (g0 + g1 + g2 + g3);
			
 
				-    uint8 r = (r0 + r1 + r2 + r3);
			
 
				-    b = (b << 1) | (b >> 6);  // 787 -> 888.
			
 
				-    r = (r << 1) | (r >> 6);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-    src_rgb565 += 4;
			
 
				-    next_rgb565 += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_rgb565[0] & 0x1f;
			
 
				-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r0 = src_rgb565[1] >> 3;
			
 
				-    uint8 b2 = next_rgb565[0] & 0x1f;
			
 
				-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
			
 
				-    uint8 r2 = next_rgb565[1] >> 3;
			
 
				-    uint8 b = (b0 + b2);  // 565 * 2 = 676.
			
 
				-    uint8 g = (g0 + g2);
			
 
				-    uint8 r = (r0 + r2);
			
 
				-    b = (b << 2) | (b >> 4);  // 676 -> 888
			
 
				-    g = (g << 1) | (g >> 6);
			
 
				-    r = (r << 2) | (r >> 4);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_argb1555[0] & 0x1f;
			
 
				-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
			
 
				-    uint8 b1 = src_argb1555[2] & 0x1f;
			
 
				-    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
			
 
				-    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
			
 
				-    uint8 b2 = next_argb1555[0] & 0x1f;
			
 
				-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
			
 
				-    uint8 b3 = next_argb1555[2] & 0x1f;
			
 
				-    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
			
 
				-    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
			
 
				-    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
			
 
				-    uint8 g = (g0 + g1 + g2 + g3);
			
 
				-    uint8 r = (r0 + r1 + r2 + r3);
			
 
				-    b = (b << 1) | (b >> 6);  // 777 -> 888.
			
 
				-    g = (g << 1) | (g >> 6);
			
 
				-    r = (r << 1) | (r >> 6);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-    src_argb1555 += 4;
			
 
				-    next_argb1555 += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_argb1555[0] & 0x1f;
			
 
				-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
			
 
				-    uint8 b2 = next_argb1555[0] & 0x1f;
			
 
				-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
			
 
				-    uint8 r2 = next_argb1555[1] >> 3;
			
 
				-    uint8 b = (b0 + b2);  // 555 * 2 = 666.
			
 
				-    uint8 g = (g0 + g2);
			
 
				-    uint8 r = (r0 + r2);
			
 
				-    b = (b << 2) | (b >> 4);  // 666 -> 888.
			
 
				-    g = (g << 2) | (g >> 4);
			
 
				-    r = (r << 2) | (r >> 4);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 b0 = src_argb4444[0] & 0x0f;
			
 
				-    uint8 g0 = src_argb4444[0] >> 4;
			
 
				-    uint8 r0 = src_argb4444[1] & 0x0f;
			
 
				-    uint8 b1 = src_argb4444[2] & 0x0f;
			
 
				-    uint8 g1 = src_argb4444[2] >> 4;
			
 
				-    uint8 r1 = src_argb4444[3] & 0x0f;
			
 
				-    uint8 b2 = next_argb4444[0] & 0x0f;
			
 
				-    uint8 g2 = next_argb4444[0] >> 4;
			
 
				-    uint8 r2 = next_argb4444[1] & 0x0f;
			
 
				-    uint8 b3 = next_argb4444[2] & 0x0f;
			
 
				-    uint8 g3 = next_argb4444[2] >> 4;
			
 
				-    uint8 r3 = next_argb4444[3] & 0x0f;
			
 
				-    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
			
 
				-    uint8 g = (g0 + g1 + g2 + g3);
			
 
				-    uint8 r = (r0 + r1 + r2 + r3);
			
 
				-    b = (b << 2) | (b >> 4);  // 666 -> 888.
			
 
				-    g = (g << 2) | (g >> 4);
			
 
				-    r = (r << 2) | (r >> 4);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-    src_argb4444 += 4;
			
 
				-    next_argb4444 += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 b0 = src_argb4444[0] & 0x0f;
			
 
				-    uint8 g0 = src_argb4444[0] >> 4;
			
 
				-    uint8 r0 = src_argb4444[1] & 0x0f;
			
 
				-    uint8 b2 = next_argb4444[0] & 0x0f;
			
 
				-    uint8 g2 = next_argb4444[0] >> 4;
			
 
				-    uint8 r2 = next_argb4444[1] & 0x0f;
			
 
				-    uint8 b = (b0 + b2);  // 444 * 2 = 555.
			
 
				-    uint8 g = (g0 + g2);
			
 
				-    uint8 r = (r0 + r2);
			
 
				-    b = (b << 3) | (b >> 2);  // 555 -> 888.
			
 
				-    g = (g << 3) | (g >> 2);
			
 
				-    r = (r << 3) | (r >> 2);
			
 
				-    dst_u[0] = RGBToU(r, g, b);
			
 
				-    dst_v[0] = RGBToV(r, g, b);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV444Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 ab = src_argb[0];
			
 
				-    uint8 ag = src_argb[1];
			
 
				-    uint8 ar = src_argb[2];
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-    src_argb += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV422Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
			
 
				-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
			
 
				-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-    src_argb += 8;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    uint8 ab = src_argb[0];
			
 
				-    uint8 ag = src_argb[1];
			
 
				-    uint8 ar = src_argb[2];
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV411Row_C(const uint8* src_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 3; x += 4) {
			
 
				-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
			
 
				-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
			
 
				-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-    src_argb += 16;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-  if ((width & 3) == 3) {
			
 
				-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
			
 
				-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
			
 
				-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-  } else if ((width & 3) == 2) {
			
 
				-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
			
 
				-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
			
 
				-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-  } else if ((width & 3) == 1) {
			
 
				-    uint8 ab = src_argb[0];
			
 
				-    uint8 ag = src_argb[1];
			
 
				-    uint8 ar = src_argb[2];
			
 
				-    dst_u[0] = RGBToU(ar, ag, ab);
			
 
				-    dst_v[0] = RGBToV(ar, ag, ab);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
			
 
				-    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
			
 
				-    dst_argb[3] = src_argb[3];
			
 
				-    dst_argb += 4;
			
 
				-    src_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Convert a row of image to Sepia tone.
			
 
				-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    int b = dst_argb[0];
			
 
				-    int g = dst_argb[1];
			
 
				-    int r = dst_argb[2];
			
 
				-    int sb = (b * 17 + g * 68 + r * 35) >> 7;
			
 
				-    int sg = (b * 22 + g * 88 + r * 45) >> 7;
			
 
				-    int sr = (b * 24 + g * 98 + r * 50) >> 7;
			
 
				-    // b does not over flow. a is preserved from original.
			
 
				-    dst_argb[0] = sb;
			
 
				-    dst_argb[1] = clamp255(sg);
			
 
				-    dst_argb[2] = clamp255(sr);
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Apply color matrix to a row of image. Matrix is signed.
			
 
				-// TODO(fbarchard): Consider adding rounding (+32).
			
 
				-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
			
 
				-                          const int8* matrix_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    int b = src_argb[0];
			
 
				-    int g = src_argb[1];
			
 
				-    int r = src_argb[2];
			
 
				-    int a = src_argb[3];
			
 
				-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
			
 
				-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
			
 
				-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
			
 
				-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
			
 
				-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
			
 
				-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
			
 
				-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
			
 
				-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
			
 
				-    dst_argb[0] = Clamp(sb);
			
 
				-    dst_argb[1] = Clamp(sg);
			
 
				-    dst_argb[2] = Clamp(sr);
			
 
				-    dst_argb[3] = Clamp(sa);
			
 
				-    src_argb += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Apply color table to a row of image.
			
 
				-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    int b = dst_argb[0];
			
 
				-    int g = dst_argb[1];
			
 
				-    int r = dst_argb[2];
			
 
				-    int a = dst_argb[3];
			
 
				-    dst_argb[0] = table_argb[b * 4 + 0];
			
 
				-    dst_argb[1] = table_argb[g * 4 + 1];
			
 
				-    dst_argb[2] = table_argb[r * 4 + 2];
			
 
				-    dst_argb[3] = table_argb[a * 4 + 3];
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Apply color table to a row of image.
			
 
				-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    int b = dst_argb[0];
			
 
				-    int g = dst_argb[1];
			
 
				-    int r = dst_argb[2];
			
 
				-    dst_argb[0] = table_argb[b * 4 + 0];
			
 
				-    dst_argb[1] = table_argb[g * 4 + 1];
			
 
				-    dst_argb[2] = table_argb[r * 4 + 2];
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
			
 
				-                       int interval_offset, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    int b = dst_argb[0];
			
 
				-    int g = dst_argb[1];
			
 
				-    int r = dst_argb[2];
			
 
				-    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
			
 
				-    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
			
 
				-    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#define REPEAT8(v) (v) | ((v) << 8)
			
 
				-#define SHADE(f, v) v * f >> 24
			
 
				-
			
 
				-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                    uint32 value) {
			
 
				-  const uint32 b_scale = REPEAT8(value & 0xff);
			
 
				-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
			
 
				-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
			
 
				-  const uint32 a_scale = REPEAT8(value >> 24);
			
 
				-
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    const uint32 b = REPEAT8(src_argb[0]);
			
 
				-    const uint32 g = REPEAT8(src_argb[1]);
			
 
				-    const uint32 r = REPEAT8(src_argb[2]);
			
 
				-    const uint32 a = REPEAT8(src_argb[3]);
			
 
				-    dst_argb[0] = SHADE(b, b_scale);
			
 
				-    dst_argb[1] = SHADE(g, g_scale);
			
 
				-    dst_argb[2] = SHADE(r, r_scale);
			
 
				-    dst_argb[3] = SHADE(a, a_scale);
			
 
				-    src_argb += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-#undef REPEAT8
			
 
				-#undef SHADE
			
 
				-
			
 
				-#define REPEAT8(v) (v) | ((v) << 8)
			
 
				-#define SHADE(f, v) v * f >> 16
			
 
				-
			
 
				-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    const uint32 b = REPEAT8(src_argb0[0]);
			
 
				-    const uint32 g = REPEAT8(src_argb0[1]);
			
 
				-    const uint32 r = REPEAT8(src_argb0[2]);
			
 
				-    const uint32 a = REPEAT8(src_argb0[3]);
			
 
				-    const uint32 b_scale = src_argb1[0];
			
 
				-    const uint32 g_scale = src_argb1[1];
			
 
				-    const uint32 r_scale = src_argb1[2];
			
 
				-    const uint32 a_scale = src_argb1[3];
			
 
				-    dst_argb[0] = SHADE(b, b_scale);
			
 
				-    dst_argb[1] = SHADE(g, g_scale);
			
 
				-    dst_argb[2] = SHADE(r, r_scale);
			
 
				-    dst_argb[3] = SHADE(a, a_scale);
			
 
				-    src_argb0 += 4;
			
 
				-    src_argb1 += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-#undef REPEAT8
			
 
				-#undef SHADE
			
 
				-
			
 
				-#define SHADE(f, v) clamp255(v + f)
			
 
				-
			
 
				-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                  uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    const int b = src_argb0[0];
			
 
				-    const int g = src_argb0[1];
			
 
				-    const int r = src_argb0[2];
			
 
				-    const int a = src_argb0[3];
			
 
				-    const int b_add = src_argb1[0];
			
 
				-    const int g_add = src_argb1[1];
			
 
				-    const int r_add = src_argb1[2];
			
 
				-    const int a_add = src_argb1[3];
			
 
				-    dst_argb[0] = SHADE(b, b_add);
			
 
				-    dst_argb[1] = SHADE(g, g_add);
			
 
				-    dst_argb[2] = SHADE(r, r_add);
			
 
				-    dst_argb[3] = SHADE(a, a_add);
			
 
				-    src_argb0 += 4;
			
 
				-    src_argb1 += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-#undef SHADE
			
 
				-
			
 
				-#define SHADE(f, v) clamp0(f - v)
			
 
				-
			
 
				-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    const int b = src_argb0[0];
			
 
				-    const int g = src_argb0[1];
			
 
				-    const int r = src_argb0[2];
			
 
				-    const int a = src_argb0[3];
			
 
				-    const int b_sub = src_argb1[0];
			
 
				-    const int g_sub = src_argb1[1];
			
 
				-    const int r_sub = src_argb1[2];
			
 
				-    const int a_sub = src_argb1[3];
			
 
				-    dst_argb[0] = SHADE(b, b_sub);
			
 
				-    dst_argb[1] = SHADE(g, g_sub);
			
 
				-    dst_argb[2] = SHADE(r, r_sub);
			
 
				-    dst_argb[3] = SHADE(a, a_sub);
			
 
				-    src_argb0 += 4;
			
 
				-    src_argb1 += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-#undef SHADE
			
 
				-
			
 
				-// Sobel functions which mimics SSSE3.
			
 
				-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
			
 
				-                 uint8* dst_sobelx, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int a = src_y0[i];
			
 
				-    int b = src_y1[i];
			
 
				-    int c = src_y2[i];
			
 
				-    int a_sub = src_y0[i + 2];
			
 
				-    int b_sub = src_y1[i + 2];
			
 
				-    int c_sub = src_y2[i + 2];
			
 
				-    int a_diff = a - a_sub;
			
 
				-    int b_diff = b - b_sub;
			
 
				-    int c_diff = c - c_sub;
			
 
				-    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
			
 
				-    dst_sobelx[i] = (uint8)(clamp255(sobel));
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
			
 
				-                 uint8* dst_sobely, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int a = src_y0[i + 0];
			
 
				-    int b = src_y0[i + 1];
			
 
				-    int c = src_y0[i + 2];
			
 
				-    int a_sub = src_y1[i + 0];
			
 
				-    int b_sub = src_y1[i + 1];
			
 
				-    int c_sub = src_y1[i + 2];
			
 
				-    int a_diff = a - a_sub;
			
 
				-    int b_diff = b - b_sub;
			
 
				-    int c_diff = c - c_sub;
			
 
				-    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
			
 
				-    dst_sobely[i] = (uint8)(clamp255(sobel));
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int r = src_sobelx[i];
			
 
				-    int b = src_sobely[i];
			
 
				-    int s = clamp255(r + b);
			
 
				-    dst_argb[0] = (uint8)(s);
			
 
				-    dst_argb[1] = (uint8)(s);
			
 
				-    dst_argb[2] = (uint8)(s);
			
 
				-    dst_argb[3] = (uint8)(255u);
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                       uint8* dst_y, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int r = src_sobelx[i];
			
 
				-    int b = src_sobely[i];
			
 
				-    int s = clamp255(r + b);
			
 
				-    dst_y[i] = (uint8)(s);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                  uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int r = src_sobelx[i];
			
 
				-    int b = src_sobely[i];
			
 
				-    int g = clamp255(r + b);
			
 
				-    dst_argb[0] = (uint8)(b);
			
 
				-    dst_argb[1] = (uint8)(g);
			
 
				-    dst_argb[2] = (uint8)(r);
			
 
				-    dst_argb[3] = (uint8)(255u);
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
			
 
				-  // Copy a Y to RGB.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    uint8 y = src_y[0];
			
 
				-    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
			
 
				-    dst_argb[3] = 255u;
			
 
				-    dst_argb += 4;
			
 
				-    ++src_y;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// C reference code that mimics the YUV assembly.
			
 
				-
			
 
				-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
			
 
				-
			
 
				-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
			
 
				-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
			
 
				-#define UR 0
			
 
				-
			
 
				-#define VB 0
			
 
				-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
			
 
				-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
			
 
				-
			
 
				-// Bias
			
 
				-#define BB UB * 128 + VB * 128
			
 
				-#define BG UG * 128 + VG * 128
			
 
				-#define BR UR * 128 + VR * 128
			
 
				-
			
 
				-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
			
 
				-                              uint8* b, uint8* g, uint8* r) {
			
 
				-  int32 y1 = ((int32)(y) - 16) * YG;
			
 
				-  *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
			
 
				-  *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
			
 
				-  *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
			
 
				-}
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && \
			
 
				-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
			
 
				-// C mimic assembly.
			
 
				-// TODO(fbarchard): Remove subsampling from Neon.
			
 
				-void I444ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
			
 
				-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
			
 
				-    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    src_u += 2;
			
 
				-    src_v += 2;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-  }
			
 
				-}
			
 
				-#else
			
 
				-void I444ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    src_y += 1;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 4;  // Advance 1 pixel.
			
 
				-  }
			
 
				-}
			
 
				-#endif
			
 
				-// Also used for 420
			
 
				-void I422ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToRGB24Row_C(const uint8* src_y,
			
 
				-                      const uint8* src_u,
			
 
				-                      const uint8* src_v,
			
 
				-                      uint8* rgb_buf,
			
 
				-                      int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 6;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToRAWRow_C(const uint8* src_y,
			
 
				-                    const uint8* src_u,
			
 
				-                    const uint8* src_v,
			
 
				-                    uint8* rgb_buf,
			
 
				-                    int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 6;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToARGB4444Row_C(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb4444,
			
 
				-                         int width) {
			
 
				-  uint8 b0;
			
 
				-  uint8 g0;
			
 
				-  uint8 r0;
			
 
				-  uint8 b1;
			
 
				-  uint8 g1;
			
 
				-  uint8 r1;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
			
 
				-    b0 = b0 >> 4;
			
 
				-    g0 = g0 >> 4;
			
 
				-    r0 = r0 >> 4;
			
 
				-    b1 = b1 >> 4;
			
 
				-    g1 = g1 >> 4;
			
 
				-    r1 = r1 >> 4;
			
 
				-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
			
 
				-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    dst_argb4444 += 4;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    b0 = b0 >> 4;
			
 
				-    g0 = g0 >> 4;
			
 
				-    r0 = r0 >> 4;
			
 
				-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
			
 
				-        0xf000;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToARGB1555Row_C(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_argb1555,
			
 
				-                         int width) {
			
 
				-  uint8 b0;
			
 
				-  uint8 g0;
			
 
				-  uint8 r0;
			
 
				-  uint8 b1;
			
 
				-  uint8 g1;
			
 
				-  uint8 r1;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 3;
			
 
				-    r0 = r0 >> 3;
			
 
				-    b1 = b1 >> 3;
			
 
				-    g1 = g1 >> 3;
			
 
				-    r1 = r1 >> 3;
			
 
				-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
			
 
				-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    dst_argb1555 += 4;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 3;
			
 
				-    r0 = r0 >> 3;
			
 
				-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
			
 
				-        0x8000;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* src_u,
			
 
				-                       const uint8* src_v,
			
 
				-                       uint8* dst_rgb565,
			
 
				-                       int width) {
			
 
				-  uint8 b0;
			
 
				-  uint8 g0;
			
 
				-  uint8 r0;
			
 
				-  uint8 b1;
			
 
				-  uint8 g1;
			
 
				-  uint8 r1;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    b1 = b1 >> 3;
			
 
				-    g1 = g1 >> 2;
			
 
				-    r1 = r1 >> 3;
			
 
				-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
			
 
				-        (b1 << 16) | (g1 << 21) | (r1 << 27);
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    dst_rgb565 += 4;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I411ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 3; x += 4) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    YuvPixel(src_y[2], src_u[0], src_v[0],
			
 
				-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
			
 
				-    rgb_buf[11] = 255;
			
 
				-    YuvPixel(src_y[3], src_u[0], src_v[0],
			
 
				-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
			
 
				-    rgb_buf[15] = 255;
			
 
				-    src_y += 4;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 16;  // Advance 4 pixels.
			
 
				-  }
			
 
				-  if (width & 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void NV12ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* usrc_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    usrc_v += 2;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void NV21ToARGBRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_vu,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-
			
 
				-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-
			
 
				-    src_y += 2;
			
 
				-    src_vu += 2;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void NV12ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* usrc_v,
			
 
				-                       uint8* dst_rgb565,
			
 
				-                       int width) {
			
 
				-  uint8 b0;
			
 
				-  uint8 g0;
			
 
				-  uint8 r0;
			
 
				-  uint8 b1;
			
 
				-  uint8 g1;
			
 
				-  uint8 r1;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
			
 
				-    YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    b1 = b1 >> 3;
			
 
				-    g1 = g1 >> 2;
			
 
				-    r1 = r1 >> 3;
			
 
				-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
			
 
				-        (b1 << 16) | (g1 << 21) | (r1 << 27);
			
 
				-    src_y += 2;
			
 
				-    usrc_v += 2;
			
 
				-    dst_rgb565 += 4;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void NV21ToRGB565Row_C(const uint8* src_y,
			
 
				-                       const uint8* vsrc_u,
			
 
				-                       uint8* dst_rgb565,
			
 
				-                       int width) {
			
 
				-  uint8 b0;
			
 
				-  uint8 g0;
			
 
				-  uint8 r0;
			
 
				-  uint8 b1;
			
 
				-  uint8 g1;
			
 
				-  uint8 r1;
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
			
 
				-    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    b1 = b1 >> 3;
			
 
				-    g1 = g1 >> 2;
			
 
				-    r1 = r1 >> 3;
			
 
				-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
			
 
				-        (b1 << 16) | (g1 << 21) | (r1 << 27);
			
 
				-    src_y += 2;
			
 
				-    vsrc_u += 2;
			
 
				-    dst_rgb565 += 4;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
			
 
				-    b0 = b0 >> 3;
			
 
				-    g0 = g0 >> 2;
			
 
				-    r0 = r0 >> 3;
			
 
				-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void YUY2ToARGBRow_C(const uint8* src_yuy2,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_yuy2 += 4;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void UYVYToARGBRow_C(const uint8* src_uyvy,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_uyvy += 4;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToBGRARow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
			
 
				-    rgb_buf[0] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
			
 
				-    rgb_buf[4] = 255;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
			
 
				-    rgb_buf[0] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToABGRRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToRGBARow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
			
 
				-    rgb_buf[0] = 255;
			
 
				-    YuvPixel(src_y[1], src_u[0], src_v[0],
			
 
				-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
			
 
				-    rgb_buf[4] = 255;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], src_u[0], src_v[0],
			
 
				-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
			
 
				-    rgb_buf[0] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    YuvPixel(src_y[0], 128, 128,
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-    YuvPixel(src_y[1], 128, 128,
			
 
				-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
			
 
				-    rgb_buf[7] = 255;
			
 
				-    src_y += 2;
			
 
				-    rgb_buf += 8;  // Advance 2 pixels.
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    YuvPixel(src_y[0], 128, 128,
			
 
				-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
			
 
				-    rgb_buf[3] = 255;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
			
 
				-  int x;
			
 
				-  src += width - 1;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst[x] = src[0];
			
 
				-    dst[x + 1] = src[-1];
			
 
				-    src -= 2;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst[width - 1] = src[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  int x;
			
 
				-  src_uv += (width - 1) << 1;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_u[x] = src_uv[0];
			
 
				-    dst_u[x + 1] = src_uv[-2];
			
 
				-    dst_v[x] = src_uv[1];
			
 
				-    dst_v[x + 1] = src_uv[-2 + 1];
			
 
				-    src_uv -= 4;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_u[width - 1] = src_uv[0];
			
 
				-    dst_v[width - 1] = src_uv[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
			
 
				-  int x;
			
 
				-  const uint32* src32 = (const uint32*)(src);
			
 
				-  uint32* dst32 = (uint32*)(dst);
			
 
				-  src32 += width - 1;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst32[x] = src32[0];
			
 
				-    dst32[x + 1] = src32[-1];
			
 
				-    src32 -= 2;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst32[width - 1] = src32[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_u[x] = src_uv[0];
			
 
				-    dst_u[x + 1] = src_uv[2];
			
 
				-    dst_v[x] = src_uv[1];
			
 
				-    dst_v[x + 1] = src_uv[3];
			
 
				-    src_uv += 4;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_u[width - 1] = src_uv[0];
			
 
				-    dst_v[width - 1] = src_uv[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                  int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_uv[0] = src_u[x];
			
 
				-    dst_uv[1] = src_v[x];
			
 
				-    dst_uv[2] = src_u[x + 1];
			
 
				-    dst_uv[3] = src_v[x + 1];
			
 
				-    dst_uv += 4;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_uv[0] = src_u[width - 1];
			
 
				-    dst_uv[1] = src_v[width - 1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void CopyRow_C(const uint8* src, uint8* dst, int count) {
			
 
				-  memcpy(dst, src, count);
			
 
				-}
			
 
				-
			
 
				-void SetRow_C(uint8* dst, uint32 v8, int count) {
			
 
				-#ifdef _MSC_VER
			
 
				-  // VC will generate rep stosb.
			
 
				-  int x;
			
 
				-  for (x = 0; x < count; ++x) {
			
 
				-    dst[x] = v8;
			
 
				-  }
			
 
				-#else
			
 
				-  memset(dst, v8, count);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
			
 
				-                 int dst_stride, int height) {
			
 
				-  int y;
			
 
				-  for (y = 0; y < height; ++y) {
			
 
				-    uint32* d = (uint32*)(dst);
			
 
				-    int x;
			
 
				-    for (x = 0; x < width; ++x) {
			
 
				-      d[x] = v32;
			
 
				-    }
			
 
				-    dst += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Filter 2 rows of YUY2 UV's (422) into U and V (420).
			
 
				-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  // Output a row of UV values, filtering 2 rows of YUY2.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; x += 2) {
			
 
				-    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
			
 
				-    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
			
 
				-    src_yuy2 += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy row of YUY2 UV's (422) into U and V (422).
			
 
				-void YUY2ToUV422Row_C(const uint8* src_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  // Output a row of UV values.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; x += 2) {
			
 
				-    dst_u[0] = src_yuy2[1];
			
 
				-    dst_v[0] = src_yuy2[3];
			
 
				-    src_yuy2 += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy row of YUY2 Y's (422) into Y (420/422).
			
 
				-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
			
 
				-  // Output a row of Y values.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_y[x] = src_yuy2[0];
			
 
				-    dst_y[x + 1] = src_yuy2[2];
			
 
				-    src_yuy2 += 4;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_y[width - 1] = src_yuy2[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Filter 2 rows of UYVY UV's (422) into U and V (420).
			
 
				-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
			
 
				-                   uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  // Output a row of UV values.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; x += 2) {
			
 
				-    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
			
 
				-    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
			
 
				-    src_uyvy += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy row of UYVY UV's (422) into U and V (422).
			
 
				-void UYVYToUV422Row_C(const uint8* src_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  // Output a row of UV values.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; x += 2) {
			
 
				-    dst_u[0] = src_uyvy[0];
			
 
				-    dst_v[0] = src_uyvy[2];
			
 
				-    src_uyvy += 4;
			
 
				-    dst_u += 1;
			
 
				-    dst_v += 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy row of UYVY Y's (422) into Y (420/422).
			
 
				-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
			
 
				-  // Output a row of Y values.
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_y[x] = src_uyvy[1];
			
 
				-    dst_y[x + 1] = src_uyvy[3];
			
 
				-    src_uyvy += 4;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_y[width - 1] = src_uyvy[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
			
 
				-
			
 
				-// Blend src_argb0 over src_argb1 and store to dst_argb.
			
 
				-// dst_argb may be src_argb0 or src_argb1.
			
 
				-// This code mimics the SSSE3 version for better testability.
			
 
				-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                    uint8* dst_argb, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    uint32 fb = src_argb0[0];
			
 
				-    uint32 fg = src_argb0[1];
			
 
				-    uint32 fr = src_argb0[2];
			
 
				-    uint32 a = src_argb0[3];
			
 
				-    uint32 bb = src_argb1[0];
			
 
				-    uint32 bg = src_argb1[1];
			
 
				-    uint32 br = src_argb1[2];
			
 
				-    dst_argb[0] = BLEND(fb, bb, a);
			
 
				-    dst_argb[1] = BLEND(fg, bg, a);
			
 
				-    dst_argb[2] = BLEND(fr, br, a);
			
 
				-    dst_argb[3] = 255u;
			
 
				-
			
 
				-    fb = src_argb0[4 + 0];
			
 
				-    fg = src_argb0[4 + 1];
			
 
				-    fr = src_argb0[4 + 2];
			
 
				-    a = src_argb0[4 + 3];
			
 
				-    bb = src_argb1[4 + 0];
			
 
				-    bg = src_argb1[4 + 1];
			
 
				-    br = src_argb1[4 + 2];
			
 
				-    dst_argb[4 + 0] = BLEND(fb, bb, a);
			
 
				-    dst_argb[4 + 1] = BLEND(fg, bg, a);
			
 
				-    dst_argb[4 + 2] = BLEND(fr, br, a);
			
 
				-    dst_argb[4 + 3] = 255u;
			
 
				-    src_argb0 += 8;
			
 
				-    src_argb1 += 8;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-
			
 
				-  if (width & 1) {
			
 
				-    uint32 fb = src_argb0[0];
			
 
				-    uint32 fg = src_argb0[1];
			
 
				-    uint32 fr = src_argb0[2];
			
 
				-    uint32 a = src_argb0[3];
			
 
				-    uint32 bb = src_argb1[0];
			
 
				-    uint32 bg = src_argb1[1];
			
 
				-    uint32 br = src_argb1[2];
			
 
				-    dst_argb[0] = BLEND(fb, bb, a);
			
 
				-    dst_argb[1] = BLEND(fg, bg, a);
			
 
				-    dst_argb[2] = BLEND(fr, br, a);
			
 
				-    dst_argb[3] = 255u;
			
 
				-  }
			
 
				-}
			
 
				-#undef BLEND
			
 
				-#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
			
 
				-
			
 
				-// Multiply source RGB by alpha and store to destination.
			
 
				-// This code mimics the SSSE3 version for better testability.
			
 
				-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width - 1; i += 2) {
			
 
				-    uint32 b = src_argb[0];
			
 
				-    uint32 g = src_argb[1];
			
 
				-    uint32 r = src_argb[2];
			
 
				-    uint32 a = src_argb[3];
			
 
				-    dst_argb[0] = ATTENUATE(b, a);
			
 
				-    dst_argb[1] = ATTENUATE(g, a);
			
 
				-    dst_argb[2] = ATTENUATE(r, a);
			
 
				-    dst_argb[3] = a;
			
 
				-    b = src_argb[4];
			
 
				-    g = src_argb[5];
			
 
				-    r = src_argb[6];
			
 
				-    a = src_argb[7];
			
 
				-    dst_argb[4] = ATTENUATE(b, a);
			
 
				-    dst_argb[5] = ATTENUATE(g, a);
			
 
				-    dst_argb[6] = ATTENUATE(r, a);
			
 
				-    dst_argb[7] = a;
			
 
				-    src_argb += 8;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-
			
 
				-  if (width & 1) {
			
 
				-    const uint32 b = src_argb[0];
			
 
				-    const uint32 g = src_argb[1];
			
 
				-    const uint32 r = src_argb[2];
			
 
				-    const uint32 a = src_argb[3];
			
 
				-    dst_argb[0] = ATTENUATE(b, a);
			
 
				-    dst_argb[1] = ATTENUATE(g, a);
			
 
				-    dst_argb[2] = ATTENUATE(r, a);
			
 
				-    dst_argb[3] = a;
			
 
				-  }
			
 
				-}
			
 
				-#undef ATTENUATE
			
 
				-
			
 
				-// Divide source RGB by alpha and store to destination.
			
 
				-// b = (b * 255 + (a / 2)) / a;
			
 
				-// g = (g * 255 + (a / 2)) / a;
			
 
				-// r = (r * 255 + (a / 2)) / a;
			
 
				-// Reciprocal method is off by 1 on some values. ie 125
			
 
				-// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
			
 
				-#define T(a) 0x01000000 + (0x10000 / a)
			
 
				-const uint32 fixed_invtbl8[256] = {
			
 
				-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
			
 
				-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
			
 
				-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
			
 
				-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
			
 
				-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
			
 
				-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
			
 
				-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
			
 
				-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
			
 
				-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
			
 
				-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
			
 
				-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
			
 
				-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
			
 
				-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
			
 
				-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
			
 
				-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
			
 
				-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
			
 
				-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
			
 
				-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
			
 
				-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
			
 
				-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
			
 
				-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
			
 
				-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
			
 
				-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
			
 
				-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
			
 
				-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
			
 
				-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
			
 
				-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
			
 
				-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
			
 
				-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
			
 
				-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
			
 
				-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
			
 
				-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
			
 
				-#undef T
			
 
				-
			
 
				-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    uint32 b = src_argb[0];
			
 
				-    uint32 g = src_argb[1];
			
 
				-    uint32 r = src_argb[2];
			
 
				-    const uint32 a = src_argb[3];
			
 
				-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
			
 
				-    b = (b * ia) >> 8;
			
 
				-    g = (g * ia) >> 8;
			
 
				-    r = (r * ia) >> 8;
			
 
				-    // Clamping should not be necessary but is free in assembly.
			
 
				-    dst_argb[0] = clamp255(b);
			
 
				-    dst_argb[1] = clamp255(g);
			
 
				-    dst_argb[2] = clamp255(r);
			
 
				-    dst_argb[3] = a;
			
 
				-    src_argb += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
			
 
				-                               const int32* previous_cumsum, int width) {
			
 
				-  int32 row_sum[4] = {0, 0, 0, 0};
			
 
				-  int x;
			
 
				-  for (x = 0; x < width; ++x) {
			
 
				-    row_sum[0] += row[x * 4 + 0];
			
 
				-    row_sum[1] += row[x * 4 + 1];
			
 
				-    row_sum[2] += row[x * 4 + 2];
			
 
				-    row_sum[3] += row[x * 4 + 3];
			
 
				-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
			
 
				-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
			
 
				-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
			
 
				-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
			
 
				-                                int w, int area, uint8* dst, int count) {
			
 
				-  float ooa = 1.0f / area;
			
 
				-  int i;
			
 
				-  for (i = 0; i < count; ++i) {
			
 
				-    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
			
 
				-    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
			
 
				-    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
			
 
				-    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
			
 
				-    dst += 4;
			
 
				-    tl += 4;
			
 
				-    bl += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Copy pixels from rotated source to destination row with a slope.
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
			
 
				-                     uint8* dst_argb, const float* uv_dudv, int width) {
			
 
				-  int i;
			
 
				-  // Render a row of pixels from source into a buffer.
			
 
				-  float uv[2];
			
 
				-  uv[0] = uv_dudv[0];
			
 
				-  uv[1] = uv_dudv[1];
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    int x = (int)(uv[0]);
			
 
				-    int y = (int)(uv[1]);
			
 
				-    *(uint32*)(dst_argb) =
			
 
				-        *(const uint32*)(src_argb + y * src_argb_stride +
			
 
				-                                         x * 4);
			
 
				-    dst_argb += 4;
			
 
				-    uv[0] += uv_dudv[2];
			
 
				-    uv[1] += uv_dudv[3];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Blend 2 rows into 1 for conversions such as I422ToI420.
			
 
				-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
			
 
				-               uint8* dst_uv, int pix) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix; ++x) {
			
 
				-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// C version 2x2 -> 2x1.
			
 
				-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                      ptrdiff_t src_stride,
			
 
				-                      int width, int source_y_fraction) {
			
 
				-  int y1_fraction = source_y_fraction;
			
 
				-  int y0_fraction = 256 - y1_fraction;
			
 
				-  const uint8* src_ptr1 = src_ptr + src_stride;
			
 
				-  int x;
			
 
				-  if (source_y_fraction == 0) {
			
 
				-    memcpy(dst_ptr, src_ptr, width);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (source_y_fraction == 128) {
			
 
				-    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
			
 
				-    return;
			
 
				-  }
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
			
 
				-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
			
 
				-    src_ptr += 2;
			
 
				-    src_ptr1 += 2;
			
 
				-    dst_ptr += 2;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
			
 
				-void ARGBToBayerRow_C(const uint8* src_argb,
			
 
				-                      uint8* dst_bayer, uint32 selector, int pix) {
			
 
				-  int index0 = selector & 0xff;
			
 
				-  int index1 = (selector >> 8) & 0xff;
			
 
				-  // Copy a row of Bayer.
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 1; x += 2) {
			
 
				-    dst_bayer[0] = src_argb[index0];
			
 
				-    dst_bayer[1] = src_argb[index1];
			
 
				-    src_argb += 8;
			
 
				-    dst_bayer += 2;
			
 
				-  }
			
 
				-  if (pix & 1) {
			
 
				-    dst_bayer[0] = src_argb[index0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Select G channel from ARGB.  e.g.  GGGGGGGG
			
 
				-void ARGBToBayerGGRow_C(const uint8* src_argb,
			
 
				-                        uint8* dst_bayer, uint32 selector, int pix) {
			
 
				-  // Copy a row of G.
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix - 1; x += 2) {
			
 
				-    dst_bayer[0] = src_argb[1];
			
 
				-    dst_bayer[1] = src_argb[5];
			
 
				-    src_argb += 8;
			
 
				-    dst_bayer += 2;
			
 
				-  }
			
 
				-  if (pix & 1) {
			
 
				-    dst_bayer[0] = src_argb[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Use first 4 shuffler values to reorder ARGB channels.
			
 
				-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
			
 
				-                      const uint8* shuffler, int pix) {
			
 
				-  int index0 = shuffler[0];
			
 
				-  int index1 = shuffler[1];
			
 
				-  int index2 = shuffler[2];
			
 
				-  int index3 = shuffler[3];
			
 
				-  // Shuffle a row of ARGB.
			
 
				-  int x;
			
 
				-  for (x = 0; x < pix; ++x) {
			
 
				-    // To support in-place conversion.
			
 
				-    uint8 b = src_argb[index0];
			
 
				-    uint8 g = src_argb[index1];
			
 
				-    uint8 r = src_argb[index2];
			
 
				-    uint8 a = src_argb[index3];
			
 
				-    dst_argb[0] = b;
			
 
				-    dst_argb[1] = g;
			
 
				-    dst_argb[2] = r;
			
 
				-    dst_argb[3] = a;
			
 
				-    src_argb += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToYUY2Row_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_frame, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_frame[0] = src_y[0];
			
 
				-    dst_frame[1] = src_u[0];
			
 
				-    dst_frame[2] = src_y[1];
			
 
				-    dst_frame[3] = src_v[0];
			
 
				-    dst_frame += 4;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_frame[0] = src_y[0];
			
 
				-    dst_frame[1] = src_u[0];
			
 
				-    dst_frame[2] = src_y[0];  // duplicate last y
			
 
				-    dst_frame[3] = src_v[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void I422ToUYVYRow_C(const uint8* src_y,
			
 
				-                     const uint8* src_u,
			
 
				-                     const uint8* src_v,
			
 
				-                     uint8* dst_frame, int width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < width - 1; x += 2) {
			
 
				-    dst_frame[0] = src_u[0];
			
 
				-    dst_frame[1] = src_y[0];
			
 
				-    dst_frame[2] = src_v[0];
			
 
				-    dst_frame[3] = src_y[1];
			
 
				-    dst_frame += 4;
			
 
				-    src_y += 2;
			
 
				-    src_u += 1;
			
 
				-    src_v += 1;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst_frame[0] = src_u[0];
			
 
				-    dst_frame[1] = src_y[0];
			
 
				-    dst_frame[2] = src_v[0];
			
 
				-    dst_frame[3] = src_y[0];  // duplicate last y
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-// row_win.cc has asm version, but GCC uses 2 step wrapper.
			
 
				-#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
			
 
				-void I422ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_u,
			
 
				-                           const uint8* src_v,
			
 
				-                           uint8* rgb_buf,
			
 
				-                           int width) {
			
 
				-  // Allocate a row of ARGB.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
			
 
				-  ARGBToRGB565Row_SSE2(row, rgb_buf, width);
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-#endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
			
 
				-
			
 
				-#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
			
 
				-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* rgb_buf,
			
 
				-                             int width) {
			
 
				-  // Allocate a row of ARGB.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
			
 
				-  ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
			
 
				-                             const uint8* src_u,
			
 
				-                             const uint8* src_v,
			
 
				-                             uint8* rgb_buf,
			
 
				-                             int width) {
			
 
				-  // Allocate a row of ARGB.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
			
 
				-  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_uv,
			
 
				-                           uint8* dst_rgb565,
			
 
				-                           int width) {
			
 
				-  // Allocate a row of ARGB.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
			
 
				-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
			
 
				-                           const uint8* src_vu,
			
 
				-                           uint8* dst_rgb565,
			
 
				-                           int width) {
			
 
				-  // Allocate a row of ARGB.
			
 
				-  align_buffer_64(row, width * 4);
			
 
				-  NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
			
 
				-  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-  YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
			
 
				-  YUY2ToYRow_SSE2(src_yuy2, row_y, width);
			
 
				-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-}
			
 
				-
			
 
				-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
			
 
				-  YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
			
 
				-  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-}
			
 
				-
			
 
				-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-  UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
			
 
				-  UYVYToYRow_SSE2(src_uyvy, row_y, width);
			
 
				-  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-}
			
 
				-
			
 
				-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  // Allocate a rows of yuv.
			
 
				-  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
			
 
				-  uint8* row_u = row_y + ((width + 63) & ~63);
			
 
				-  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
			
 
				-  UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
			
 
				-  UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
			
 
				-  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
			
 
				-  free_aligned_buffer_64(row_y);
			
 
				-}
			
 
				-
			
 
				-#endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
			
 
				-#endif  // !defined(LIBYUV_DISABLE_X86)
			
 
				-
			
 
				-void ARGBPolynomialRow_C(const uint8* src_argb,
			
 
				-                         uint8* dst_argb, const float* poly,
			
 
				-                         int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width; ++i) {
			
 
				-    float b = (float)(src_argb[0]);
			
 
				-    float g = (float)(src_argb[1]);
			
 
				-    float r = (float)(src_argb[2]);
			
 
				-    float a = (float)(src_argb[3]);
			
 
				-    float b2 = b * b;
			
 
				-    float g2 = g * g;
			
 
				-    float r2 = r * r;
			
 
				-    float a2 = a * a;
			
 
				-    float db = poly[0] + poly[4] * b;
			
 
				-    float dg = poly[1] + poly[5] * g;
			
 
				-    float dr = poly[2] + poly[6] * r;
			
 
				-    float da = poly[3] + poly[7] * a;
			
 
				-    float b3 = b2 * b;
			
 
				-    float g3 = g2 * g;
			
 
				-    float r3 = r2 * r;
			
 
				-    float a3 = a2 * a;
			
 
				-    db += poly[8] * b2;
			
 
				-    dg += poly[9] * g2;
			
 
				-    dr += poly[10] * r2;
			
 
				-    da += poly[11] * a2;
			
 
				-    db += poly[12] * b3;
			
 
				-    dg += poly[13] * g3;
			
 
				-    dr += poly[14] * r3;
			
 
				-    da += poly[15] * a3;
			
 
				-
			
 
				-    dst_argb[0] = Clamp((int32)(db));
			
 
				-    dst_argb[1] = Clamp((int32)(dg));
			
 
				-    dst_argb[2] = Clamp((int32)(dr));
			
 
				-    dst_argb[3] = Clamp((int32)(da));
			
 
				-    src_argb += 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                             const uint8* luma, uint32 lumacoeff) {
			
 
				-  uint32 bc = lumacoeff & 0xff;
			
 
				-  uint32 gc = (lumacoeff >> 8) & 0xff;
			
 
				-  uint32 rc = (lumacoeff >> 16) & 0xff;
			
 
				-
			
 
				-  int i;
			
 
				-  for (i = 0; i < width - 1; i += 2) {
			
 
				-    // Luminance in rows, color values in columns.
			
 
				-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
			
 
				-                           src_argb[2] * rc) & 0x7F00u) + luma;
			
 
				-    const uint8* luma1;
			
 
				-    dst_argb[0] = luma0[src_argb[0]];
			
 
				-    dst_argb[1] = luma0[src_argb[1]];
			
 
				-    dst_argb[2] = luma0[src_argb[2]];
			
 
				-    dst_argb[3] = src_argb[3];
			
 
				-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
			
 
				-              src_argb[6] * rc) & 0x7F00u) + luma;
			
 
				-    dst_argb[4] = luma1[src_argb[4]];
			
 
				-    dst_argb[5] = luma1[src_argb[5]];
			
 
				-    dst_argb[6] = luma1[src_argb[6]];
			
 
				-    dst_argb[7] = src_argb[7];
			
 
				-    src_argb += 8;
			
 
				-    dst_argb += 8;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    // Luminance in rows, color values in columns.
			
 
				-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
			
 
				-                           src_argb[2] * rc) & 0x7F00u) + luma;
			
 
				-    dst_argb[0] = luma0[src_argb[0]];
			
 
				-    dst_argb[1] = luma0[src_argb[1]];
			
 
				-    dst_argb[2] = luma0[src_argb[2]];
			
 
				-    dst_argb[3] = src_argb[3];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width - 1; i += 2) {
			
 
				-    dst[3] = src[3];
			
 
				-    dst[7] = src[7];
			
 
				-    dst += 8;
			
 
				-    src += 8;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst[3] = src[3];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
			
 
				-  int i;
			
 
				-  for (i = 0; i < width - 1; i += 2) {
			
 
				-    dst[3] = src[0];
			
 
				-    dst[7] = src[1];
			
 
				-    dst += 8;
			
 
				-    src += 2;
			
 
				-  }
			
 
				-  if (width & 1) {
			
 
				-    dst[3] = src[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_mips.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_mips.cc
@@ -1,991 +0,0 @@
 
				-/*
			
 
				- *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// The following are available on Mips platforms:
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
			
 
				-
			
 
				-#ifdef HAS_COPYROW_MIPS
			
 
				-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set      noreorder                         \n"
			
 
				-    ".set      noat                              \n"
			
 
				-    "slti      $at, %[count], 8                  \n"
			
 
				-    "bne       $at ,$zero, $last8                \n"
			
 
				-    "xor       $t8, %[src], %[dst]               \n"
			
 
				-    "andi      $t8, $t8, 0x3                     \n"
			
 
				-
			
 
				-    "bne       $t8, $zero, unaligned             \n"
			
 
				-    "negu      $a3, %[dst]                       \n"
			
 
				-    // make dst/src aligned
			
 
				-    "andi      $a3, $a3, 0x3                     \n"
			
 
				-    "beq       $a3, $zero, $chk16w               \n"
			
 
				-    // word-aligned now count is the remining bytes count
			
 
				-    "subu     %[count], %[count], $a3            \n"
			
 
				-
			
 
				-    "lwr       $t8, 0(%[src])                    \n"
			
 
				-    "addu      %[src], %[src], $a3               \n"
			
 
				-    "swr       $t8, 0(%[dst])                    \n"
			
 
				-    "addu      %[dst], %[dst], $a3               \n"
			
 
				-
			
 
				-    // Now the dst/src are mutually word-aligned with word-aligned addresses
			
 
				-    "$chk16w:                                    \n"
			
 
				-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
			
 
				-    // t8 is the byte count after 64-byte chunks
			
 
				-    "beq       %[count], $t8, chk8w              \n"
			
 
				-    // There will be at most 1 32-byte chunk after it
			
 
				-    "subu      $a3, %[count], $t8                \n"  // the reminder
			
 
				-    // Here a3 counts bytes in 16w chunks
			
 
				-    "addu      $a3, %[dst], $a3                  \n"
			
 
				-    // Now a3 is the final dst after 64-byte chunks
			
 
				-    "addu      $t0, %[dst], %[count]             \n"
			
 
				-    // t0 is the "past the end" address
			
 
				-
			
 
				-    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
			
 
				-    // the "t0-32" address
			
 
				-    // This means: for x=128 the last "safe" a1 address is "t0-160"
			
 
				-    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
			
 
				-    // we will use "pref 30,128(a1)", so "t0-160" is the limit
			
 
				-    "subu      $t9, $t0, 160                     \n"
			
 
				-    // t9 is the "last safe pref 30,128(a1)" address
			
 
				-    "pref      0, 0(%[src])                      \n"  // first line of src
			
 
				-    "pref      0, 32(%[src])                     \n"  // second line of src
			
 
				-    "pref      0, 64(%[src])                     \n"
			
 
				-    "pref      30, 32(%[dst])                    \n"
			
 
				-    // In case the a1 > t9 don't use "pref 30" at all
			
 
				-    "sgtu      $v1, %[dst], $t9                  \n"
			
 
				-    "bgtz      $v1, $loop16w                     \n"
			
 
				-    "nop                                         \n"
			
 
				-    // otherwise, start with using pref30
			
 
				-    "pref      30, 64(%[dst])                    \n"
			
 
				-    "$loop16w:                                    \n"
			
 
				-    "pref      0, 96(%[src])                     \n"
			
 
				-    "lw        $t0, 0(%[src])                    \n"
			
 
				-    "bgtz      $v1, $skip_pref30_96              \n"  // skip
			
 
				-    "lw        $t1, 4(%[src])                    \n"
			
 
				-    "pref      30, 96(%[dst])                    \n"  // continue
			
 
				-    "$skip_pref30_96:                            \n"
			
 
				-    "lw        $t2, 8(%[src])                    \n"
			
 
				-    "lw        $t3, 12(%[src])                   \n"
			
 
				-    "lw        $t4, 16(%[src])                   \n"
			
 
				-    "lw        $t5, 20(%[src])                   \n"
			
 
				-    "lw        $t6, 24(%[src])                   \n"
			
 
				-    "lw        $t7, 28(%[src])                   \n"
			
 
				-    "pref      0, 128(%[src])                    \n"
			
 
				-    //  bring the next lines of src, addr 128
			
 
				-    "sw        $t0, 0(%[dst])                    \n"
			
 
				-    "sw        $t1, 4(%[dst])                    \n"
			
 
				-    "sw        $t2, 8(%[dst])                    \n"
			
 
				-    "sw        $t3, 12(%[dst])                   \n"
			
 
				-    "sw        $t4, 16(%[dst])                   \n"
			
 
				-    "sw        $t5, 20(%[dst])                   \n"
			
 
				-    "sw        $t6, 24(%[dst])                   \n"
			
 
				-    "sw        $t7, 28(%[dst])                   \n"
			
 
				-    "lw        $t0, 32(%[src])                   \n"
			
 
				-    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
			
 
				-    "lw        $t1, 36(%[src])                   \n"
			
 
				-    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
			
 
				-    "$skip_pref30_128:                           \n"
			
 
				-    "lw        $t2, 40(%[src])                   \n"
			
 
				-    "lw        $t3, 44(%[src])                   \n"
			
 
				-    "lw        $t4, 48(%[src])                   \n"
			
 
				-    "lw        $t5, 52(%[src])                   \n"
			
 
				-    "lw        $t6, 56(%[src])                   \n"
			
 
				-    "lw        $t7, 60(%[src])                   \n"
			
 
				-    "pref      0, 160(%[src])                    \n"
			
 
				-    // bring the next lines of src, addr 160
			
 
				-    "sw        $t0, 32(%[dst])                   \n"
			
 
				-    "sw        $t1, 36(%[dst])                   \n"
			
 
				-    "sw        $t2, 40(%[dst])                   \n"
			
 
				-    "sw        $t3, 44(%[dst])                   \n"
			
 
				-    "sw        $t4, 48(%[dst])                   \n"
			
 
				-    "sw        $t5, 52(%[dst])                   \n"
			
 
				-    "sw        $t6, 56(%[dst])                   \n"
			
 
				-    "sw        $t7, 60(%[dst])                   \n"
			
 
				-
			
 
				-    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
			
 
				-    "sgtu      $v1, %[dst], $t9                  \n"
			
 
				-    "bne       %[dst], $a3, $loop16w             \n"
			
 
				-    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
			
 
				-    "move      %[count], $t8                     \n"
			
 
				-
			
 
				-    // Here we have src and dest word-aligned but less than 64-bytes to go
			
 
				-
			
 
				-    "chk8w:                                      \n"
			
 
				-    "pref      0, 0x0(%[src])                    \n"
			
 
				-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
			
 
				-    // the t8 is the reminder count past 32-bytes
			
 
				-    "beq       %[count], $t8, chk1w              \n"
			
 
				-    // count=t8,no 32-byte chunk
			
 
				-    " nop                                        \n"
			
 
				-
			
 
				-    "lw        $t0, 0(%[src])                    \n"
			
 
				-    "lw        $t1, 4(%[src])                    \n"
			
 
				-    "lw        $t2, 8(%[src])                    \n"
			
 
				-    "lw        $t3, 12(%[src])                   \n"
			
 
				-    "lw        $t4, 16(%[src])                   \n"
			
 
				-    "lw        $t5, 20(%[src])                   \n"
			
 
				-    "lw        $t6, 24(%[src])                   \n"
			
 
				-    "lw        $t7, 28(%[src])                   \n"
			
 
				-    "addiu     %[src], %[src], 32                \n"
			
 
				-
			
 
				-    "sw        $t0, 0(%[dst])                    \n"
			
 
				-    "sw        $t1, 4(%[dst])                    \n"
			
 
				-    "sw        $t2, 8(%[dst])                    \n"
			
 
				-    "sw        $t3, 12(%[dst])                   \n"
			
 
				-    "sw        $t4, 16(%[dst])                   \n"
			
 
				-    "sw        $t5, 20(%[dst])                   \n"
			
 
				-    "sw        $t6, 24(%[dst])                   \n"
			
 
				-    "sw        $t7, 28(%[dst])                   \n"
			
 
				-    "addiu     %[dst], %[dst], 32                \n"
			
 
				-
			
 
				-    "chk1w:                                      \n"
			
 
				-    "andi      %[count], $t8, 0x3                \n"
			
 
				-    // now count is the reminder past 1w chunks
			
 
				-    "beq       %[count], $t8, $last8             \n"
			
 
				-    " subu     $a3, $t8, %[count]                \n"
			
 
				-    // a3 is count of bytes in 1w chunks
			
 
				-    "addu      $a3, %[dst], $a3                  \n"
			
 
				-    // now a3 is the dst address past the 1w chunks
			
 
				-    // copying in words (4-byte chunks)
			
 
				-    "$wordCopy_loop:                             \n"
			
 
				-    "lw        $t3, 0(%[src])                    \n"
			
 
				-    // the first t3 may be equal t0 ... optimize?
			
 
				-    "addiu     %[src], %[src],4                  \n"
			
 
				-    "addiu     %[dst], %[dst],4                  \n"
			
 
				-    "bne       %[dst], $a3,$wordCopy_loop        \n"
			
 
				-    " sw       $t3, -4(%[dst])                   \n"
			
 
				-
			
 
				-    // For the last (<8) bytes
			
 
				-    "$last8:                                     \n"
			
 
				-    "blez      %[count], leave                   \n"
			
 
				-    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
			
 
				-    "$last8loop:                                 \n"
			
 
				-    "lb        $v1, 0(%[src])                    \n"
			
 
				-    "addiu     %[src], %[src], 1                 \n"
			
 
				-    "addiu     %[dst], %[dst], 1                 \n"
			
 
				-    "bne       %[dst], $a3, $last8loop           \n"
			
 
				-    " sb       $v1, -1(%[dst])                   \n"
			
 
				-
			
 
				-    "leave:                                      \n"
			
 
				-    "  j       $ra                               \n"
			
 
				-    "  nop                                       \n"
			
 
				-
			
 
				-    //
			
 
				-    // UNALIGNED case
			
 
				-    //
			
 
				-
			
 
				-    "unaligned:                                  \n"
			
 
				-    // got here with a3="negu a1"
			
 
				-    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
			
 
				-    "beqz      $a3, $ua_chk16w                   \n"
			
 
				-    " subu     %[count], %[count], $a3           \n"
			
 
				-    // bytes left after initial a3 bytes
			
 
				-    "lwr       $v1, 0(%[src])                    \n"
			
 
				-    "lwl       $v1, 3(%[src])                    \n"
			
 
				-    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
			
 
				-    "swr       $v1, 0(%[dst])                    \n"
			
 
				-    "addu      %[dst], %[dst], $a3               \n"
			
 
				-    // below the dst will be word aligned (NOTE1)
			
 
				-    "$ua_chk16w:                                 \n"
			
 
				-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
			
 
				-    // t8 is the byte count after 64-byte chunks
			
 
				-    "beq       %[count], $t8, ua_chk8w           \n"
			
 
				-    // if a2==t8, no 64-byte chunks
			
 
				-    // There will be at most 1 32-byte chunk after it
			
 
				-    "subu      $a3, %[count], $t8                \n"  // the reminder
			
 
				-    // Here a3 counts bytes in 16w chunks
			
 
				-    "addu      $a3, %[dst], $a3                  \n"
			
 
				-    // Now a3 is the final dst after 64-byte chunks
			
 
				-    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
			
 
				-    "subu      $t9, $t0, 160                     \n"
			
 
				-    // t9 is the "last safe pref 30,128(a1)" address
			
 
				-    "pref      0, 0(%[src])                      \n"  // first line of src
			
 
				-    "pref      0, 32(%[src])                     \n"  // second line  addr 32
			
 
				-    "pref      0, 64(%[src])                     \n"
			
 
				-    "pref      30, 32(%[dst])                    \n"
			
 
				-    // safe, as we have at least 64 bytes ahead
			
 
				-    // In case the a1 > t9 don't use "pref 30" at all
			
 
				-    "sgtu      $v1, %[dst], $t9                  \n"
			
 
				-    "bgtz      $v1, $ua_loop16w                  \n"
			
 
				-    // skip "pref 30,64(a1)" for too short arrays
			
 
				-    " nop                                        \n"
			
 
				-    // otherwise, start with using pref30
			
 
				-    "pref      30, 64(%[dst])                    \n"
			
 
				-    "$ua_loop16w:                                \n"
			
 
				-    "pref      0, 96(%[src])                     \n"
			
 
				-    "lwr       $t0, 0(%[src])                    \n"
			
 
				-    "lwl       $t0, 3(%[src])                    \n"
			
 
				-    "lwr       $t1, 4(%[src])                    \n"
			
 
				-    "bgtz      $v1, $ua_skip_pref30_96           \n"
			
 
				-    " lwl      $t1, 7(%[src])                    \n"
			
 
				-    "pref      30, 96(%[dst])                    \n"
			
 
				-    // continue setting up the dest, addr 96
			
 
				-    "$ua_skip_pref30_96:                         \n"
			
 
				-    "lwr       $t2, 8(%[src])                    \n"
			
 
				-    "lwl       $t2, 11(%[src])                   \n"
			
 
				-    "lwr       $t3, 12(%[src])                   \n"
			
 
				-    "lwl       $t3, 15(%[src])                   \n"
			
 
				-    "lwr       $t4, 16(%[src])                   \n"
			
 
				-    "lwl       $t4, 19(%[src])                   \n"
			
 
				-    "lwr       $t5, 20(%[src])                   \n"
			
 
				-    "lwl       $t5, 23(%[src])                   \n"
			
 
				-    "lwr       $t6, 24(%[src])                   \n"
			
 
				-    "lwl       $t6, 27(%[src])                   \n"
			
 
				-    "lwr       $t7, 28(%[src])                   \n"
			
 
				-    "lwl       $t7, 31(%[src])                   \n"
			
 
				-    "pref      0, 128(%[src])                    \n"
			
 
				-    // bring the next lines of src, addr 128
			
 
				-    "sw        $t0, 0(%[dst])                    \n"
			
 
				-    "sw        $t1, 4(%[dst])                    \n"
			
 
				-    "sw        $t2, 8(%[dst])                    \n"
			
 
				-    "sw        $t3, 12(%[dst])                   \n"
			
 
				-    "sw        $t4, 16(%[dst])                   \n"
			
 
				-    "sw        $t5, 20(%[dst])                   \n"
			
 
				-    "sw        $t6, 24(%[dst])                   \n"
			
 
				-    "sw        $t7, 28(%[dst])                   \n"
			
 
				-    "lwr       $t0, 32(%[src])                   \n"
			
 
				-    "lwl       $t0, 35(%[src])                   \n"
			
 
				-    "lwr       $t1, 36(%[src])                   \n"
			
 
				-    "bgtz      $v1, ua_skip_pref30_128           \n"
			
 
				-    " lwl      $t1, 39(%[src])                   \n"
			
 
				-    "pref      30, 128(%[dst])                   \n"
			
 
				-    // continue setting up the dest, addr 128
			
 
				-    "ua_skip_pref30_128:                         \n"
			
 
				-
			
 
				-    "lwr       $t2, 40(%[src])                   \n"
			
 
				-    "lwl       $t2, 43(%[src])                   \n"
			
 
				-    "lwr       $t3, 44(%[src])                   \n"
			
 
				-    "lwl       $t3, 47(%[src])                   \n"
			
 
				-    "lwr       $t4, 48(%[src])                   \n"
			
 
				-    "lwl       $t4, 51(%[src])                   \n"
			
 
				-    "lwr       $t5, 52(%[src])                   \n"
			
 
				-    "lwl       $t5, 55(%[src])                   \n"
			
 
				-    "lwr       $t6, 56(%[src])                   \n"
			
 
				-    "lwl       $t6, 59(%[src])                   \n"
			
 
				-    "lwr       $t7, 60(%[src])                   \n"
			
 
				-    "lwl       $t7, 63(%[src])                   \n"
			
 
				-    "pref      0, 160(%[src])                    \n"
			
 
				-    // bring the next lines of src, addr 160
			
 
				-    "sw        $t0, 32(%[dst])                   \n"
			
 
				-    "sw        $t1, 36(%[dst])                   \n"
			
 
				-    "sw        $t2, 40(%[dst])                   \n"
			
 
				-    "sw        $t3, 44(%[dst])                   \n"
			
 
				-    "sw        $t4, 48(%[dst])                   \n"
			
 
				-    "sw        $t5, 52(%[dst])                   \n"
			
 
				-    "sw        $t6, 56(%[dst])                   \n"
			
 
				-    "sw        $t7, 60(%[dst])                   \n"
			
 
				-
			
 
				-    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
			
 
				-    "sgtu      $v1,%[dst],$t9                    \n"
			
 
				-    "bne       %[dst],$a3,$ua_loop16w            \n"
			
 
				-    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
			
 
				-    "move      %[count],$t8                      \n"
			
 
				-
			
 
				-    // Here we have src and dest word-aligned but less than 64-bytes to go
			
 
				-
			
 
				-    "ua_chk8w:                                   \n"
			
 
				-    "pref      0, 0x0(%[src])                    \n"
			
 
				-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
			
 
				-    // the t8 is the reminder count
			
 
				-    "beq       %[count], $t8, $ua_chk1w          \n"
			
 
				-    // when count==t8, no 32-byte chunk
			
 
				-
			
 
				-    "lwr       $t0, 0(%[src])                    \n"
			
 
				-    "lwl       $t0, 3(%[src])                    \n"
			
 
				-    "lwr       $t1, 4(%[src])                    \n"
			
 
				-    "lwl       $t1, 7(%[src])                    \n"
			
 
				-    "lwr       $t2, 8(%[src])                    \n"
			
 
				-    "lwl       $t2, 11(%[src])                   \n"
			
 
				-    "lwr       $t3, 12(%[src])                   \n"
			
 
				-    "lwl       $t3, 15(%[src])                   \n"
			
 
				-    "lwr       $t4, 16(%[src])                   \n"
			
 
				-    "lwl       $t4, 19(%[src])                   \n"
			
 
				-    "lwr       $t5, 20(%[src])                   \n"
			
 
				-    "lwl       $t5, 23(%[src])                   \n"
			
 
				-    "lwr       $t6, 24(%[src])                   \n"
			
 
				-    "lwl       $t6, 27(%[src])                   \n"
			
 
				-    "lwr       $t7, 28(%[src])                   \n"
			
 
				-    "lwl       $t7, 31(%[src])                   \n"
			
 
				-    "addiu     %[src], %[src], 32                \n"
			
 
				-
			
 
				-    "sw        $t0, 0(%[dst])                    \n"
			
 
				-    "sw        $t1, 4(%[dst])                    \n"
			
 
				-    "sw        $t2, 8(%[dst])                    \n"
			
 
				-    "sw        $t3, 12(%[dst])                   \n"
			
 
				-    "sw        $t4, 16(%[dst])                   \n"
			
 
				-    "sw        $t5, 20(%[dst])                   \n"
			
 
				-    "sw        $t6, 24(%[dst])                   \n"
			
 
				-    "sw        $t7, 28(%[dst])                   \n"
			
 
				-    "addiu     %[dst], %[dst], 32                \n"
			
 
				-
			
 
				-    "$ua_chk1w:                                  \n"
			
 
				-    "andi      %[count], $t8, 0x3                \n"
			
 
				-    // now count is the reminder past 1w chunks
			
 
				-    "beq       %[count], $t8, ua_smallCopy       \n"
			
 
				-    "subu      $a3, $t8, %[count]                \n"
			
 
				-    // a3 is count of bytes in 1w chunks
			
 
				-    "addu      $a3, %[dst], $a3                  \n"
			
 
				-    // now a3 is the dst address past the 1w chunks
			
 
				-
			
 
				-    // copying in words (4-byte chunks)
			
 
				-    "$ua_wordCopy_loop:                          \n"
			
 
				-    "lwr       $v1, 0(%[src])                    \n"
			
 
				-    "lwl       $v1, 3(%[src])                    \n"
			
 
				-    "addiu     %[src], %[src], 4                 \n"
			
 
				-    "addiu     %[dst], %[dst], 4                 \n"
			
 
				-    // note: dst=a1 is word aligned here, see NOTE1
			
 
				-    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
			
 
				-    " sw       $v1,-4(%[dst])                    \n"
			
 
				-
			
 
				-    // Now less than 4 bytes (value in count) left to copy
			
 
				-    "ua_smallCopy:                               \n"
			
 
				-    "beqz      %[count], leave                   \n"
			
 
				-    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
			
 
				-    "$ua_smallCopy_loop:                         \n"
			
 
				-    "lb        $v1, 0(%[src])                    \n"
			
 
				-    "addiu     %[src], %[src], 1                 \n"
			
 
				-    "addiu     %[dst], %[dst], 1                 \n"
			
 
				-    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
			
 
				-    " sb       $v1, -1(%[dst])                   \n"
			
 
				-
			
 
				-    "j         $ra                               \n"
			
 
				-    " nop                                        \n"
			
 
				-    ".set      at                                \n"
			
 
				-    ".set      reorder                           \n"
			
 
				-       : [dst] "+r" (dst), [src] "+r" (src)
			
 
				-       : [count] "r" (count)
			
 
				-       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
			
 
				-       "t8", "t9", "a3", "v1", "at"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_MIPS
			
 
				-
			
 
				-// MIPS DSPR2 functions
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
			
 
				-    (__mips_dsp_rev >= 2)
			
 
				-void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                           int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                     \n"
			
 
				-    ".set noreorder                                \n"
			
 
				-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
			
 
				-    "blez            $t4, 2f                       \n"
			
 
				-    " andi           %[width], %[width], 0xf       \n"  // residual
			
 
				-
			
 
				-    ".p2align        2                             \n"
			
 
				-  "1:                                              \n"
			
 
				-    "addiu           $t4, $t4, -1                  \n"
			
 
				-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
			
 
				-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
			
 
				-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
			
 
				-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
			
 
				-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
			
 
				-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
			
 
				-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
			
 
				-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
			
 
				-    "addiu           %[src_uv], %[src_uv], 32      \n"
			
 
				-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
			
 
				-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
			
 
				-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
			
 
				-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
			
 
				-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
			
 
				-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
			
 
				-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
			
 
				-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
			
 
				-    "sw              $t9, 0(%[dst_v])              \n"
			
 
				-    "sw              $t0, 0(%[dst_u])              \n"
			
 
				-    "sw              $t1, 4(%[dst_v])              \n"
			
 
				-    "sw              $t2, 4(%[dst_u])              \n"
			
 
				-    "sw              $t3, 8(%[dst_v])              \n"
			
 
				-    "sw              $t5, 8(%[dst_u])              \n"
			
 
				-    "sw              $t6, 12(%[dst_v])             \n"
			
 
				-    "sw              $t7, 12(%[dst_u])             \n"
			
 
				-    "addiu           %[dst_v], %[dst_v], 16        \n"
			
 
				-    "bgtz            $t4, 1b                       \n"
			
 
				-    " addiu          %[dst_u], %[dst_u], 16        \n"
			
 
				-
			
 
				-    "beqz            %[width], 3f                  \n"
			
 
				-    " nop                                          \n"
			
 
				-
			
 
				-  "2:                                              \n"
			
 
				-    "lbu             $t0, 0(%[src_uv])             \n"
			
 
				-    "lbu             $t1, 1(%[src_uv])             \n"
			
 
				-    "addiu           %[src_uv], %[src_uv], 2       \n"
			
 
				-    "addiu           %[width], %[width], -1        \n"
			
 
				-    "sb              $t0, 0(%[dst_u])              \n"
			
 
				-    "sb              $t1, 0(%[dst_v])              \n"
			
 
				-    "addiu           %[dst_u], %[dst_u], 1         \n"
			
 
				-    "bgtz            %[width], 2b                  \n"
			
 
				-    " addiu          %[dst_v], %[dst_v], 1         \n"
			
 
				-
			
 
				-  "3:                                              \n"
			
 
				-    ".set pop                                      \n"
			
 
				-     : [src_uv] "+r" (src_uv),
			
 
				-       [width] "+r" (width),
			
 
				-       [dst_u] "+r" (dst_u),
			
 
				-       [dst_v] "+r" (dst_v)
			
 
				-     :
			
 
				-     : "t0", "t1", "t2", "t3",
			
 
				-     "t4", "t5", "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
			
 
				-                                     uint8* dst_v, int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                     \n"
			
 
				-    ".set noreorder                                \n"
			
 
				-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
			
 
				-    "blez            $t4, 2f                       \n"
			
 
				-    " andi           %[width], %[width], 0xf       \n"  // residual
			
 
				-
			
 
				-    ".p2align        2                             \n"
			
 
				-  "1:                                              \n"
			
 
				-    "addiu           $t4, $t4, -1                  \n"
			
 
				-    "lwr             $t0, 0(%[src_uv])             \n"
			
 
				-    "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
			
 
				-    "lwr             $t1, 4(%[src_uv])             \n"
			
 
				-    "lwl             $t1, 7(%[src_uv])             \n"  // V3 | U3 | V2 | U2
			
 
				-    "lwr             $t2, 8(%[src_uv])             \n"
			
 
				-    "lwl             $t2, 11(%[src_uv])            \n"  // V5 | U5 | V4 | U4
			
 
				-    "lwr             $t3, 12(%[src_uv])            \n"
			
 
				-    "lwl             $t3, 15(%[src_uv])            \n"  // V7 | U7 | V6 | U6
			
 
				-    "lwr             $t5, 16(%[src_uv])            \n"
			
 
				-    "lwl             $t5, 19(%[src_uv])            \n"  // V9 | U9 | V8 | U8
			
 
				-    "lwr             $t6, 20(%[src_uv])            \n"
			
 
				-    "lwl             $t6, 23(%[src_uv])            \n"  // V11 | U11 | V10 | U10
			
 
				-    "lwr             $t7, 24(%[src_uv])            \n"
			
 
				-    "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
			
 
				-    "lwr             $t8, 28(%[src_uv])            \n"
			
 
				-    "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
			
 
				-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
			
 
				-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
			
 
				-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
			
 
				-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
			
 
				-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
			
 
				-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
			
 
				-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
			
 
				-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
			
 
				-    "addiu           %[src_uv], %[src_uv], 32      \n"
			
 
				-    "swr             $t9, 0(%[dst_v])              \n"
			
 
				-    "swl             $t9, 3(%[dst_v])              \n"
			
 
				-    "swr             $t0, 0(%[dst_u])              \n"
			
 
				-    "swl             $t0, 3(%[dst_u])              \n"
			
 
				-    "swr             $t1, 4(%[dst_v])              \n"
			
 
				-    "swl             $t1, 7(%[dst_v])              \n"
			
 
				-    "swr             $t2, 4(%[dst_u])              \n"
			
 
				-    "swl             $t2, 7(%[dst_u])              \n"
			
 
				-    "swr             $t3, 8(%[dst_v])              \n"
			
 
				-    "swl             $t3, 11(%[dst_v])             \n"
			
 
				-    "swr             $t5, 8(%[dst_u])              \n"
			
 
				-    "swl             $t5, 11(%[dst_u])             \n"
			
 
				-    "swr             $t6, 12(%[dst_v])             \n"
			
 
				-    "swl             $t6, 15(%[dst_v])             \n"
			
 
				-    "swr             $t7, 12(%[dst_u])             \n"
			
 
				-    "swl             $t7, 15(%[dst_u])             \n"
			
 
				-    "addiu           %[dst_u], %[dst_u], 16        \n"
			
 
				-    "bgtz            $t4, 1b                       \n"
			
 
				-    " addiu          %[dst_v], %[dst_v], 16        \n"
			
 
				-
			
 
				-    "beqz            %[width], 3f                  \n"
			
 
				-    " nop                                          \n"
			
 
				-
			
 
				-  "2:                                              \n"
			
 
				-    "lbu             $t0, 0(%[src_uv])             \n"
			
 
				-    "lbu             $t1, 1(%[src_uv])             \n"
			
 
				-    "addiu           %[src_uv], %[src_uv], 2       \n"
			
 
				-    "addiu           %[width], %[width], -1        \n"
			
 
				-    "sb              $t0, 0(%[dst_u])              \n"
			
 
				-    "sb              $t1, 0(%[dst_v])              \n"
			
 
				-    "addiu           %[dst_u], %[dst_u], 1         \n"
			
 
				-    "bgtz            %[width], 2b                  \n"
			
 
				-    " addiu          %[dst_v], %[dst_v], 1         \n"
			
 
				-
			
 
				-  "3:                                              \n"
			
 
				-    ".set pop                                      \n"
			
 
				-     : [src_uv] "+r" (src_uv),
			
 
				-       [width] "+r" (width),
			
 
				-       [dst_u] "+r" (dst_u),
			
 
				-       [dst_v] "+r" (dst_v)
			
 
				-     :
			
 
				-     : "t0", "t1", "t2", "t3",
			
 
				-     "t4", "t5", "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                             \n"
			
 
				-    ".set noreorder                        \n"
			
 
				-
			
 
				-    "srl       $t4, %[width], 4            \n"  // multiplies of 16
			
 
				-    "andi      $t5, %[width], 0xf          \n"
			
 
				-    "blez      $t4, 2f                     \n"
			
 
				-    " addu     %[src], %[src], %[width]    \n"  // src += width
			
 
				-
			
 
				-    ".p2align  2                           \n"
			
 
				-   "1:                                     \n"
			
 
				-    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
			
 
				-    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
			
 
				-    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
			
 
				-    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
			
 
				-    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
			
 
				-    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
			
 
				-    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
			
 
				-    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
			
 
				-    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
			
 
				-    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
			
 
				-    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
			
 
				-    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
			
 
				-    "addiu     %[src], %[src], -16         \n"
			
 
				-    "addiu     $t4, $t4, -1                \n"
			
 
				-    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
			
 
				-    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
			
 
				-    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
			
 
				-    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
			
 
				-    "bgtz      $t4, 1b                     \n"
			
 
				-    " addiu    %[dst], %[dst], 16          \n"
			
 
				-    "beqz      $t5, 3f                     \n"
			
 
				-    " nop                                  \n"
			
 
				-
			
 
				-   "2:                                     \n"
			
 
				-    "lbu       $t0, -1(%[src])             \n"
			
 
				-    "addiu     $t5, $t5, -1                \n"
			
 
				-    "addiu     %[src], %[src], -1          \n"
			
 
				-    "sb        $t0, 0(%[dst])              \n"
			
 
				-    "bgez      $t5, 2b                     \n"
			
 
				-    " addiu    %[dst], %[dst], 1           \n"
			
 
				-
			
 
				-   "3:                                     \n"
			
 
				-    ".set pop                              \n"
			
 
				-      : [src] "+r" (src), [dst] "+r" (dst)
			
 
				-      : [width] "r" (width)
			
 
				-      : "t0", "t1", "t2", "t3", "t4", "t5"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                            int width) {
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                    \n"
			
 
				-    ".set noreorder                               \n"
			
 
				-
			
 
				-    "addu            $t4, %[width], %[width]      \n"
			
 
				-    "srl             %[x], %[width], 4            \n"
			
 
				-    "andi            %[y], %[width], 0xf          \n"
			
 
				-    "blez            %[x], 2f                     \n"
			
 
				-    " addu           %[src_uv], %[src_uv], $t4    \n"
			
 
				-
			
 
				-    ".p2align        2                            \n"
			
 
				-   "1:                                            \n"
			
 
				-    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
			
 
				-    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
			
 
				-    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
			
 
				-    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
			
 
				-    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
			
 
				-    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
			
 
				-    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
			
 
				-    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
			
 
				-
			
 
				-    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
			
 
				-    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
			
 
				-    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
			
 
				-    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
			
 
				-    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
			
 
				-    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
			
 
				-    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
			
 
				-    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
			
 
				-    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
			
 
				-    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
			
 
				-    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
			
 
				-    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
			
 
				-    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
			
 
				-    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
			
 
				-    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
			
 
				-    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
			
 
				-    "addiu           %[src_uv], %[src_uv], -32    \n"
			
 
				-    "addiu           %[x], %[x], -1               \n"
			
 
				-    "swr             $t4, 0(%[dst_u])             \n"
			
 
				-    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
			
 
				-    "swr             $t6, 0(%[dst_v])             \n"
			
 
				-    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
			
 
				-    "swr             $t2, 4(%[dst_u])             \n"
			
 
				-    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
			
 
				-    "swr             $t3, 4(%[dst_v])             \n"
			
 
				-    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
			
 
				-    "swr             $t0, 8(%[dst_u])             \n"
			
 
				-    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
			
 
				-    "swr             $t1, 8(%[dst_v])             \n"
			
 
				-    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
			
 
				-    "swr             $t9, 12(%[dst_u])            \n"
			
 
				-    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
			
 
				-    "swr             $t5, 12(%[dst_v])            \n"
			
 
				-    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
			
 
				-    "addiu           %[dst_v], %[dst_v], 16       \n"
			
 
				-    "bgtz            %[x], 1b                     \n"
			
 
				-    " addiu          %[dst_u], %[dst_u], 16       \n"
			
 
				-    "beqz            %[y], 3f                     \n"
			
 
				-    " nop                                         \n"
			
 
				-    "b               2f                           \n"
			
 
				-    " nop                                         \n"
			
 
				-
			
 
				-   "2:                                            \n"
			
 
				-    "lbu             $t0, -2(%[src_uv])           \n"
			
 
				-    "lbu             $t1, -1(%[src_uv])           \n"
			
 
				-    "addiu           %[src_uv], %[src_uv], -2     \n"
			
 
				-    "addiu           %[y], %[y], -1               \n"
			
 
				-    "sb              $t0, 0(%[dst_u])             \n"
			
 
				-    "sb              $t1, 0(%[dst_v])             \n"
			
 
				-    "addiu           %[dst_u], %[dst_u], 1        \n"
			
 
				-    "bgtz            %[y], 2b                     \n"
			
 
				-    " addiu          %[dst_v], %[dst_v], 1        \n"
			
 
				-
			
 
				-   "3:                                            \n"
			
 
				-    ".set pop                                     \n"
			
 
				-      : [src_uv] "+r" (src_uv),
			
 
				-        [dst_u] "+r" (dst_u),
			
 
				-        [dst_v] "+r" (dst_v),
			
 
				-        [x] "=&r" (x),
			
 
				-        [y] "+r" (y)
			
 
				-      : [width] "r" (width)
			
 
				-      : "t0", "t1", "t2", "t3", "t4",
			
 
				-      "t5", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
			
 
				-// t5 = | 0 | B0 | 0 | b0 |
			
 
				-// t4 = | 0 | B1 | 0 | b1 |
			
 
				-// t9 = | 0 | G0 | 0 | g0 |
			
 
				-// t8 = | 0 | G1 | 0 | g1 |
			
 
				-// t2 = | 0 | R0 | 0 | r0 |
			
 
				-// t1 = | 0 | R1 | 0 | r1 |
			
 
				-#define I422ToTransientMipsRGB                                                 \
			
 
				-      "lw                $t0, 0(%[y_buf])       \n"                            \
			
 
				-      "lhu               $t1, 0(%[u_buf])       \n"                            \
			
 
				-      "lhu               $t2, 0(%[v_buf])       \n"                            \
			
 
				-      "preceu.ph.qbr     $t1, $t1               \n"                            \
			
 
				-      "preceu.ph.qbr     $t2, $t2               \n"                            \
			
 
				-      "preceu.ph.qbra    $t3, $t0               \n"                            \
			
 
				-      "preceu.ph.qbla    $t0, $t0               \n"                            \
			
 
				-      "subu.ph           $t1, $t1, $s5          \n"                            \
			
 
				-      "subu.ph           $t2, $t2, $s5          \n"                            \
			
 
				-      "subu.ph           $t3, $t3, $s4          \n"                            \
			
 
				-      "subu.ph           $t0, $t0, $s4          \n"                            \
			
 
				-      "mul.ph            $t3, $t3, $s0          \n"                            \
			
 
				-      "mul.ph            $t0, $t0, $s0          \n"                            \
			
 
				-      "shll.ph           $t4, $t1, 0x7          \n"                            \
			
 
				-      "subu.ph           $t4, $t4, $t1          \n"                            \
			
 
				-      "mul.ph            $t6, $t1, $s1          \n"                            \
			
 
				-      "mul.ph            $t1, $t2, $s2          \n"                            \
			
 
				-      "addq_s.ph         $t5, $t4, $t3          \n"                            \
			
 
				-      "addq_s.ph         $t4, $t4, $t0          \n"                            \
			
 
				-      "shra.ph           $t5, $t5, 6            \n"                            \
			
 
				-      "shra.ph           $t4, $t4, 6            \n"                            \
			
 
				-      "addiu             %[u_buf], 2            \n"                            \
			
 
				-      "addiu             %[v_buf], 2            \n"                            \
			
 
				-      "addu.ph           $t6, $t6, $t1          \n"                            \
			
 
				-      "mul.ph            $t1, $t2, $s3          \n"                            \
			
 
				-      "addu.ph           $t9, $t6, $t3          \n"                            \
			
 
				-      "addu.ph           $t8, $t6, $t0          \n"                            \
			
 
				-      "shra.ph           $t9, $t9, 6            \n"                            \
			
 
				-      "shra.ph           $t8, $t8, 6            \n"                            \
			
 
				-      "addu.ph           $t2, $t1, $t3          \n"                            \
			
 
				-      "addu.ph           $t1, $t1, $t0          \n"                            \
			
 
				-      "shra.ph           $t2, $t2, 6            \n"                            \
			
 
				-      "shra.ph           $t1, $t1, 6            \n"                            \
			
 
				-      "subu.ph           $t5, $t5, $s5          \n"                            \
			
 
				-      "subu.ph           $t4, $t4, $s5          \n"                            \
			
 
				-      "subu.ph           $t9, $t9, $s5          \n"                            \
			
 
				-      "subu.ph           $t8, $t8, $s5          \n"                            \
			
 
				-      "subu.ph           $t2, $t2, $s5          \n"                            \
			
 
				-      "subu.ph           $t1, $t1, $s5          \n"                            \
			
 
				-      "shll_s.ph         $t5, $t5, 8            \n"                            \
			
 
				-      "shll_s.ph         $t4, $t4, 8            \n"                            \
			
 
				-      "shll_s.ph         $t9, $t9, 8            \n"                            \
			
 
				-      "shll_s.ph         $t8, $t8, 8            \n"                            \
			
 
				-      "shll_s.ph         $t2, $t2, 8            \n"                            \
			
 
				-      "shll_s.ph         $t1, $t1, 8            \n"                            \
			
 
				-      "shra.ph           $t5, $t5, 8            \n"                            \
			
 
				-      "shra.ph           $t4, $t4, 8            \n"                            \
			
 
				-      "shra.ph           $t9, $t9, 8            \n"                            \
			
 
				-      "shra.ph           $t8, $t8, 8            \n"                            \
			
 
				-      "shra.ph           $t2, $t2, 8            \n"                            \
			
 
				-      "shra.ph           $t1, $t1, 8            \n"                            \
			
 
				-      "addu.ph           $t5, $t5, $s5          \n"                            \
			
 
				-      "addu.ph           $t4, $t4, $s5          \n"                            \
			
 
				-      "addu.ph           $t9, $t9, $s5          \n"                            \
			
 
				-      "addu.ph           $t8, $t8, $s5          \n"                            \
			
 
				-      "addu.ph           $t2, $t2, $s5          \n"                            \
			
 
				-      "addu.ph           $t1, $t1, $s5          \n"
			
 
				-
			
 
				-void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
			
 
				-                              const uint8* u_buf,
			
 
				-                              const uint8* v_buf,
			
 
				-                              uint8* rgb_buf,
			
 
				-                              int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                \n"
			
 
				-    ".set noreorder                           \n"
			
 
				-    "beqz              %[width], 2f           \n"
			
 
				-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
			
 
				-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
			
 
				-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
			
 
				-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
			
 
				-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
			
 
				-    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
			
 
				-    "lui               $s6, 0xff00            \n"
			
 
				-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
			
 
				-
			
 
				-    ".p2align          2                      \n"
			
 
				-   "1:                                        \n"
			
 
				-      I422ToTransientMipsRGB
			
 
				-// Arranging into argb format
			
 
				-    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
			
 
				-    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
			
 
				-    "addiu             %[width], -4           \n"
			
 
				-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
			
 
				-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
			
 
				-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
			
 
				-
			
 
				-    "addiu             %[y_buf], 4            \n"
			
 
				-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
			
 
				-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
			
 
				-    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
			
 
				-    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
			
 
				-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
			
 
				-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
			
 
				-    "sll               $t9, $t9, 16           \n"
			
 
				-    "sll               $t8, $t8, 16           \n"
			
 
				-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
			
 
				-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
			
 
				-// Store results.
			
 
				-    "sw                $t2, 0(%[rgb_buf])     \n"
			
 
				-    "sw                $t0, 4(%[rgb_buf])     \n"
			
 
				-    "sw                $t1, 8(%[rgb_buf])     \n"
			
 
				-    "sw                $t3, 12(%[rgb_buf])    \n"
			
 
				-    "bnez              %[width], 1b           \n"
			
 
				-    " addiu            %[rgb_buf], 16         \n"
			
 
				-   "2:                                        \n"
			
 
				-    ".set pop                                 \n"
			
 
				-      :[y_buf] "+r" (y_buf),
			
 
				-       [u_buf] "+r" (u_buf),
			
 
				-       [v_buf] "+r" (v_buf),
			
 
				-       [width] "+r" (width),
			
 
				-       [rgb_buf] "+r" (rgb_buf)
			
 
				-      :
			
 
				-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
			
 
				-      "t6", "t7", "t8", "t9",
			
 
				-      "s0", "s1", "s2", "s3",
			
 
				-      "s4", "s5", "s6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
			
 
				-                              const uint8* u_buf,
			
 
				-                              const uint8* v_buf,
			
 
				-                              uint8* rgb_buf,
			
 
				-                              int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                \n"
			
 
				-    ".set noreorder                           \n"
			
 
				-    "beqz              %[width], 2f           \n"
			
 
				-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
			
 
				-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
			
 
				-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
			
 
				-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
			
 
				-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
			
 
				-    "repl.ph           $s5, 128               \n"  // |128|128|
			
 
				-    "lui               $s6, 0xff00            \n"
			
 
				-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|
			
 
				-
			
 
				-    ".p2align          2                       \n"
			
 
				-   "1:                                         \n"
			
 
				-      I422ToTransientMipsRGB
			
 
				-// Arranging into abgr format
			
 
				-    "precr.qb.ph      $t0, $t8, $t1           \n"  // |G1|g1|R1|r1|
			
 
				-    "precr.qb.ph      $t3, $t9, $t2           \n"  // |G0|g0|R0|r0|
			
 
				-    "precrq.qb.ph     $t8, $t0, $t3           \n"  // |G1|R1|G0|R0|
			
 
				-    "precr.qb.ph      $t9, $t0, $t3           \n"  // |g1|r1|g0|r0|
			
 
				-
			
 
				-    "precr.qb.ph       $t2, $t4, $t5          \n"  // |B1|b1|B0|b0|
			
 
				-    "addiu             %[width], -4           \n"
			
 
				-    "addiu             %[y_buf], 4            \n"
			
 
				-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |B1|0 |B0|
			
 
				-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |b1|0 |b0|
			
 
				-    "or                $t1, $t1, $s6          \n"  // |ff|B1|ff|B0|
			
 
				-    "or                $t2, $t2, $s6          \n"  // |ff|b1|ff|b0|
			
 
				-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|b1|g1|r1|
			
 
				-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|B1|G1|R1|
			
 
				-    "sll               $t9, $t9, 16           \n"
			
 
				-    "sll               $t8, $t8, 16           \n"
			
 
				-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|b0|g0|r0|
			
 
				-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|B0|G0|R0|
			
 
				-// Store results.
			
 
				-    "sw                $t2, 0(%[rgb_buf])     \n"
			
 
				-    "sw                $t0, 4(%[rgb_buf])     \n"
			
 
				-    "sw                $t1, 8(%[rgb_buf])     \n"
			
 
				-    "sw                $t3, 12(%[rgb_buf])    \n"
			
 
				-    "bnez              %[width], 1b           \n"
			
 
				-    " addiu            %[rgb_buf], 16         \n"
			
 
				-   "2:                                        \n"
			
 
				-    ".set pop                                 \n"
			
 
				-      :[y_buf] "+r" (y_buf),
			
 
				-       [u_buf] "+r" (u_buf),
			
 
				-       [v_buf] "+r" (v_buf),
			
 
				-       [width] "+r" (width),
			
 
				-       [rgb_buf] "+r" (rgb_buf)
			
 
				-      :
			
 
				-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
			
 
				-      "t6", "t7", "t8", "t9",
			
 
				-      "s0", "s1", "s2", "s3",
			
 
				-      "s4", "s5", "s6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
			
 
				-                              const uint8* u_buf,
			
 
				-                              const uint8* v_buf,
			
 
				-                              uint8* rgb_buf,
			
 
				-                              int width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                \n"
			
 
				-    ".set noreorder                           \n"
			
 
				-    "beqz              %[width], 2f           \n"
			
 
				-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74 |74 |
			
 
				-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
			
 
				-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
			
 
				-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
			
 
				-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
			
 
				-    "repl.ph           $s5, 128               \n"  // |128|128|
			
 
				-    "lui               $s6, 0xff              \n"
			
 
				-    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|
			
 
				-
			
 
				-    ".p2align          2                      \n"
			
 
				-   "1:                                        \n"
			
 
				-      I422ToTransientMipsRGB
			
 
				-      // Arranging into bgra format
			
 
				-    "precr.qb.ph       $t4, $t4, $t8          \n"  // |B1|b1|G1|g1|
			
 
				-    "precr.qb.ph       $t5, $t5, $t9          \n"  // |B0|b0|G0|g0|
			
 
				-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |B1|G1|B0|G0|
			
 
				-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |b1|g1|b0|g0|
			
 
				-
			
 
				-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
			
 
				-    "addiu             %[width], -4           \n"
			
 
				-    "addiu             %[y_buf], 4            \n"
			
 
				-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
			
 
				-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
			
 
				-    "sll               $t1, $t1, 8            \n"  // |R1|0 |R0|0 |
			
 
				-    "sll               $t2, $t2, 8            \n"  // |r1|0 |r0|0 |
			
 
				-    "or                $t1, $t1, $s6          \n"  // |R1|ff|R0|ff|
			
 
				-    "or                $t2, $t2, $s6          \n"  // |r1|ff|r0|ff|
			
 
				-    "precrq.ph.w       $t0, $t9, $t2          \n"  // |b1|g1|r1|ff|
			
 
				-    "precrq.ph.w       $t3, $t8, $t1          \n"  // |B1|G1|R1|ff|
			
 
				-    "sll               $t1, $t1, 16           \n"
			
 
				-    "sll               $t2, $t2, 16           \n"
			
 
				-    "packrl.ph         $t2, $t9, $t2          \n"  // |b0|g0|r0|ff|
			
 
				-    "packrl.ph         $t1, $t8, $t1          \n"  // |B0|G0|R0|ff|
			
 
				-// Store results.
			
 
				-    "sw                $t2, 0(%[rgb_buf])     \n"
			
 
				-    "sw                $t0, 4(%[rgb_buf])     \n"
			
 
				-    "sw                $t1, 8(%[rgb_buf])     \n"
			
 
				-    "sw                $t3, 12(%[rgb_buf])    \n"
			
 
				-    "bnez              %[width], 1b           \n"
			
 
				-    " addiu            %[rgb_buf], 16         \n"
			
 
				-   "2:                                        \n"
			
 
				-    ".set pop                                 \n"
			
 
				-      :[y_buf] "+r" (y_buf),
			
 
				-       [u_buf] "+r" (u_buf),
			
 
				-       [v_buf] "+r" (v_buf),
			
 
				-       [width] "+r" (width),
			
 
				-       [rgb_buf] "+r" (rgb_buf)
			
 
				-      :
			
 
				-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
			
 
				-      "t6", "t7", "t8", "t9",
			
 
				-      "s0", "s1", "s2", "s3",
			
 
				-      "s4", "s5", "s6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Bilinear filter 8x2 -> 8x1
			
 
				-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                ptrdiff_t src_stride, int dst_width,
			
 
				-                                int source_y_fraction) {
			
 
				-    int y0_fraction = 256 - source_y_fraction;
			
 
				-    const uint8* src_ptr1 = src_ptr + src_stride;
			
 
				-
			
 
				-  __asm__ __volatile__ (
			
 
				-     ".set push                                           \n"
			
 
				-     ".set noreorder                                      \n"
			
 
				-
			
 
				-     "replv.ph          $t0, %[y0_fraction]               \n"
			
 
				-     "replv.ph          $t1, %[source_y_fraction]         \n"
			
 
				-
			
 
				-    ".p2align           2                                 \n"
			
 
				-   "1:                                                    \n"
			
 
				-     "lw                $t2, 0(%[src_ptr])                \n"
			
 
				-     "lw                $t3, 0(%[src_ptr1])               \n"
			
 
				-     "lw                $t4, 4(%[src_ptr])                \n"
			
 
				-     "lw                $t5, 4(%[src_ptr1])               \n"
			
 
				-     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
			
 
				-     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
			
 
				-     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
			
 
				-     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
			
 
				-     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
			
 
				-     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
			
 
				-     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
			
 
				-     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
			
 
				-     "addq.ph           $t6, $t6, $t8                     \n"
			
 
				-     "addq.ph           $t7, $t7, $t9                     \n"
			
 
				-     "addq.ph           $t2, $t2, $t4                     \n"
			
 
				-     "addq.ph           $t3, $t3, $t5                     \n"
			
 
				-     "shra.ph           $t6, $t6, 8                       \n"
			
 
				-     "shra.ph           $t7, $t7, 8                       \n"
			
 
				-     "shra.ph           $t2, $t2, 8                       \n"
			
 
				-     "shra.ph           $t3, $t3, 8                       \n"
			
 
				-     "precr.qb.ph       $t6, $t6, $t7                     \n"
			
 
				-     "precr.qb.ph       $t2, $t2, $t3                     \n"
			
 
				-     "addiu             %[src_ptr], %[src_ptr], 8         \n"
			
 
				-     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
			
 
				-     "addiu             %[dst_width], %[dst_width], -8    \n"
			
 
				-     "sw                $t6, 0(%[dst_ptr])                \n"
			
 
				-     "sw                $t2, 4(%[dst_ptr])                \n"
			
 
				-     "bgtz              %[dst_width], 1b                  \n"
			
 
				-     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
			
 
				-
			
 
				-     ".set pop                                            \n"
			
 
				-  : [dst_ptr] "+r" (dst_ptr),
			
 
				-    [src_ptr1] "+r" (src_ptr1),
			
 
				-    [src_ptr] "+r" (src_ptr),
			
 
				-    [dst_width] "+r" (dst_width)
			
 
				-  : [source_y_fraction] "r" (source_y_fraction),
			
 
				-    [y0_fraction] "r" (y0_fraction),
			
 
				-    [src_stride] "r" (src_stride)
			
 
				-  : "t0", "t1", "t2", "t3", "t4", "t5",
			
 
				-    "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // __mips_dsp_rev >= 2
			
 
				-
			
 
				-#endif  // defined(__mips__)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_neon.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_neon.cc
@@ -1,2847 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// This module is for GCC Neon
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
			
 
				-
			
 
				-// Read 8 Y, 4 U and 4 V from 422
			
 
				-#define READYUV422                                                             \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
			
 
				-    "vld1.32    {d2[1]}, [%2]!                 \n"
			
 
				-
			
 
				-// Read 8 Y, 2 U and 2 V from 422
			
 
				-#define READYUV411                                                             \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
			
 
				-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
			
 
				-    "vmov.u8    d3, d2                         \n"                             \
			
 
				-    "vzip.u8    d2, d3                         \n"
			
 
				-
			
 
				-// Read 8 Y, 8 U and 8 V from 444
			
 
				-#define READYUV444                                                             \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vld1.8     {d2}, [%1]!                    \n"                             \
			
 
				-    "vld1.8     {d3}, [%2]!                    \n"                             \
			
 
				-    "vpaddl.u8  q1, q1                         \n"                             \
			
 
				-    "vrshrn.u16 d2, q1, #1                     \n"
			
 
				-
			
 
				-// Read 8 Y, and set 4 U and 4 V to 128
			
 
				-#define READYUV400                                                             \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vmov.u8    d2, #128                       \n"
			
 
				-
			
 
				-// Read 8 Y and 4 UV from NV12
			
 
				-#define READNV12                                                               \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vld1.8     {d2}, [%1]!                    \n"                             \
			
 
				-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
			
 
				-    "vuzp.u8    d2, d3                         \n"                             \
			
 
				-    "vtrn.u32   d2, d3                         \n"
			
 
				-
			
 
				-// Read 8 Y and 4 VU from NV21
			
 
				-#define READNV21                                                               \
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"                             \
			
 
				-    "vld1.8     {d2}, [%1]!                    \n"                             \
			
 
				-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
			
 
				-    "vuzp.u8    d3, d2                         \n"                             \
			
 
				-    "vtrn.u32   d2, d3                         \n"
			
 
				-
			
 
				-// Read 8 YUY2
			
 
				-#define READYUY2                                                               \
			
 
				-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
			
 
				-    "vmov.u8    d3, d2                         \n"                             \
			
 
				-    "vuzp.u8    d2, d3                         \n"                             \
			
 
				-    "vtrn.u32   d2, d3                         \n"
			
 
				-
			
 
				-// Read 8 UYVY
			
 
				-#define READUYVY                                                               \
			
 
				-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
			
 
				-    "vmov.u8    d0, d3                         \n"                             \
			
 
				-    "vmov.u8    d3, d2                         \n"                             \
			
 
				-    "vuzp.u8    d2, d3                         \n"                             \
			
 
				-    "vtrn.u32   d2, d3                         \n"
			
 
				-
			
 
				-#define YUV422TORGB                                                            \
			
 
				-    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
			
 
				-    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
			
 
				-    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
			
 
				-    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
			
 
				-    "vtrn.u8    d0, d1                         \n"                             \
			
 
				-    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
			
 
				-    "vmul.s16   q0, q0, q14                    \n"                             \
			
 
				-    "vadd.s16   d18, d19                       \n"                             \
			
 
				-    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
			
 
				-    "vqadd.s16  d21, d1, d16                   \n"                             \
			
 
				-    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
			
 
				-    "vqadd.s16  d23, d1, d17                   \n"                             \
			
 
				-    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
			
 
				-    "vqadd.s16  d17, d1, d18                   \n"                             \
			
 
				-    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
			
 
				-    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
			
 
				-    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
			
 
				-    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
			
 
				-    "vmovl.u8   q11, d1                        \n"                             \
			
 
				-    "vmovl.u8   q8, d2                         \n"                             \
			
 
				-    "vtrn.u8    d20, d21                       \n"                             \
			
 
				-    "vtrn.u8    d22, d23                       \n"                             \
			
 
				-    "vtrn.u8    d16, d17                       \n"                             \
			
 
				-    "vmov.u8    d21, d16                       \n"
			
 
				-
			
 
				-static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
			
 
				-                         0, 0, 0, 0, 0, 0, 0, 0 };
			
 
				-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
			
 
				-                       0, 0, 0, 0, 0, 0, 0, 0 };
			
 
				-
			
 
				-void I444ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-#ifdef _ANDROID
			
 
				-				".fpu neon\n"
			
 
				-#endif
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV444
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_argb),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_argb),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I411ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV411
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_argb),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToBGRARow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_bgra,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vswp.u8    d20, d22                       \n"
			
 
				-    "vmov.u8    d19, #255                      \n"
			
 
				-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_bgra),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToABGRRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_abgr,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vswp.u8    d20, d22                       \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_abgr),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToRGBARow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_rgba,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d19, #255                      \n"
			
 
				-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_u),     // %1
			
 
				-      "+r"(src_v),     // %2
			
 
				-      "+r"(dst_rgba),  // %3
			
 
				-      "+r"(width)      // %4
			
 
				-    : "r"(&kUVToRB),   // %5
			
 
				-      "r"(&kUVToG)     // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToRGB24Row_NEON(const uint8* src_y,
			
 
				-                         const uint8* src_u,
			
 
				-                         const uint8* src_v,
			
 
				-                         uint8* dst_rgb24,
			
 
				-                         int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),      // %0
			
 
				-      "+r"(src_u),      // %1
			
 
				-      "+r"(src_v),      // %2
			
 
				-      "+r"(dst_rgb24),  // %3
			
 
				-      "+r"(width)       // %4
			
 
				-    : "r"(&kUVToRB),    // %5
			
 
				-      "r"(&kUVToG)      // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToRAWRow_NEON(const uint8* src_y,
			
 
				-                       const uint8* src_u,
			
 
				-                       const uint8* src_v,
			
 
				-                       uint8* dst_raw,
			
 
				-                       int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vswp.u8    d20, d22                       \n"
			
 
				-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),    // %0
			
 
				-      "+r"(src_u),    // %1
			
 
				-      "+r"(src_v),    // %2
			
 
				-      "+r"(dst_raw),  // %3
			
 
				-      "+r"(width)     // %4
			
 
				-    : "r"(&kUVToRB),  // %5
			
 
				-      "r"(&kUVToG)    // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define ARGBTORGB565                                                           \
			
 
				-    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
			
 
				-    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
			
 
				-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
			
 
				-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
			
 
				-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
			
 
				-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
			
 
				-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
			
 
				-    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
			
 
				-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
			
 
				-    "vorr       q0, q0, q10                    \n"  /* BGR                  */
			
 
				-
			
 
				-void I422ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_u,
			
 
				-                          const uint8* src_v,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    ARGBTORGB565
			
 
				-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),    // %0
			
 
				-      "+r"(src_u),    // %1
			
 
				-      "+r"(src_v),    // %2
			
 
				-      "+r"(dst_rgb565),  // %3
			
 
				-      "+r"(width)     // %4
			
 
				-    : "r"(&kUVToRB),  // %5
			
 
				-      "r"(&kUVToG)    // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define ARGBTOARGB1555                                                         \
			
 
				-    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
			
 
				-    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
			
 
				-    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
			
 
				-    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
			
 
				-    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
			
 
				-    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
			
 
				-    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
			
 
				-    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
			
 
				-    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
			
 
				-    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
			
 
				-    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
			
 
				-    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
			
 
				-    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
			
 
				-
			
 
				-void I422ToARGB1555Row_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb1555,
			
 
				-                            int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    ARGBTOARGB1555
			
 
				-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),    // %0
			
 
				-      "+r"(src_u),    // %1
			
 
				-      "+r"(src_v),    // %2
			
 
				-      "+r"(dst_argb1555),  // %3
			
 
				-      "+r"(width)     // %4
			
 
				-    : "r"(&kUVToRB),  // %5
			
 
				-      "r"(&kUVToG)    // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define ARGBTOARGB4444                                                         \
			
 
				-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
			
 
				-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
			
 
				-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
			
 
				-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
			
 
				-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
			
 
				-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
			
 
				-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
			
 
				-
			
 
				-void I422ToARGB4444Row_NEON(const uint8* src_y,
			
 
				-                            const uint8* src_u,
			
 
				-                            const uint8* src_v,
			
 
				-                            uint8* dst_argb4444,
			
 
				-                            int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%5]                    \n"
			
 
				-    "vld1.8     {d25}, [%6]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUV422TORGB
			
 
				-    "subs       %4, %4, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    ARGBTOARGB4444
			
 
				-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),    // %0
			
 
				-      "+r"(src_u),    // %1
			
 
				-      "+r"(src_v),    // %2
			
 
				-      "+r"(dst_argb4444),  // %3
			
 
				-      "+r"(width)     // %4
			
 
				-    : "r"(&kUVToRB),  // %5
			
 
				-      "r"(&kUVToG)    // %6
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YToARGBRow_NEON(const uint8* src_y,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%3]                    \n"
			
 
				-    "vld1.8     {d25}, [%4]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUV400
			
 
				-    YUV422TORGB
			
 
				-    "subs       %2, %2, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(dst_argb),  // %1
			
 
				-      "+r"(width)      // %2
			
 
				-    : "r"(&kUVToRB),   // %3
			
 
				-      "r"(&kUVToG)     // %4
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I400ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d20}, [%0]!                   \n"
			
 
				-    "vmov       d21, d20                       \n"
			
 
				-    "vmov       d22, d20                       \n"
			
 
				-    "subs       %2, %2, #8                     \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(dst_argb),  // %1
			
 
				-      "+r"(width)      // %2
			
 
				-    :
			
 
				-    : "cc", "memory", "d20", "d21", "d22", "d23"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void NV12ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_uv,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%4]                    \n"
			
 
				-    "vld1.8     {d25}, [%5]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YUV422TORGB
			
 
				-    "subs       %3, %3, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_uv),    // %1
			
 
				-      "+r"(dst_argb),  // %2
			
 
				-      "+r"(width)      // %3
			
 
				-    : "r"(&kUVToRB),   // %4
			
 
				-      "r"(&kUVToG)     // %5
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void NV21ToARGBRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_uv,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%4]                    \n"
			
 
				-    "vld1.8     {d25}, [%5]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READNV21
			
 
				-    YUV422TORGB
			
 
				-    "subs       %3, %3, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_uv),    // %1
			
 
				-      "+r"(dst_argb),  // %2
			
 
				-      "+r"(width)      // %3
			
 
				-    : "r"(&kUVToRB),   // %4
			
 
				-      "r"(&kUVToG)     // %5
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void NV12ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_uv,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%4]                    \n"
			
 
				-    "vld1.8     {d25}, [%5]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YUV422TORGB
			
 
				-    "subs       %3, %3, #8                     \n"
			
 
				-    ARGBTORGB565
			
 
				-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_uv),    // %1
			
 
				-      "+r"(dst_rgb565),  // %2
			
 
				-      "+r"(width)      // %3
			
 
				-    : "r"(&kUVToRB),   // %4
			
 
				-      "r"(&kUVToG)     // %5
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void NV21ToRGB565Row_NEON(const uint8* src_y,
			
 
				-                          const uint8* src_uv,
			
 
				-                          uint8* dst_rgb565,
			
 
				-                          int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%4]                    \n"
			
 
				-    "vld1.8     {d25}, [%5]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READNV21
			
 
				-    YUV422TORGB
			
 
				-    "subs       %3, %3, #8                     \n"
			
 
				-    ARGBTORGB565
			
 
				-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_y),     // %0
			
 
				-      "+r"(src_uv),    // %1
			
 
				-      "+r"(dst_rgb565),  // %2
			
 
				-      "+r"(width)      // %3
			
 
				-    : "r"(&kUVToRB),   // %4
			
 
				-      "r"(&kUVToG)     // %5
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%3]                    \n"
			
 
				-    "vld1.8     {d25}, [%4]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READYUY2
			
 
				-    YUV422TORGB
			
 
				-    "subs       %2, %2, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_yuy2),  // %0
			
 
				-      "+r"(dst_argb),  // %1
			
 
				-      "+r"(width)      // %2
			
 
				-    : "r"(&kUVToRB),   // %3
			
 
				-      "r"(&kUVToG)     // %4
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
			
 
				-                        uint8* dst_argb,
			
 
				-                        int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {d24}, [%3]                    \n"
			
 
				-    "vld1.8     {d25}, [%4]                    \n"
			
 
				-    "vmov.u8    d26, #128                      \n"
			
 
				-    "vmov.u16   q14, #74                       \n"
			
 
				-    "vmov.u16   q15, #16                       \n"
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    READUYVY
			
 
				-    YUV422TORGB
			
 
				-    "subs       %2, %2, #8                     \n"
			
 
				-    "vmov.u8    d23, #255                      \n"
			
 
				-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_uyvy),  // %0
			
 
				-      "+r"(dst_argb),  // %1
			
 
				-      "+r"(width)      // %2
			
 
				-    : "r"(&kUVToRB),   // %3
			
 
				-      "r"(&kUVToG)     // %4
			
 
				-    : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
			
 
				-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                     int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"  // store U
			
 
				-    "vst1.8     {q1}, [%2]!                    \n"  // store V
			
 
				-    "bgt        1b                             \n"
			
 
				-    : "+r"(src_uv),  // %0
			
 
				-      "+r"(dst_u),   // %1
			
 
				-      "+r"(dst_v),   // %2
			
 
				-      "+r"(width)    // %3  // Output registers
			
 
				-    :                       // Input registers
			
 
				-    : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Reads 16 U's and V's and writes out 16 pairs of UV.
			
 
				-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load U
			
 
				-    "vld1.8     {q1}, [%1]!                    \n"  // load V
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
			
 
				-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
			
 
				-    "bgt        1b                             \n"
			
 
				-    :
			
 
				-      "+r"(src_u),   // %0
			
 
				-      "+r"(src_v),   // %1
			
 
				-      "+r"(dst_uv),  // %2
			
 
				-      "+r"(width)    // %3  // Output registers
			
 
				-    :                       // Input registers
			
 
				-    : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
			
 
				-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
			
 
				-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
			
 
				-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(count)  // %2  // Output registers
			
 
				-  :                     // Input registers
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
			
 
				-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
			
 
				-  asm volatile (
			
 
				-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
			
 
				-    "1:                                        \n"
			
 
				-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
			
 
				-    "vst1.8    {q0}, [%0]!                     \n"  // store
			
 
				-    "bgt       1b                              \n"
			
 
				-  : "+r"(dst),   // %0
			
 
				-    "+r"(count)  // %1
			
 
				-  : "r"(v32)     // %2
			
 
				-  : "cc", "memory", "q0"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): Make fully assembler
			
 
				-// SetRow32 writes 'count' words using a 32 bit value repeated.
			
 
				-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
			
 
				-                      int dst_stride, int height) {
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    SetRow_NEON(dst, v32, width << 2);
			
 
				-    dst += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    // Start at end of source row.
			
 
				-    "mov        r3, #-16                       \n"
			
 
				-    "add        %0, %0, %2                     \n"
			
 
				-    "sub        %0, #16                        \n"
			
 
				-
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
			
 
				-    "subs       %2, #16                        \n"  // 16 pixels per loop.
			
 
				-    "vrev64.8   q0, q0                         \n"
			
 
				-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "r3", "q0"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                      int width) {
			
 
				-  asm volatile (
			
 
				-    // Start at end of source row.
			
 
				-    "mov        r12, #-16                      \n"
			
 
				-    "add        %0, %0, %3, lsl #1             \n"
			
 
				-    "sub        %0, #16                        \n"
			
 
				-
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
			
 
				-    "subs       %3, #8                         \n"  // 8 pixels per loop.
			
 
				-    "vrev64.8   q0, q0                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
			
 
				-    "vst1.8     {d1}, [%2]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_uv),  // %0
			
 
				-    "+r"(dst_u),   // %1
			
 
				-    "+r"(dst_v),   // %2
			
 
				-    "+r"(width)    // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "r12", "q0"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    // Start at end of source row.
			
 
				-    "mov        r3, #-16                       \n"
			
 
				-    "add        %0, %0, %2, lsl #2             \n"
			
 
				-    "sub        %0, #16                        \n"
			
 
				-
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
			
 
				-    "subs       %2, #4                         \n"  // 4 pixels per loop.
			
 
				-    "vrev64.32  q0, q0                         \n"
			
 
				-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "r3", "q0"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #255                       \n"  // Alpha
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb24),  // %0
			
 
				-    "+r"(dst_argb),   // %1
			
 
				-    "+r"(pix)         // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #255                       \n"  // Alpha
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vswp.u8    d1, d3                         \n"  // swap R, B
			
 
				-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_raw),   // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define RGB565TOARGB                                                           \
			
 
				-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
			
 
				-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
			
 
				-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
			
 
				-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
			
 
				-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
			
 
				-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
			
 
				-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
			
 
				-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
			
 
				-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
			
 
				-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
			
 
				-
			
 
				-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d3, #255                       \n"  // Alpha
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    RGB565TOARGB
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb565),  // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(pix)          // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define ARGB1555TOARGB                                                         \
			
 
				-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
			
 
				-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
			
 
				-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
			
 
				-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
			
 
				-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
			
 
				-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
			
 
				-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
			
 
				-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
			
 
				-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
			
 
				-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
			
 
				-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
			
 
				-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
			
 
				-
			
 
				-// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
			
 
				-#define RGB555TOARGB                                                           \
			
 
				-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
			
 
				-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
			
 
				-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
			
 
				-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
			
 
				-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
			
 
				-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
			
 
				-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
			
 
				-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
			
 
				-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
			
 
				-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
			
 
				-
			
 
				-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                            int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d3, #255                       \n"  // Alpha
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGB1555TOARGB
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb1555),  // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(pix)          // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#define ARGB4444TOARGB                                                         \
			
 
				-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
			
 
				-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
			
 
				-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
			
 
				-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
			
 
				-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
			
 
				-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
			
 
				-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
			
 
				-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
			
 
				-
			
 
				-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                            int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d3, #255                       \n"  // Alpha
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb4444),  // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(pix)          // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_rgb24),  // %1
			
 
				-    "+r"(pix)         // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vswp.u8    d1, d3                         \n"  // swap R, B
			
 
				-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_raw),   // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
			
 
				-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_yuy2),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
			
 
				-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
			
 
				-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_uyvy),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
			
 
				-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
			
 
				-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_yuy2),  // %0
			
 
				-    "+r"(dst_u),     // %1
			
 
				-    "+r"(dst_v),     // %2
			
 
				-    "+r"(pix)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
			
 
				-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_uyvy),  // %0
			
 
				-    "+r"(dst_u),     // %1
			
 
				-    "+r"(dst_v),     // %2
			
 
				-    "+r"(pix)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
			
 
				-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
			
 
				-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
			
 
				-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
			
 
				-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
			
 
				-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_yuy2),     // %0
			
 
				-    "+r"(stride_yuy2),  // %1
			
 
				-    "+r"(dst_u),        // %2
			
 
				-    "+r"(dst_v),        // %3
			
 
				-    "+r"(pix)           // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
			
 
				-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
			
 
				-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
			
 
				-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
			
 
				-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_uyvy),     // %0
			
 
				-    "+r"(stride_uyvy),  // %1
			
 
				-    "+r"(dst_u),        // %2
			
 
				-    "+r"(dst_v),        // %3
			
 
				-    "+r"(pix)           // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix) {
			
 
				-  asm volatile (
			
 
				-    // change the stride to row 2 pointer
			
 
				-    "add        %1, %0                         \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
			
 
				-    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
			
 
				-    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
			
 
				-    "vst1.8     {q0}, [%2]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_uv),         // %0
			
 
				-    "+r"(src_uv_stride),  // %1
			
 
				-    "+r"(dst_uv),         // %2
			
 
				-    "+r"(pix)             // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
			
 
				-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                         uint32 selector, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u32   d6[0], %3                      \n"  // selector
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
			
 
				-    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
			
 
				-    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
			
 
				-    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
			
 
				-    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_bayer),  // %1
			
 
				-    "+r"(pix)         // %2
			
 
				-  : "r"(selector)     // %3
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Select G channels from ARGB.  e.g.  GGGGGGGG
			
 
				-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                           uint32 /*selector*/, int pix) {
			
 
				-  asm volatile (
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
			
 
				-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_bayer),  // %1
			
 
				-    "+r"(pix)         // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
			
 
				-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
			
 
				-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
			
 
				-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
			
 
				-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
			
 
				-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "r"(shuffler)    // %3
			
 
				-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToYUY2Row_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_yuy2, int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
			
 
				-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
			
 
				-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 pixels
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_y),     // %0
			
 
				-    "+r"(src_u),     // %1
			
 
				-    "+r"(src_v),     // %2
			
 
				-    "+r"(dst_yuy2),  // %3
			
 
				-    "+r"(width)      // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I422ToUYVYRow_NEON(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_uyvy, int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
			
 
				-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
			
 
				-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 pixels
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_y),     // %0
			
 
				-    "+r"(src_u),     // %1
			
 
				-    "+r"(src_v),     // %2
			
 
				-    "+r"(dst_uyvy),  // %3
			
 
				-    "+r"(width)      // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGBTORGB565
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_rgb565),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
			
 
				-                            int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGBTOARGB1555
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb1555),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
			
 
				-                            int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGBTOARGB4444
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),      // %0
			
 
				-    "+r"(dst_argb4444),  // %1
			
 
				-    "+r"(pix)            // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d27                        \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
			
 
				-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
			
 
				-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 8x1 pixels.
			
 
				-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlsl.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlsl.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
			
 
				-
			
 
				-    "vmull.u8   q3, d2, d24                    \n"  // R
			
 
				-    "vmlsl.u8   q3, d1, d28                    \n"  // G
			
 
				-    "vmlsl.u8   q3, d0, d27                    \n"  // B
			
 
				-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
			
 
				-
			
 
				-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
			
 
				-
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_u),     // %1
			
 
				-    "+r"(dst_v),     // %2
			
 
				-    "+r"(pix)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
			
 
				-void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
			
 
				-
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
			
 
				-    "vmul.s16   q8, q0, q10                    \n"  // B
			
 
				-    "vmls.s16   q8, q1, q11                    \n"  // G
			
 
				-    "vmls.s16   q8, q2, q12                    \n"  // R
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
			
 
				-
			
 
				-    "vmul.s16   q9, q2, q10                    \n"  // R
			
 
				-    "vmls.s16   q9, q1, q14                    \n"  // G
			
 
				-    "vmls.s16   q9, q0, q13                    \n"  // B
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
			
 
				-
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
			
 
				-
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_u),     // %1
			
 
				-    "+r"(dst_v),     // %2
			
 
				-    "+r"(pix)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
			
 
				-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                         int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
			
 
				-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
			
 
				-    "vpadd.u16  d1, d8, d9                     \n"  // B
			
 
				-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
			
 
				-    "vpadd.u16  d3, d10, d11                   \n"  // G
			
 
				-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
			
 
				-    "vpadd.u16  d5, d12, d13                   \n"  // R
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
			
 
				-    "vmul.s16   q8, q0, q10                    \n"  // B
			
 
				-    "vmls.s16   q8, q1, q11                    \n"  // G
			
 
				-    "vmls.s16   q8, q2, q12                    \n"  // R
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
			
 
				-    "vmul.s16   q9, q2, q10                    \n"  // R
			
 
				-    "vmls.s16   q9, q1, q14                    \n"  // G
			
 
				-    "vmls.s16   q9, q0, q13                    \n"  // B
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_u),     // %1
			
 
				-    "+r"(dst_v),     // %2
			
 
				-    "+r"(pix)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
			
 
				-#define RGBTOUV(QB, QG, QR) \
			
 
				-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
			
 
				-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
			
 
				-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
			
 
				-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
			
 
				-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
			
 
				-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
			
 
				-
			
 
				-// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
			
 
				-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
			
 
				-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q0, q1, q2)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(src_stride_argb),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): Subsample match C code.
			
 
				-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
			
 
				-    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
			
 
				-    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
			
 
				-    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
			
 
				-    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
			
 
				-    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
			
 
				-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q0, q1, q2)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(src_stride_argb),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
			
 
				-    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
			
 
				-    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-    "vrshr.u16  q3, q3, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q3, q2, q1)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_bgra),  // %0
			
 
				-    "+r"(src_stride_bgra),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q2, q1, q0)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_abgr),  // %0
			
 
				-    "+r"(src_stride_abgr),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
			
 
				-    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
			
 
				-    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
			
 
				-    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q0, q1, q2)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgba),  // %0
			
 
				-    "+r"(src_stride_rgba),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
			
 
				-                       uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
			
 
				-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
			
 
				-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
			
 
				-    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q0, q1, q2)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb24),  // %0
			
 
				-    "+r"(src_stride_rgb24),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
			
 
				-                     uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_raw
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
			
 
				-    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
			
 
				-    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
			
 
				-    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q1, q1, #1                     \n"
			
 
				-    "vrshr.u16  q2, q2, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
			
 
				-    RGBTOUV(q2, q1, q0)
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_raw),  // %0
			
 
				-    "+r"(src_stride_raw),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
			
 
				-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
			
 
				-                        uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
			
 
				-    RGB565TOARGB
			
 
				-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
			
 
				-    RGB565TOARGB
			
 
				-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
			
 
				-    RGB565TOARGB
			
 
				-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
			
 
				-    RGB565TOARGB
			
 
				-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q5, q5, #1                     \n"
			
 
				-    "vrshr.u16  q6, q6, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
			
 
				-    "vmul.s16   q8, q4, q10                    \n"  // B
			
 
				-    "vmls.s16   q8, q5, q11                    \n"  // G
			
 
				-    "vmls.s16   q8, q6, q12                    \n"  // R
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
			
 
				-    "vmul.s16   q9, q6, q10                    \n"  // R
			
 
				-    "vmls.s16   q9, q5, q14                    \n"  // G
			
 
				-    "vmls.s16   q9, q4, q13                    \n"  // B
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb565),  // %0
			
 
				-    "+r"(src_stride_rgb565),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
			
 
				-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
			
 
				-                        uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
			
 
				-    RGB555TOARGB
			
 
				-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
			
 
				-    RGB555TOARGB
			
 
				-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
			
 
				-    RGB555TOARGB
			
 
				-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
			
 
				-    RGB555TOARGB
			
 
				-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q5, q5, #1                     \n"
			
 
				-    "vrshr.u16  q6, q6, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
			
 
				-    "vmul.s16   q8, q4, q10                    \n"  // B
			
 
				-    "vmls.s16   q8, q5, q11                    \n"  // G
			
 
				-    "vmls.s16   q8, q6, q12                    \n"  // R
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
			
 
				-    "vmul.s16   q9, q6, q10                    \n"  // R
			
 
				-    "vmls.s16   q9, q5, q14                    \n"  // G
			
 
				-    "vmls.s16   q9, q4, q13                    \n"  // B
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb1555),  // %0
			
 
				-    "+r"(src_stride_argb1555),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
			
 
				-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
			
 
				-                          uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
			
 
				-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
			
 
				-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
			
 
				-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
			
 
				-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
			
 
				-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
			
 
				-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
			
 
				-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
			
 
				-
			
 
				-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
			
 
				-    "vrshr.u16  q5, q5, #1                     \n"
			
 
				-    "vrshr.u16  q6, q6, #1                     \n"
			
 
				-
			
 
				-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
			
 
				-    "vmul.s16   q8, q4, q10                    \n"  // B
			
 
				-    "vmls.s16   q8, q5, q11                    \n"  // G
			
 
				-    "vmls.s16   q8, q6, q12                    \n"  // R
			
 
				-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
			
 
				-    "vmul.s16   q9, q6, q10                    \n"  // R
			
 
				-    "vmls.s16   q9, q5, q14                    \n"  // G
			
 
				-    "vmls.s16   q9, q4, q13                    \n"  // B
			
 
				-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
			
 
				-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
			
 
				-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
			
 
				-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb4444),  // %0
			
 
				-    "+r"(src_stride_argb4444),  // %1
			
 
				-    "+r"(dst_u),     // %2
			
 
				-    "+r"(dst_v),     // %3
			
 
				-    "+r"(pix)        // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
			
 
				-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    RGB565TOARGB
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d27                        \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb565),  // %0
			
 
				-    "+r"(dst_y),       // %1
			
 
				-    "+r"(pix)          // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGB1555TOARGB
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d27                        \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb1555),  // %0
			
 
				-    "+r"(dst_y),         // %1
			
 
				-    "+r"(pix)            // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    ARGB4444TOARGB
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d27                        \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb4444),  // %0
			
 
				-    "+r"(dst_y),         // %1
			
 
				-    "+r"(pix)            // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q8, d1, d4                     \n"  // R
			
 
				-    "vmlal.u8   q8, d2, d5                     \n"  // G
			
 
				-    "vmlal.u8   q8, d3, d6                     \n"  // B
			
 
				-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d7                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_bgra),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q8, d0, d4                     \n"  // R
			
 
				-    "vmlal.u8   q8, d1, d5                     \n"  // G
			
 
				-    "vmlal.u8   q8, d2, d6                     \n"  // B
			
 
				-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d7                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_abgr),  // %0
			
 
				-    "+r"(dst_y),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q8, d1, d4                     \n"  // B
			
 
				-    "vmlal.u8   q8, d2, d5                     \n"  // G
			
 
				-    "vmlal.u8   q8, d3, d6                     \n"  // R
			
 
				-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d7                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgba),  // %0
			
 
				-    "+r"(dst_y),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q8, d0, d4                     \n"  // B
			
 
				-    "vmlal.u8   q8, d1, d5                     \n"  // G
			
 
				-    "vmlal.u8   q8, d2, d6                     \n"  // R
			
 
				-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d7                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_rgb24),  // %0
			
 
				-    "+r"(dst_y),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
			
 
				-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
			
 
				-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
			
 
				-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q8, d0, d4                     \n"  // B
			
 
				-    "vmlal.u8   q8, d1, d5                     \n"  // G
			
 
				-    "vmlal.u8   q8, d2, d6                     \n"  // R
			
 
				-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
			
 
				-    "vqadd.u8   d0, d7                         \n"
			
 
				-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_raw),  // %0
			
 
				-    "+r"(dst_y),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-void InterpolateRow_NEON(uint8* dst_ptr,
			
 
				-                         const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                         int dst_width, int source_y_fraction) {
			
 
				-  asm volatile (
			
 
				-    "cmp        %4, #0                         \n"
			
 
				-    "beq        100f                           \n"
			
 
				-    "add        %2, %1                         \n"
			
 
				-    "cmp        %4, #64                        \n"
			
 
				-    "beq        75f                            \n"
			
 
				-    "cmp        %4, #128                       \n"
			
 
				-    "beq        50f                            \n"
			
 
				-    "cmp        %4, #192                       \n"
			
 
				-    "beq        25f                            \n"
			
 
				-
			
 
				-    "vdup.8     d5, %4                         \n"
			
 
				-    "rsb        %4, #256                       \n"
			
 
				-    "vdup.8     d4, %4                         \n"
			
 
				-    // General purpose row blend.
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"
			
 
				-    "vld1.8     {q1}, [%2]!                    \n"
			
 
				-    "subs       %3, %3, #16                    \n"
			
 
				-    "vmull.u8   q13, d0, d4                    \n"
			
 
				-    "vmull.u8   q14, d1, d4                    \n"
			
 
				-    "vmlal.u8   q13, d2, d5                    \n"
			
 
				-    "vmlal.u8   q14, d3, d5                    \n"
			
 
				-    "vrshrn.u16 d0, q13, #8                    \n"
			
 
				-    "vrshrn.u16 d1, q14, #8                    \n"
			
 
				-    "vst1.8     {q0}, [%0]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-    "b          99f                            \n"
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-  "25:                                         \n"
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"
			
 
				-    "vld1.8     {q1}, [%2]!                    \n"
			
 
				-    "subs       %3, %3, #16                    \n"
			
 
				-    "vrhadd.u8  q0, q1                         \n"
			
 
				-    "vrhadd.u8  q0, q1                         \n"
			
 
				-    "vst1.8     {q0}, [%0]!                    \n"
			
 
				-    "bgt        25b                            \n"
			
 
				-    "b          99f                            \n"
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-  "50:                                         \n"
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"
			
 
				-    "vld1.8     {q1}, [%2]!                    \n"
			
 
				-    "subs       %3, %3, #16                    \n"
			
 
				-    "vrhadd.u8  q0, q1                         \n"
			
 
				-    "vst1.8     {q0}, [%0]!                    \n"
			
 
				-    "bgt        50b                            \n"
			
 
				-    "b          99f                            \n"
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-  "75:                                         \n"
			
 
				-    "vld1.8     {q1}, [%1]!                    \n"
			
 
				-    "vld1.8     {q0}, [%2]!                    \n"
			
 
				-    "subs       %3, %3, #16                    \n"
			
 
				-    "vrhadd.u8  q0, q1                         \n"
			
 
				-    "vrhadd.u8  q0, q1                         \n"
			
 
				-    "vst1.8     {q0}, [%0]!                    \n"
			
 
				-    "bgt        75b                            \n"
			
 
				-    "b          99f                            \n"
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-  "100:                                        \n"
			
 
				-    "vld1.8     {q0}, [%1]!                    \n"
			
 
				-    "subs       %3, %3, #16                    \n"
			
 
				-    "vst1.8     {q0}, [%0]!                    \n"
			
 
				-    "bgt        100b                           \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(dst_ptr),          // %0
			
 
				-    "+r"(src_ptr),          // %1
			
 
				-    "+r"(src_stride),       // %2
			
 
				-    "+r"(dst_width),        // %3
			
 
				-    "+r"(source_y_fraction) // %4
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
			
 
				-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "subs       %3, #8                         \n"
			
 
				-    "blt        89f                            \n"
			
 
				-    // Blend 8 pixels.
			
 
				-  "8:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
			
 
				-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q10, d4, d3                    \n"  // db * a
			
 
				-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
			
 
				-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
			
 
				-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
			
 
				-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
			
 
				-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
			
 
				-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
			
 
				-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
			
 
				-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
			
 
				-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
			
 
				-    "vmov.u8    d3, #255                       \n"  // a = 255
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bge        8b                             \n"
			
 
				-
			
 
				-  "89:                                         \n"
			
 
				-    "adds       %3, #8-1                       \n"
			
 
				-    "blt        99f                            \n"
			
 
				-
			
 
				-    // Blend 1 pixels.
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
			
 
				-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
			
 
				-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
			
 
				-    "vmull.u8   q10, d4, d3                    \n"  // db * a
			
 
				-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
			
 
				-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
			
 
				-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
			
 
				-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
			
 
				-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
			
 
				-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
			
 
				-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
			
 
				-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
			
 
				-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
			
 
				-    "vmov.u8    d3, #255                       \n"  // a = 255
			
 
				-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
			
 
				-    "bge        1b                             \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-
			
 
				-  : "+r"(src_argb0),    // %0
			
 
				-    "+r"(src_argb1),    // %1
			
 
				-    "+r"(dst_argb),     // %2
			
 
				-    "+r"(width)         // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Attenuate 8 pixels at a time.
			
 
				-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // Attenuate 8 pixels.
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q10, d0, d3                    \n"  // b * a
			
 
				-    "vmull.u8   q11, d1, d3                    \n"  // g * a
			
 
				-    "vmull.u8   q12, d2, d3                    \n"  // r * a
			
 
				-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
			
 
				-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
			
 
				-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_argb),   // %1
			
 
				-    "+r"(width)       // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Quantize 8 ARGB pixels (32 bytes).
			
 
				-// dst = (dst * scale >> 16) * interval_size + interval_offset;
			
 
				-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width) {
			
 
				-  asm volatile (
			
 
				-    "vdup.u16   q8, %2                         \n"
			
 
				-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
			
 
				-    "vdup.u16   q9, %3                         \n"  // interval multiply.
			
 
				-    "vdup.u16   q10, %4                        \n"  // interval add
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
			
 
				-    "vmovl.u8   q1, d2                         \n"
			
 
				-    "vmovl.u8   q2, d4                         \n"
			
 
				-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
			
 
				-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
			
 
				-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
			
 
				-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
			
 
				-    "vmul.u16   q1, q1, q9                     \n"  // g
			
 
				-    "vmul.u16   q2, q2, q9                     \n"  // r
			
 
				-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
			
 
				-    "vadd.u16   q1, q1, q10                    \n"  // g
			
 
				-    "vadd.u16   q2, q2, q10                    \n"  // r
			
 
				-    "vqmovn.u16 d0, q0                         \n"
			
 
				-    "vqmovn.u16 d2, q1                         \n"
			
 
				-    "vqmovn.u16 d4, q2                         \n"
			
 
				-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(dst_argb),       // %0
			
 
				-    "+r"(width)           // %1
			
 
				-  : "r"(scale),           // %2
			
 
				-    "r"(interval_size),   // %3
			
 
				-    "r"(interval_offset)  // %4
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Shade 8 pixels at a time by specified value.
			
 
				-// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
			
 
				-// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
			
 
				-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                       uint32 value) {
			
 
				-  asm volatile (
			
 
				-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
			
 
				-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
			
 
				-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
			
 
				-    "vmovl.u8   q11, d22                       \n"
			
 
				-    "vmovl.u8   q12, d24                       \n"
			
 
				-    "vmovl.u8   q13, d26                       \n"
			
 
				-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
			
 
				-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
			
 
				-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
			
 
				-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
			
 
				-    "vqmovn.u16 d20, q10                       \n"
			
 
				-    "vqmovn.u16 d22, q11                       \n"
			
 
				-    "vqmovn.u16 d24, q12                       \n"
			
 
				-    "vqmovn.u16 d26, q13                       \n"
			
 
				-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),       // %0
			
 
				-    "+r"(dst_argb),       // %1
			
 
				-    "+r"(width)           // %2
			
 
				-  : "r"(value)            // %3
			
 
				-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
			
 
				-// Similar to ARGBToYJ but stores ARGB.
			
 
				-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
			
 
				-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
			
 
				-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
			
 
				-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q2, d0, d24                    \n"  // B
			
 
				-    "vmlal.u8   q2, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d26                    \n"  // R
			
 
				-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
			
 
				-    "vmov       d1, d0                         \n"  // G
			
 
				-    "vmov       d2, d0                         \n"  // R
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(width)      // %2
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
			
 
				-//    b = (r * 35 + g * 68 + b * 17) >> 7
			
 
				-//    g = (r * 45 + g * 88 + b * 22) >> 7
			
 
				-//    r = (r * 50 + g * 98 + b * 24) >> 7
			
 
				-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d20, #17                       \n"  // BB coefficient
			
 
				-    "vmov.u8    d21, #68                       \n"  // BG coefficient
			
 
				-    "vmov.u8    d22, #35                       \n"  // BR coefficient
			
 
				-    "vmov.u8    d24, #22                       \n"  // GB coefficient
			
 
				-    "vmov.u8    d25, #88                       \n"  // GG coefficient
			
 
				-    "vmov.u8    d26, #45                       \n"  // GR coefficient
			
 
				-    "vmov.u8    d28, #24                       \n"  // BB coefficient
			
 
				-    "vmov.u8    d29, #98                       \n"  // BG coefficient
			
 
				-    "vmov.u8    d30, #50                       \n"  // BR coefficient
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
			
 
				-    "vmlal.u8   q2, d1, d21                    \n"  // G
			
 
				-    "vmlal.u8   q2, d2, d22                    \n"  // R
			
 
				-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
			
 
				-    "vmlal.u8   q3, d1, d25                    \n"  // G
			
 
				-    "vmlal.u8   q3, d2, d26                    \n"  // R
			
 
				-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
			
 
				-    "vmlal.u8   q8, d1, d29                    \n"  // G
			
 
				-    "vmlal.u8   q8, d2, d30                    \n"  // R
			
 
				-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
			
 
				-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
			
 
				-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(dst_argb),  // %0
			
 
				-    "+r"(width)      // %1
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3",
			
 
				-    "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
			
 
				-// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
			
 
				-// needs to saturate.  Consider doing a non-saturating version.
			
 
				-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             const int8* matrix_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
			
 
				-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
			
 
				-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
			
 
				-
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
			
 
				-    "vmovl.u8   q9, d18                        \n"  // g
			
 
				-    "vmovl.u8   q10, d20                       \n"  // r
			
 
				-    "vmovl.u8   q15, d22                       \n"  // a
			
 
				-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
			
 
				-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
			
 
				-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
			
 
				-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
			
 
				-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
			
 
				-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
			
 
				-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
			
 
				-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
			
 
				-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
			
 
				-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
			
 
				-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
			
 
				-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
			
 
				-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
			
 
				-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
			
 
				-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
			
 
				-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
			
 
				-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
			
 
				-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
			
 
				-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
			
 
				-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
			
 
				-    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
			
 
				-    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
			
 
				-    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
			
 
				-    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
			
 
				-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
			
 
				-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
			
 
				-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
			
 
				-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
			
 
				-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
			
 
				-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
			
 
				-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
			
 
				-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
			
 
				-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_argb),   // %1
			
 
				-    "+r"(width)       // %2
			
 
				-  : "r"(matrix_argb)  // %3
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
			
 
				-    "q10", "q11", "q12", "q13", "q14", "q15"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_NEON
			
 
				-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
			
 
				-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
			
 
				-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
			
 
				-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
			
 
				-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
			
 
				-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
			
 
				-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
			
 
				-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
			
 
				-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBMULTIPLYROW_NEON
			
 
				-
			
 
				-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
			
 
				-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
			
 
				-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
			
 
				-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
			
 
				-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1", "q2", "q3"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel
			
 
				-// G = Sobel
			
 
				-// B = Sobel
			
 
				-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d3, #255                       \n"  // alpha
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
			
 
				-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vqadd.u8   d0, d0, d1                     \n"  // add
			
 
				-    "vmov.u8    d1, d0                         \n"
			
 
				-    "vmov.u8    d2, d0                         \n"
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_argb),    // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into plane.
			
 
				-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_y, int width) {
			
 
				-  asm volatile (
			
 
				-    // 16 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
			
 
				-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
			
 
				-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
			
 
				-    "vqadd.u8   q0, q0, q1                     \n"  // add
			
 
				-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_y),       // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel X
			
 
				-// G = Sobel
			
 
				-// B = Sobel Y
			
 
				-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "vmov.u8    d3, #255                       \n"  // alpha
			
 
				-    // 8 pixel loop.
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
			
 
				-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vqadd.u8   d1, d0, d2                     \n"  // add
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_argb),    // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "cc", "memory", "q0", "q1"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// SobelX as a matrix is
			
 
				-// -1  0  1
			
 
				-// -2  0  2
			
 
				-// -1  0  1
			
 
				-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d0}, [%0],%5                  \n"  // top
			
 
				-    "vld1.8     {d1}, [%0],%6                  \n"
			
 
				-    "vsubl.u8   q0, d0, d1                     \n"
			
 
				-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
			
 
				-    "vld1.8     {d3}, [%1],%6                  \n"
			
 
				-    "vsubl.u8   q1, d2, d3                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
			
 
				-    "vld1.8     {d3}, [%2],%6                  \n"
			
 
				-    "subs       %4, %4, #8                     \n"  // 8 pixels
			
 
				-    "vsubl.u8   q1, d2, d3                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vabs.s16   q0, q0                         \n"
			
 
				-    "vqmovn.u16 d0, q0                         \n"
			
 
				-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_y0),      // %0
			
 
				-    "+r"(src_y1),      // %1
			
 
				-    "+r"(src_y2),      // %2
			
 
				-    "+r"(dst_sobelx),  // %3
			
 
				-    "+r"(width)        // %4
			
 
				-  : "r"(2),            // %5
			
 
				-    "r"(6)             // %6
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// SobelY as a matrix is
			
 
				-// -1 -2 -1
			
 
				-//  0  0  0
			
 
				-//  1  2  1
			
 
				-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width) {
			
 
				-  asm volatile (
			
 
				-    ".p2align   2                              \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8     {d0}, [%0],%4                  \n"  // left
			
 
				-    "vld1.8     {d1}, [%1],%4                  \n"
			
 
				-    "vsubl.u8   q0, d0, d1                     \n"
			
 
				-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
			
 
				-    "vld1.8     {d3}, [%1],%4                  \n"
			
 
				-    "vsubl.u8   q1, d2, d3                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vld1.8     {d2}, [%0],%5                  \n"  // right
			
 
				-    "vld1.8     {d3}, [%1],%5                  \n"
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 pixels
			
 
				-    "vsubl.u8   q1, d2, d3                     \n"
			
 
				-    "vadd.s16   q0, q0, q1                     \n"
			
 
				-    "vabs.s16   q0, q0                         \n"
			
 
				-    "vqmovn.u16 d0, q0                         \n"
			
 
				-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_y0),      // %0
			
 
				-    "+r"(src_y1),      // %1
			
 
				-    "+r"(dst_sobely),  // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  : "r"(1),            // %4
			
 
				-    "r"(6)             // %5
			
 
				-  : "cc", "memory", "q0", "q1"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-#endif  // __ARM_NEON__
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_posix.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_posix.cc
@@ -1,6443 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// This module is for GCC x86 and x64.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
			
 
				-
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
			
 
				-
			
 
				-// Constants for ARGB
			
 
				-static vec8 kARGBToY = {
			
 
				-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
			
 
				-};
			
 
				-
			
 
				-// JPeg full range.
			
 
				-static vec8 kARGBToYJ = {
			
 
				-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
			
 
				-};
			
 
				-#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
			
 
				-
			
 
				-#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-
			
 
				-static vec8 kARGBToU = {
			
 
				-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kARGBToUJ = {
			
 
				-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kARGBToV = {
			
 
				-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
			
 
				-};
			
 
				-
			
 
				-static vec8 kARGBToVJ = {
			
 
				-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
			
 
				-};
			
 
				-
			
 
				-// Constants for BGRA
			
 
				-static vec8 kBGRAToY = {
			
 
				-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
			
 
				-};
			
 
				-
			
 
				-static vec8 kBGRAToU = {
			
 
				-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
			
 
				-};
			
 
				-
			
 
				-static vec8 kBGRAToV = {
			
 
				-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
			
 
				-};
			
 
				-
			
 
				-// Constants for ABGR
			
 
				-static vec8 kABGRToY = {
			
 
				-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kABGRToU = {
			
 
				-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kABGRToV = {
			
 
				-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
			
 
				-};
			
 
				-
			
 
				-// Constants for RGBA.
			
 
				-static vec8 kRGBAToY = {
			
 
				-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
			
 
				-};
			
 
				-
			
 
				-static vec8 kRGBAToU = {
			
 
				-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
			
 
				-};
			
 
				-
			
 
				-static vec8 kRGBAToV = {
			
 
				-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
			
 
				-};
			
 
				-
			
 
				-static uvec8 kAddY16 = {
			
 
				-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
			
 
				-};
			
 
				-
			
 
				-static vec16 kAddYJ64 = {
			
 
				-  64, 64, 64, 64, 64, 64, 64, 64
			
 
				-};
			
 
				-
			
 
				-static uvec8 kAddUV128 = {
			
 
				-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
			
 
				-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-static uvec16 kAddUVJ128 = {
			
 
				-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
			
 
				-};
			
 
				-#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-
			
 
				-#ifdef HAS_RGB24TOARGBROW_SSSE3
			
 
				-
			
 
				-// Shuffle table for converting RGB24 to ARGB.
			
 
				-static uvec8 kShuffleMaskRGB24ToARGB = {
			
 
				-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting RAW to ARGB.
			
 
				-static uvec8 kShuffleMaskRAWToARGB = {
			
 
				-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RGB24.
			
 
				-static uvec8 kShuffleMaskARGBToRGB24 = {
			
 
				-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RAW.
			
 
				-static uvec8 kShuffleMaskARGBToRAW = {
			
 
				-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
			
 
				-static uvec8 kShuffleMaskARGBToRGB24_0 = {
			
 
				-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RAW.
			
 
				-static uvec8 kShuffleMaskARGBToRAW_0 = {
			
 
				-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
			
 
				-};
			
 
				-#endif  // HAS_RGB24TOARGBROW_SSSE3
			
 
				-
			
 
				-#if defined(TESTING) && defined(__x86_64__)
			
 
				-void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    ".p2align  5                               \n"
			
 
				-    "mov       %%eax,%%eax                     \n"
			
 
				-    "mov       %%ebx,%%ebx                     \n"
			
 
				-    "mov       %%ecx,%%ecx                     \n"
			
 
				-    "mov       %%edx,%%edx                     \n"
			
 
				-    "mov       %%esi,%%esi                     \n"
			
 
				-    "mov       %%edi,%%edi                     \n"
			
 
				-    "mov       %%ebp,%%ebp                     \n"
			
 
				-    "mov       %%esp,%%esp                     \n"
			
 
				-    ".p2align  5                               \n"
			
 
				-    "mov       %%r8d,%%r8d                     \n"
			
 
				-    "mov       %%r9d,%%r9d                     \n"
			
 
				-    "mov       %%r10d,%%r10d                   \n"
			
 
				-    "mov       %%r11d,%%r11d                   \n"
			
 
				-    "mov       %%r12d,%%r12d                   \n"
			
 
				-    "mov       %%r13d,%%r13d                   \n"
			
 
				-    "mov       %%r14d,%%r14d                   \n"
			
 
				-    "mov       %%r15d,%%r15d                   \n"
			
 
				-    ".p2align  5                               \n"
			
 
				-    "lea       (%%rax),%%eax                   \n"
			
 
				-    "lea       (%%rbx),%%ebx                   \n"
			
 
				-    "lea       (%%rcx),%%ecx                   \n"
			
 
				-    "lea       (%%rdx),%%edx                   \n"
			
 
				-    "lea       (%%rsi),%%esi                   \n"
			
 
				-    "lea       (%%rdi),%%edi                   \n"
			
 
				-    "lea       (%%rbp),%%ebp                   \n"
			
 
				-    "lea       (%%rsp),%%esp                   \n"
			
 
				-    ".p2align  5                               \n"
			
 
				-    "lea       (%%r8),%%r8d                    \n"
			
 
				-    "lea       (%%r9),%%r9d                    \n"
			
 
				-    "lea       (%%r10),%%r10d                  \n"
			
 
				-    "lea       (%%r11),%%r11d                  \n"
			
 
				-    "lea       (%%r12),%%r12d                  \n"
			
 
				-    "lea       (%%r13),%%r13d                  \n"
			
 
				-    "lea       (%%r14),%%r14d                  \n"
			
 
				-    "lea       (%%r15),%%r15d                  \n"
			
 
				-
			
 
				-    ".p2align  5                               \n"
			
 
				-    "lea       0x10(%%rax),%%eax               \n"
			
 
				-    "lea       0x10(%%rbx),%%ebx               \n"
			
 
				-    "lea       0x10(%%rcx),%%ecx               \n"
			
 
				-    "lea       0x10(%%rdx),%%edx               \n"
			
 
				-    "lea       0x10(%%rsi),%%esi               \n"
			
 
				-    "lea       0x10(%%rdi),%%edi               \n"
			
 
				-    "lea       0x10(%%rbp),%%ebp               \n"
			
 
				-    "lea       0x10(%%rsp),%%esp               \n"
			
 
				-    ".p2align  5                               \n"
			
 
				-    "lea       0x10(%%r8),%%r8d                \n"
			
 
				-    "lea       0x10(%%r9),%%r9d                \n"
			
 
				-    "lea       0x10(%%r10),%%r10d              \n"
			
 
				-    "lea       0x10(%%r11),%%r11d              \n"
			
 
				-    "lea       0x10(%%r12),%%r12d              \n"
			
 
				-    "lea       0x10(%%r13),%%r13d              \n"
			
 
				-    "lea       0x10(%%r14),%%r14d              \n"
			
 
				-    "lea       0x10(%%r15),%%r15d              \n"
			
 
				-
			
 
				-    ".p2align  5                               \n"
			
 
				-    "add       0x10,%%eax                      \n"
			
 
				-    "add       0x10,%%ebx                      \n"
			
 
				-    "add       0x10,%%ecx                      \n"
			
 
				-    "add       0x10,%%edx                      \n"
			
 
				-    "add       0x10,%%esi                      \n"
			
 
				-    "add       0x10,%%edi                      \n"
			
 
				-    "add       0x10,%%ebp                      \n"
			
 
				-    "add       0x10,%%esp                      \n"
			
 
				-    ".p2align  5                               \n"
			
 
				-    "add       0x10,%%r8d                      \n"
			
 
				-    "add       0x10,%%r9d                      \n"
			
 
				-    "add       0x10,%%r10d                     \n"
			
 
				-    "add       0x10,%%r11d                     \n"
			
 
				-    "add       0x10,%%r12d                     \n"
			
 
				-    "add       0x10,%%r13d                     \n"
			
 
				-    "add       0x10,%%r14d                     \n"
			
 
				-    "add       0x10,%%r15d                     \n"
			
 
				-
			
 
				-    ".p2align  2                               \n"
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_y),     // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // TESTING
			
 
				-
			
 
				-#ifdef HAS_I400TOARGBROW_SSE2
			
 
				-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm1                   \n"
			
 
				-    "por       %%xmm5,%%xmm0                   \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_y),     // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
			
 
				-                                  int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm1                   \n"
			
 
				-    "por       %%xmm5,%%xmm0                   \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_y),     // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_I400TOARGBROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_RGB24TOARGBROW_SSSE3
			
 
				-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x30,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm3,%%xmm2                   \n"
			
 
				-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm2                   \n"
			
 
				-    "por       %%xmm5,%%xmm2                   \n"
			
 
				-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "por       %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm4,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "por       %%xmm5,%%xmm3                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_rgb24),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x30,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm3,%%xmm2                   \n"
			
 
				-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm2                   \n"
			
 
				-    "por       %%xmm5,%%xmm2                   \n"
			
 
				-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "por       %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm4,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
			
 
				-    "pshufb    %%xmm4,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "por       %%xmm5,%%xmm3                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_raw),   // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kShuffleMaskRAWToARGB)  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "mov       $0x1080108,%%eax                \n"
			
 
				-    "movd      %%eax,%%xmm5                    \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "mov       $0x20802080,%%eax               \n"
			
 
				-    "movd      %%eax,%%xmm6                    \n"
			
 
				-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
			
 
				-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
			
 
				-    "psllw     $0xb,%%xmm3                     \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "psllw     $0xa,%%xmm4                     \n"
			
 
				-    "psrlw     $0x5,%%xmm4                     \n"
			
 
				-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
			
 
				-    "psllw     $0x8,%%xmm7                     \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "pand      %%xmm3,%%xmm1                   \n"
			
 
				-    "psllw     $0xb,%%xmm2                     \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm1                   \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm2                   \n"
			
 
				-    "psllw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm2,%%xmm1                   \n"
			
 
				-    "pand      %%xmm4,%%xmm0                   \n"
			
 
				-    "pmulhuw   %%xmm6,%%xmm0                   \n"
			
 
				-    "por       %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm1                   \n"
			
 
				-    "punpckhbw %%xmm0,%%xmm2                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
			
 
				-    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc", "eax"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "mov       $0x1080108,%%eax                \n"
			
 
				-    "movd      %%eax,%%xmm5                    \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "mov       $0x42004200,%%eax               \n"
			
 
				-    "movd      %%eax,%%xmm6                    \n"
			
 
				-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
			
 
				-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
			
 
				-    "psllw     $0xb,%%xmm3                     \n"
			
 
				-    "movdqa    %%xmm3,%%xmm4                   \n"
			
 
				-    "psrlw     $0x6,%%xmm4                     \n"
			
 
				-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
			
 
				-    "psllw     $0x8,%%xmm7                     \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "psllw     $0x1,%%xmm1                     \n"
			
 
				-    "psllw     $0xb,%%xmm2                     \n"
			
 
				-    "pand      %%xmm3,%%xmm1                   \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm2                   \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm1                   \n"
			
 
				-    "psllw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "pand      %%xmm4,%%xmm0                   \n"
			
 
				-    "psraw     $0x8,%%xmm2                     \n"
			
 
				-    "pmulhuw   %%xmm6,%%xmm0                   \n"
			
 
				-    "pand      %%xmm7,%%xmm2                   \n"
			
 
				-    "por       %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm1                   \n"
			
 
				-    "punpckhbw %%xmm0,%%xmm2                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
			
 
				-    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc", "eax"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "mov       $0xf0f0f0f,%%eax                \n"
			
 
				-    "movd      %%eax,%%xmm4                    \n"
			
 
				-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
			
 
				-    "movdqa    %%xmm4,%%xmm5                   \n"
			
 
				-    "pslld     $0x4,%%xmm5                     \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "pand      %%xmm4,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm3                   \n"
			
 
				-    "psllw     $0x4,%%xmm1                     \n"
			
 
				-    "psrlw     $0x4,%%xmm3                     \n"
			
 
				-    "por       %%xmm1,%%xmm0                   \n"
			
 
				-    "por       %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm1                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
			
 
				-    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc", "eax"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm6                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "pshufb    %%xmm6,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm1                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm2                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm4                   \n"
			
 
				-    "psrldq    $0x4,%%xmm1                     \n"
			
 
				-    "pslldq    $0xc,%%xmm4                     \n"
			
 
				-    "movdqa    %%xmm2,%%xmm5                   \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pslldq    $0x8,%%xmm5                     \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "psrldq    $0x8,%%xmm2                     \n"
			
 
				-    "pslldq    $0x4,%%xmm3                     \n"
			
 
				-    "por       %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x30,1) ",%1           \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  : "m"(kShuffleMaskARGBToRGB24)  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm6                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "pshufb    %%xmm6,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm1                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm2                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm4                   \n"
			
 
				-    "psrldq    $0x4,%%xmm1                     \n"
			
 
				-    "pslldq    $0xc,%%xmm4                     \n"
			
 
				-    "movdqa    %%xmm2,%%xmm5                   \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pslldq    $0x8,%%xmm5                     \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "psrldq    $0x8,%%xmm2                     \n"
			
 
				-    "pslldq    $0x4,%%xmm3                     \n"
			
 
				-    "por       %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x30,1) ",%1           \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  : "m"(kShuffleMaskARGBToRAW)  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
			
 
				-    "psrld     $0x1b,%%xmm3                    \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "psrld     $0x1a,%%xmm4                    \n"
			
 
				-    "pslld     $0x5,%%xmm4                     \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pslld     $0xb,%%xmm5                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "pslld     $0x8,%%xmm0                     \n"
			
 
				-    "psrld     $0x3,%%xmm1                     \n"
			
 
				-    "psrld     $0x5,%%xmm2                     \n"
			
 
				-    "psrad     $0x10,%%xmm0                    \n"
			
 
				-    "pand      %%xmm3,%%xmm1                   \n"
			
 
				-    "pand      %%xmm4,%%xmm2                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "por       %%xmm2,%%xmm1                   \n"
			
 
				-    "por       %%xmm1,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm0,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "psrld     $0x1b,%%xmm4                    \n"
			
 
				-    "movdqa    %%xmm4,%%xmm5                   \n"
			
 
				-    "pslld     $0x5,%%xmm5                     \n"
			
 
				-    "movdqa    %%xmm4,%%xmm6                   \n"
			
 
				-    "pslld     $0xa,%%xmm6                     \n"
			
 
				-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
			
 
				-    "pslld     $0xf,%%xmm7                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm3                   \n"
			
 
				-    "psrad     $0x10,%%xmm0                    \n"
			
 
				-    "psrld     $0x3,%%xmm1                     \n"
			
 
				-    "psrld     $0x6,%%xmm2                     \n"
			
 
				-    "psrld     $0x9,%%xmm3                     \n"
			
 
				-    "pand      %%xmm7,%%xmm0                   \n"
			
 
				-    "pand      %%xmm4,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm2                   \n"
			
 
				-    "pand      %%xmm6,%%xmm3                   \n"
			
 
				-    "por       %%xmm1,%%xmm0                   \n"
			
 
				-    "por       %%xmm3,%%xmm2                   \n"
			
 
				-    "por       %%xmm2,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm0,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMACCESS2(0x8,1) ",%1        \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "psllw     $0xc,%%xmm4                     \n"
			
 
				-    "movdqa    %%xmm4,%%xmm3                   \n"
			
 
				-    "psrlw     $0x8,%%xmm3                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm3,%%xmm0                   \n"
			
 
				-    "pand      %%xmm4,%%xmm1                   \n"
			
 
				-    "psrlq     $0x4,%%xmm0                     \n"
			
 
				-    "psrlq     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm1,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(pix)   // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_RGB24TOARGBROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYROW_SSSE3
			
 
				-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kARGBToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kARGBToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOYROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYJROW_SSSE3
			
 
				-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kARGBToYJ),  // %3
			
 
				-    "m"(kAddYJ64)    // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kARGBToYJ),  // %3
			
 
				-    "m"(kAddYJ64)    // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOYJROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBTOUVROW_SSSE3
			
 
				-// TODO(fbarchard): pass xmm constants to single block of assembly.
			
 
				-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
			
 
				-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
			
 
				-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
			
 
				-// and considered unsafe.
			
 
				-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),  // %0
			
 
				-    "m"(kARGBToV),  // %1
			
 
				-    "m"(kAddUV128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
			
 
				-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
			
 
				-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_argb)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
			
 
				-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                        uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToUJ),  // %0
			
 
				-    "m"(kARGBToVJ),  // %1
			
 
				-    "m"(kAddUVJ128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
			
 
				-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
			
 
				-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_argb)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),         // %0
			
 
				-    "m"(kARGBToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm1                   \n"
			
 
				-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm6                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_argb)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                  uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToUJ),         // %0
			
 
				-    "m"(kARGBToVJ),         // %1
			
 
				-    "m"(kAddUVJ128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm1                   \n"
			
 
				-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm6                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_argb))
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
			
 
				-                          int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),  // %0
			
 
				-    "m"(kARGBToV),  // %1
			
 
				-    "m"(kAddUV128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm2                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm2                     \n"
			
 
				-    "packsswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm2                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm2                     \n"
			
 
				-    "packsswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),        // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
			
 
				-                                    uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),  // %0
			
 
				-    "m"(kARGBToV),  // %1
			
 
				-    "m"(kAddUV128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm2                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm2                     \n"
			
 
				-    "packsswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm2                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm2                     \n"
			
 
				-    "packsswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),        // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),  // %0
			
 
				-    "m"(kARGBToV),  // %1
			
 
				-    "m"(kAddUV128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
			
 
				-                                    uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kARGBToU),  // %0
			
 
				-    "m"(kARGBToV),  // %1
			
 
				-    "m"(kAddUV128)  // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_bgra),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kBGRAToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_bgra),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kBGRAToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kBGRAToU),         // %0
			
 
				-    "m"(kBGRAToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
			
 
				-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
			
 
				-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_bgra0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_bgra)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kBGRAToU),         // %0
			
 
				-    "m"(kBGRAToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm1                   \n"
			
 
				-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm6                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_bgra0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_bgra)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_abgr),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kABGRToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_abgr),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kABGRToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_rgba),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kRGBAToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm3                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm2                     \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_rgba),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "m"(kRGBAToY),   // %3
			
 
				-    "m"(kAddY16)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kABGRToU),         // %0
			
 
				-    "m"(kABGRToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
			
 
				-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
			
 
				-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_abgr0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_abgr)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kABGRToU),         // %0
			
 
				-    "m"(kABGRToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm1                   \n"
			
 
				-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm6                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_abgr0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_abgr)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kRGBAToU),         // %0
			
 
				-    "m"(kRGBAToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
			
 
				-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
			
 
				-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_rgba0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_rgba))
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %0,%%xmm4                       \n"
			
 
				-    "movdqa    %1,%%xmm3                       \n"
			
 
				-    "movdqa    %2,%%xmm5                       \n"
			
 
				-  :
			
 
				-  : "m"(kRGBAToU),         // %0
			
 
				-    "m"(kRGBAToV),         // %1
			
 
				-    "m"(kAddUV128)         // %2
			
 
				-  );
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm1                   \n"
			
 
				-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
			
 
				-    "pavgb     %%xmm7,%%xmm6                   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
			
 
				-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm7                   \n"
			
 
				-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
			
 
				-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
			
 
				-    "pavgb     %%xmm7,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm2                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm2,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm1                   \n"
			
 
				-    "psraw     $0x8,%%xmm0                     \n"
			
 
				-    "psraw     $0x8,%%xmm1                     \n"
			
 
				-    "packsswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "paddb     %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_rgba0),       // %0
			
 
				-    "+r"(dst_u),           // %1
			
 
				-    "+r"(dst_v),           // %2
			
 
				-    "+rm"(width)           // %3
			
 
				-  : "r"((intptr_t)(src_stride_rgba)) // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOUVROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_I422TOARGBROW_SSSE3
			
 
				-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
			
 
				-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
			
 
				-#define UR 0
			
 
				-
			
 
				-#define VB 0
			
 
				-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
			
 
				-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
			
 
				-
			
 
				-// Bias
			
 
				-#define BB UB * 128 + VB * 128
			
 
				-#define BG UG * 128 + VG * 128
			
 
				-#define BR UR * 128 + VR * 128
			
 
				-
			
 
				-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
			
 
				-
			
 
				-struct {
			
 
				-  vec8 kUVToB;  // 0
			
 
				-  vec8 kUVToG;  // 16
			
 
				-  vec8 kUVToR;  // 32
			
 
				-  vec16 kUVBiasB;  // 48
			
 
				-  vec16 kUVBiasG;  // 64
			
 
				-  vec16 kUVBiasR;  // 80
			
 
				-  vec16 kYSub16;  // 96
			
 
				-  vec16 kYToRgb;  // 112
			
 
				-  vec8 kVUToB;  // 128
			
 
				-  vec8 kVUToG;  // 144
			
 
				-  vec8 kVUToR;  // 160
			
 
				-} static SIMD_ALIGNED(kYuvConstants) = {
			
 
				-  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
			
 
				-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
			
 
				-  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
			
 
				-  { BB, BB, BB, BB, BB, BB, BB, BB },
			
 
				-  { BG, BG, BG, BG, BG, BG, BG, BG },
			
 
				-  { BR, BR, BR, BR, BR, BR, BR, BR },
			
 
				-  { 16, 16, 16, 16, 16, 16, 16, 16 },
			
 
				-  { YG, YG, YG, YG, YG, YG, YG, YG },
			
 
				-  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
			
 
				-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
			
 
				-  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
			
 
				-};
			
 
				-
			
 
				-
			
 
				-// Read 8 UV from 411
			
 
				-#define READYUV444                                                             \
			
 
				-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
			
 
				-    BUNDLEALIGN                                                                \
			
 
				-    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
			
 
				-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
			
 
				-    "punpcklbw  %%xmm1,%%xmm0                                   \n"
			
 
				-
			
 
				-// Read 4 UV from 422, upsample to 8 UV
			
 
				-#define READYUV422                                                             \
			
 
				-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
			
 
				-    BUNDLEALIGN                                                                \
			
 
				-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
			
 
				-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
			
 
				-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
			
 
				-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
			
 
				-
			
 
				-// Read 2 UV from 411, upsample to 8 UV
			
 
				-#define READYUV411                                                             \
			
 
				-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
			
 
				-    BUNDLEALIGN                                                                \
			
 
				-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
			
 
				-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
			
 
				-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
			
 
				-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
			
 
				-    "punpckldq  %%xmm0,%%xmm0                                   \n"
			
 
				-
			
 
				-// Read 4 UV from NV12, upsample to 8 UV
			
 
				-#define READNV12                                                               \
			
 
				-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
			
 
				-    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
			
 
				-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
			
 
				-
			
 
				-// Convert 8 pixels: 8 UV and 8 Y
			
 
				-#define YUVTORGB                                                               \
			
 
				-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
			
 
				-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
			
 
				-    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
			
 
				-    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
			
 
				-    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
			
 
				-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
			
 
				-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
			
 
				-    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
			
 
				-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
			
 
				-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm0                                   \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm1                                   \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm2                                   \n"            \
			
 
				-    "psraw      $0x6,%%xmm0                                     \n"            \
			
 
				-    "psraw      $0x6,%%xmm1                                     \n"            \
			
 
				-    "psraw      $0x6,%%xmm2                                     \n"            \
			
 
				-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
			
 
				-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
			
 
				-    "packuswb   %%xmm2,%%xmm2                                   \n"
			
 
				-
			
 
				-// Convert 8 pixels: 8 VU and 8 Y
			
 
				-#define YVUTORGB                                                               \
			
 
				-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
			
 
				-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
			
 
				-    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
			
 
				-    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
			
 
				-    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
			
 
				-    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
			
 
				-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
			
 
				-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
			
 
				-    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
			
 
				-    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
			
 
				-    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm0                                   \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm1                                   \n"            \
			
 
				-    "paddsw     %%xmm3,%%xmm2                                   \n"            \
			
 
				-    "psraw      $0x6,%%xmm0                                     \n"            \
			
 
				-    "psraw      $0x6,%%xmm1                                     \n"            \
			
 
				-    "psraw      $0x6,%%xmm2                                     \n"            \
			
 
				-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
			
 
				-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
			
 
				-    "packuswb   %%xmm2,%%xmm2                                   \n"
			
 
				-
			
 
				-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV444
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
			
 
				-                                 const uint8* u_buf,
			
 
				-                                 const uint8* v_buf,
			
 
				-                                 uint8* dst_rgb24,
			
 
				-                                 int width) {
			
 
				-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
			
 
				-#if defined(__i386__)
			
 
				-  asm volatile (
			
 
				-    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
			
 
				-    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
			
 
				-  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
			
 
				-    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
			
 
				-#endif
			
 
				-
			
 
				-  asm volatile (
			
 
				-#if !defined(__i386__)
			
 
				-    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
			
 
				-    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
			
 
				-#endif
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm1                   \n"
			
 
				-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
			
 
				-    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
			
 
				-    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
			
 
				-#if !defined(__i386__)
			
 
				-    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
			
 
				-    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
			
 
				-#endif
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
			
 
				-                               const uint8* u_buf,
			
 
				-                               const uint8* v_buf,
			
 
				-                               uint8* dst_raw,
			
 
				-                               int width) {
			
 
				-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
			
 
				-#if defined(__i386__)
			
 
				-  asm volatile (
			
 
				-    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
			
 
				-    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
			
 
				-  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
			
 
				-    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
			
 
				-#endif
			
 
				-
			
 
				-  asm volatile (
			
 
				-#if !defined(__i386__)
			
 
				-    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
			
 
				-    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
			
 
				-#endif
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm6,%%xmm1                   \n"
			
 
				-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
			
 
				-    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
			
 
				-    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
			
 
				-#if !defined(__i386__)
			
 
				-    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
			
 
				-    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
			
 
				-#endif
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV411
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* uv_buf,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-  // Does not use r14.
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* uv_buf,
			
 
				-                                uint8* dst_argb,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YVUTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-  // Does not use r14.
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_argb,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV444
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_argb,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_argb,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV411
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* uv_buf,
			
 
				-                                          uint8* dst_argb,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-  // Does not use r14.
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* uv_buf,
			
 
				-                                          uint8* dst_argb,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READNV12
			
 
				-    YVUTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
			
 
				-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-  // Does not use r14.
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_bgra,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm5                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_abgr,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm2                   \n"
			
 
				-    "punpckhwd %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
			
 
				-                                const uint8* u_buf,
			
 
				-                                const uint8* v_buf,
			
 
				-                                uint8* dst_rgba,
			
 
				-                                int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm5                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_bgra,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm5                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_abgr,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "punpcklbw %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm2                   \n"
			
 
				-    "punpckhwd %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                          const uint8* u_buf,
			
 
				-                                          const uint8* v_buf,
			
 
				-                                          uint8* dst_rgba,
			
 
				-                                          int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %[u_buf],%[v_buf]               \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm5                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
			
 
				-    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
			
 
				-    "sub       $0x8,%[width]                   \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : [y_buf]"+r"(y_buf),    // %[y_buf]
			
 
				-    [u_buf]"+r"(u_buf),    // %[u_buf]
			
 
				-    [v_buf]"+r"(v_buf),    // %[v_buf]
			
 
				-    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
			
 
				-    [width]"+rm"(width)    // %[width]
			
 
				-  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#endif  // HAS_I422TOARGBROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_YTOARGBROW_SSE2
			
 
				-void YToARGBRow_SSE2(const uint8* y_buf,
			
 
				-                     uint8* dst_argb,
			
 
				-                     int width) {
			
 
				-  asm volatile (
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "pslld     $0x18,%%xmm4                    \n"
			
 
				-    "mov       $0x00100010,%%eax               \n"
			
 
				-    "movd      %%eax,%%xmm3                    \n"
			
 
				-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
			
 
				-    "mov       $0x004a004a,%%eax               \n"
			
 
				-    "movd      %%eax,%%xmm2                    \n"
			
 
				-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "psubusw   %%xmm3,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm2,%%xmm0                   \n"
			
 
				-    "psrlw     $6, %%xmm0                      \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-
			
 
				-    // Step 2: Weave into ARGB
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm1                   \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "por       %%xmm4,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(y_buf),     // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+rm"(width)     // %2
			
 
				-  :
			
 
				-  : "memory", "cc", "eax"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_YTOARGBROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_SSSE3
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static uvec8 kShuffleMirror = {
			
 
				-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
			
 
				-};
			
 
				-
			
 
				-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
			
 
				-  intptr_t temp_width = (intptr_t)(width);
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm5                       \n"
			
 
				-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(temp_width)  // %2
			
 
				-  : "m"(kShuffleMirror) // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_SSE2
			
 
				-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  intptr_t temp_width = (intptr_t)(width);
			
 
				-  asm volatile (
			
 
				-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "psllw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm1,%%xmm0                   \n"
			
 
				-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1)",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(temp_width)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_UV_SSSE3
			
 
				-// Shuffle table for reversing the bytes of UV channels.
			
 
				-static uvec8 kShuffleMirrorUV = {
			
 
				-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
			
 
				-};
			
 
				-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
			
 
				-                       int width) {
			
 
				-  intptr_t temp_width = (intptr_t)(width);
			
 
				-  asm volatile (
			
 
				-    "movdqa    %4,%%xmm1                       \n"
			
 
				-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(-0x10,0) ",%0            \n"
			
 
				-    "pshufb    %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $8,%3                           \n"
			
 
				-    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),      // %0
			
 
				-    "+r"(dst_u),    // %1
			
 
				-    "+r"(dst_v),    // %2
			
 
				-    "+r"(temp_width)  // %3
			
 
				-  : "m"(kShuffleMirrorUV)  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_UV_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBMIRRORROW_SSSE3
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static uvec8 kARGBShuffleMirror = {
			
 
				-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
			
 
				-};
			
 
				-
			
 
				-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
			
 
				-  intptr_t temp_width = (intptr_t)(width);
			
 
				-  asm volatile (
			
 
				-    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
			
 
				-    "movdqa    %3,%%xmm5                       \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),  // %0
			
 
				-    "+r"(dst),  // %1
			
 
				-    "+r"(temp_width)  // %2
			
 
				-  : "m"(kARGBShuffleMirror)  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBMIRRORROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_SPLITUVROW_SSE2
			
 
				-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
			
 
				-    "psrlw      $0x8,%%xmm5                      \n"
			
 
				-    "sub        %1,%2                            \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
			
 
				-    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
			
 
				-    "lea        " MEMLEA(0x20,0) ",%0            \n"
			
 
				-    "movdqa     %%xmm0,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm1,%%xmm3                    \n"
			
 
				-    "pand       %%xmm5,%%xmm0                    \n"
			
 
				-    "pand       %%xmm5,%%xmm1                    \n"
			
 
				-    "packuswb   %%xmm1,%%xmm0                    \n"
			
 
				-    "psrlw      $0x8,%%xmm2                      \n"
			
 
				-    "psrlw      $0x8,%%xmm3                      \n"
			
 
				-    "packuswb   %%xmm3,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm0," MEMACCESS(1) "          \n"
			
 
				-    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
			
 
				-    "lea        " MEMLEA(0x10,1) ",%1            \n"
			
 
				-    "sub        $0x10,%3                         \n"
			
 
				-    "jg         1b                               \n"
			
 
				-  : "+r"(src_uv),     // %0
			
 
				-    "+r"(dst_u),      // %1
			
 
				-    "+r"(dst_v),      // %2
			
 
				-    "+r"(pix)         // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                               int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
			
 
				-    "psrlw      $0x8,%%xmm5                      \n"
			
 
				-    "sub        %1,%2                            \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
			
 
				-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
			
 
				-    "lea        " MEMLEA(0x20,0) ",%0            \n"
			
 
				-    "movdqa     %%xmm0,%%xmm2                    \n"
			
 
				-    "movdqa     %%xmm1,%%xmm3                    \n"
			
 
				-    "pand       %%xmm5,%%xmm0                    \n"
			
 
				-    "pand       %%xmm5,%%xmm1                    \n"
			
 
				-    "packuswb   %%xmm1,%%xmm0                    \n"
			
 
				-    "psrlw      $0x8,%%xmm2                      \n"
			
 
				-    "psrlw      $0x8,%%xmm3                      \n"
			
 
				-    "packuswb   %%xmm3,%%xmm2                    \n"
			
 
				-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
			
 
				-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
			
 
				-    "lea        " MEMLEA(0x10,1) ",%1            \n"
			
 
				-    "sub        $0x10,%3                         \n"
			
 
				-    "jg         1b                               \n"
			
 
				-  : "+r"(src_uv),     // %0
			
 
				-    "+r"(dst_u),      // %1
			
 
				-    "+r"(dst_v),      // %2
			
 
				-    "+r"(pix)         // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SPLITUVROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MERGEUVROW_SSE2
			
 
				-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                             \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
			
 
				-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0             \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                     \n"
			
 
				-    "punpcklbw %%xmm1,%%xmm0                     \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm2                     \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(2) "           \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
			
 
				-    "lea       " MEMLEA(0x20,2) ",%2             \n"
			
 
				-    "sub       $0x10,%3                          \n"
			
 
				-    "jg        1b                                \n"
			
 
				-  : "+r"(src_u),     // %0
			
 
				-    "+r"(src_v),     // %1
			
 
				-    "+r"(dst_uv),    // %2
			
 
				-    "+r"(width)      // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
			
 
				-                               uint8* dst_uv, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                             \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
			
 
				-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0             \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                     \n"
			
 
				-    "punpcklbw %%xmm1,%%xmm0                     \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm2                     \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
			
 
				-    "lea       " MEMLEA(0x20,2) ",%2             \n"
			
 
				-    "sub       $0x10,%3                          \n"
			
 
				-    "jg        1b                                \n"
			
 
				-  : "+r"(src_u),     // %0
			
 
				-    "+r"(src_v),     // %1
			
 
				-    "+r"(dst_uv),    // %2
			
 
				-    "+r"(width)      // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_MERGEUVROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_COPYROW_SSE2
			
 
				-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
			
 
				-  asm volatile (
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x20,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(count)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_COPYROW_X86
			
 
				-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
			
 
				-  size_t width_tmp = (size_t)(width);
			
 
				-  asm volatile (
			
 
				-    "shr       $0x2,%2                         \n"
			
 
				-    "rep movsl " MEMMOVESTRING(0,1) "          \n"
			
 
				-  : "+S"(src),  // %0
			
 
				-    "+D"(dst),  // %1
			
 
				-    "+c"(width_tmp) // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_X86
			
 
				-
			
 
				-#ifdef HAS_COPYROW_ERMS
			
 
				-// Unaligned Multiple of 1.
			
 
				-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
			
 
				-  size_t width_tmp = (size_t)(width);
			
 
				-  asm volatile (
			
 
				-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
			
 
				-  : "+S"(src),  // %0
			
 
				-    "+D"(dst),  // %1
			
 
				-    "+c"(width_tmp) // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_ERMS
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
			
 
				-// width in pixels
			
 
				-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
			
 
				-    "pslld     $0x18,%%xmm0                    \n"
			
 
				-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
			
 
				-    "psrld     $0x8,%%xmm1                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
			
 
				-    "pand      %%xmm0,%%xmm2                   \n"
			
 
				-    "pand      %%xmm0,%%xmm3                   \n"
			
 
				-    "pand      %%xmm1,%%xmm4                   \n"
			
 
				-    "pand      %%xmm1,%%xmm5                   \n"
			
 
				-    "por       %%xmm4,%%xmm2                   \n"
			
 
				-    "por       %%xmm5,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYALPHAROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
			
 
				-// width in pixels
			
 
				-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
			
 
				-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
			
 
				-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
			
 
				-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
			
 
				-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
			
 
				-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "vzeroupper                                \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYALPHAROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
			
 
				-// width in pixels
			
 
				-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
			
 
				-    "pslld     $0x18,%%xmm0                    \n"
			
 
				-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
			
 
				-    "psrld     $0x8,%%xmm1                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm2                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm2                   \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
			
 
				-    "pand      %%xmm0,%%xmm2                   \n"
			
 
				-    "pand      %%xmm0,%%xmm3                   \n"
			
 
				-    "pand      %%xmm1,%%xmm4                   \n"
			
 
				-    "pand      %%xmm1,%%xmm5                   \n"
			
 
				-    "por       %%xmm4,%%xmm2                   \n"
			
 
				-    "por       %%xmm5,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
			
 
				-// width in pixels
			
 
				-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  asm volatile (
			
 
				-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
			
 
				-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
			
 
				-    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
			
 
				-    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
			
 
				-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
			
 
				-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
			
 
				-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
			
 
				-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "vzeroupper                                \n"
			
 
				-  : "+r"(src),   // %0
			
 
				-    "+r"(dst),   // %1
			
 
				-    "+r"(width)  // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_SETROW_X86
			
 
				-void SetRow_X86(uint8* dst, uint32 v32, int width) {
			
 
				-  size_t width_tmp = (size_t)(width);
			
 
				-  asm volatile (
			
 
				-    "shr       $0x2,%1                         \n"
			
 
				-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
			
 
				-    : "+D"(dst),       // %0
			
 
				-      "+c"(width_tmp)  // %1
			
 
				-    : "a"(v32)         // %2
			
 
				-    : "memory", "cc");
			
 
				-}
			
 
				-
			
 
				-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
			
 
				-                   int dst_stride, int height) {
			
 
				-  for (int y = 0; y < height; ++y) {
			
 
				-    size_t width_tmp = (size_t)(width);
			
 
				-    uint32* d = (uint32*)(dst);
			
 
				-    asm volatile (
			
 
				-      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
			
 
				-      : "+D"(d),         // %0
			
 
				-        "+c"(width_tmp)  // %1
			
 
				-      : "a"(v32)         // %2
			
 
				-      : "memory", "cc");
			
 
				-    dst += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SETROW_X86
			
 
				-
			
 
				-#ifdef HAS_YUY2TOYROW_SSE2
			
 
				-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pavgb     %%xmm2,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  : "r"((intptr_t)(stride_yuy2))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                               uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                                int stride_yuy2,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pavgb     %%xmm2,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  : "r"((intptr_t)(stride_yuy2))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_yuy2),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pavgb     %%xmm2,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm3,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  : "r"((intptr_t)(stride_uyvy))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                               uint8* dst_y, int pix) {
			
 
				-  asm volatile (
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),  // %0
			
 
				-    "+r"(dst_y),     // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
			
 
				-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pavgb     %%xmm2,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm3,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  : "r"((intptr_t)(stride_uyvy))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrlw     $0x8,%%xmm5                     \n"
			
 
				-    "sub       %1,%2                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uyvy),    // %0
			
 
				-    "+r"(dst_u),       // %1
			
 
				-    "+r"(dst_v),       // %2
			
 
				-    "+r"(pix)          // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_YUY2TOYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBBLENDROW_SSE2
			
 
				-// Blend 8 pixels at a time.
			
 
				-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
			
 
				-    "psrlw     $0xf,%%xmm7                     \n"
			
 
				-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
			
 
				-    "psrlw     $0x8,%%xmm6                     \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psllw     $0x8,%%xmm5                     \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "pslld     $0x18,%%xmm4                    \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "je        91f                             \n"
			
 
				-    "jl        99f                             \n"
			
 
				-
			
 
				-    // 1 pixel loop until destination pointer is aligned.
			
 
				-  "10:                                         \n"
			
 
				-    "test      $0xf,%2                         \n"
			
 
				-    "je        19f                             \n"
			
 
				-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "psrlw     $0x8,%%xmm3                     \n"
			
 
				-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x4,1) ",%1            \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x4,2) ",%2            \n"
			
 
				-    "jge       10b                             \n"
			
 
				-
			
 
				-  "19:                                         \n"
			
 
				-    "add       $1-4,%3                         \n"
			
 
				-    "jl        49f                             \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "41:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "psrlw     $0x8,%%xmm3                     \n"
			
 
				-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jge       41b                             \n"
			
 
				-
			
 
				-  "49:                                         \n"
			
 
				-    "add       $0x3,%3                         \n"
			
 
				-    "jl        99f                             \n"
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-  "91:                                         \n"
			
 
				-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "psrlw     $0x8,%%xmm3                     \n"
			
 
				-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x4,1) ",%1            \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x4,2) ",%2            \n"
			
 
				-    "jge       91b                             \n"
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(src_argb0),    // %0
			
 
				-    "+r"(src_argb1),    // %1
			
 
				-    "+r"(dst_argb),     // %2
			
 
				-    "+r"(width)         // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBBLENDROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBBLENDROW_SSSE3
			
 
				-// Shuffle table for isolating alpha.
			
 
				-static uvec8 kShuffleAlpha = {
			
 
				-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
			
 
				-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
			
 
				-};
			
 
				-
			
 
				-// Blend 8 pixels at a time
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-
			
 
				-// Same as SSE2, but replaces
			
 
				-//    psrlw      xmm3, 8          // alpha
			
 
				-//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
			
 
				-//    pshuflw    xmm3, xmm3,0F5h
			
 
				-// with..
			
 
				-//    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-
			
 
				-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                        uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
			
 
				-    "psrlw     $0xf,%%xmm7                     \n"
			
 
				-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
			
 
				-    "psrlw     $0x8,%%xmm6                     \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psllw     $0x8,%%xmm5                     \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "pslld     $0x18,%%xmm4                    \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "je        91f                             \n"
			
 
				-    "jl        99f                             \n"
			
 
				-
			
 
				-    // 1 pixel loop until destination pointer is aligned.
			
 
				-  "10:                                         \n"
			
 
				-    "test      $0xf,%2                         \n"
			
 
				-    "je        19f                             \n"
			
 
				-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "pshufb    %4,%%xmm3                       \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x4,1) ",%1            \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x4,2) ",%2            \n"
			
 
				-    "jge       10b                             \n"
			
 
				-
			
 
				-  "19:                                         \n"
			
 
				-    "add       $1-4,%3                         \n"
			
 
				-    "jl        49f                             \n"
			
 
				-    "test      $0xf,%0                         \n"
			
 
				-    "jne       41f                             \n"
			
 
				-    "test      $0xf,%1                         \n"
			
 
				-    "jne       41f                             \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "40:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "pshufb    %4,%%xmm3                       \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jge       40b                             \n"
			
 
				-    "jmp       49f                             \n"
			
 
				-
			
 
				-    // 4 pixel unaligned loop.
			
 
				-    LABELALIGN
			
 
				-  "41:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "pshufb    %4,%%xmm3                       \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jge       41b                             \n"
			
 
				-
			
 
				-  "49:                                         \n"
			
 
				-    "add       $0x3,%3                         \n"
			
 
				-    "jl        99f                             \n"
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-  "91:                                         \n"
			
 
				-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "movdqa    %%xmm3,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm4,%%xmm3                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "pshufb    %4,%%xmm3                       \n"
			
 
				-    "pand      %%xmm6,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm7,%%xmm3                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm2                   \n"
			
 
				-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x4,1) ",%1            \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "por       %%xmm4,%%xmm0                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm2                     \n"
			
 
				-    "paddusb   %%xmm2,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x4,2) ",%2            \n"
			
 
				-    "jge       91b                             \n"
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(src_argb0),    // %0
			
 
				-    "+r"(src_argb1),    // %1
			
 
				-    "+r"(dst_argb),     // %2
			
 
				-    "+r"(width)         // %3
			
 
				-  : "m"(kShuffleAlpha)  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBBLENDROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSE2
			
 
				-// Attenuate 4 pixels at a time.
			
 
				-// aligned to 16 bytes
			
 
				-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "pslld     $0x18,%%xmm4                    \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrld     $0x8,%%xmm5                     \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
			
 
				-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm1                   \n"
			
 
				-    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
			
 
				-    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "pand      %%xmm4,%%xmm2                   \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "por       %%xmm2,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(width)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBATTENUATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSSE3
			
 
				-// Shuffle table duplicating alpha
			
 
				-static uvec8 kShuffleAlpha0 = {
			
 
				-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
			
 
				-};
			
 
				-static uvec8 kShuffleAlpha1 = {
			
 
				-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
			
 
				-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
			
 
				-};
			
 
				-// Attenuate 4 pixels at a time.
			
 
				-// aligned to 16 bytes
			
 
				-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
			
 
				-    "pslld     $0x18,%%xmm3                    \n"
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "pshufb    %%xmm4,%%xmm0                   \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "punpcklbw %%xmm1,%%xmm1                   \n"
			
 
				-    "pmulhuw   %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "pshufb    %%xmm5,%%xmm1                   \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm2                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "pand      %%xmm3,%%xmm2                   \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "por       %%xmm2,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(width)        // %2
			
 
				-  : "m"(kShuffleAlpha0),  // %3
			
 
				-    "m"(kShuffleAlpha1)  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBATTENUATEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-// Unattenuate 4 pixels at a time.
			
 
				-// aligned to 16 bytes
			
 
				-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             int width) {
			
 
				-  uintptr_t alpha = 0;
			
 
				-  asm volatile (
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
			
 
				-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
			
 
				-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
			
 
				-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
			
 
				-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
			
 
				-    "movlhps   %%xmm3,%%xmm2                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm1                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
			
 
				-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
			
 
				-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
			
 
				-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
			
 
				-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
			
 
				-    "movlhps   %%xmm3,%%xmm2                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm1                   \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(width),       // %2
			
 
				-    "+r"(alpha)        // %3
			
 
				-  : "r"(fixed_invtbl8)  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBGRAYROW_SSSE3
			
 
				-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
			
 
				-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %3,%%xmm4                       \n"
			
 
				-    "movdqa    %4,%%xmm5                       \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm5,%%xmm0                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrld     $0x18,%%xmm2                    \n"
			
 
				-    "psrld     $0x18,%%xmm3                    \n"
			
 
				-    "packuswb  %%xmm3,%%xmm2                   \n"
			
 
				-    "packuswb  %%xmm2,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm3                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm3,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm3,%%xmm1                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),   // %0
			
 
				-    "+r"(dst_argb),   // %1
			
 
				-    "+r"(width)       // %2
			
 
				-  : "m"(kARGBToYJ),   // %3
			
 
				-    "m"(kAddYJ64)     // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBGRAYROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBSEPIAROW_SSSE3
			
 
				-//    b = (r * 35 + g * 68 + b * 17) >> 7
			
 
				-//    g = (r * 45 + g * 88 + b * 22) >> 7
			
 
				-//    r = (r * 50 + g * 98 + b * 24) >> 7
			
 
				-// Constant for ARGB color to sepia tone
			
 
				-static vec8 kARGBToSepiaB = {
			
 
				-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kARGBToSepiaG = {
			
 
				-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
			
 
				-};
			
 
				-
			
 
				-static vec8 kARGBToSepiaR = {
			
 
				-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
			
 
				-};
			
 
				-
			
 
				-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
			
 
				-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    %2,%%xmm2                       \n"
			
 
				-    "movdqa    %3,%%xmm3                       \n"
			
 
				-    "movdqa    %4,%%xmm4                       \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
			
 
				-    "pmaddubsw %%xmm2,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm2,%%xmm6                   \n"
			
 
				-    "phaddw    %%xmm6,%%xmm0                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm5                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm5                   \n"
			
 
				-    "psrlw     $0x7,%%xmm5                     \n"
			
 
				-    "packuswb  %%xmm5,%%xmm5                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm5                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "phaddw    %%xmm1,%%xmm5                   \n"
			
 
				-    "psrlw     $0x7,%%xmm5                     \n"
			
 
				-    "packuswb  %%xmm5,%%xmm5                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "psrld     $0x18,%%xmm6                    \n"
			
 
				-    "psrld     $0x18,%%xmm1                    \n"
			
 
				-    "packuswb  %%xmm1,%%xmm6                   \n"
			
 
				-    "packuswb  %%xmm6,%%xmm6                   \n"
			
 
				-    "punpcklbw %%xmm6,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm5,%%xmm1                   \n"
			
 
				-    "sub       $0x8,%1                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(dst_argb),      // %0
			
 
				-    "+r"(width)          // %1
			
 
				-  : "m"(kARGBToSepiaB),  // %2
			
 
				-    "m"(kARGBToSepiaG),  // %3
			
 
				-    "m"(kARGBToSepiaR)   // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSEPIAROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
			
 
				-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
			
 
				-// Same as Sepia except matrix is provided.
			
 
				-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                              const int8* matrix_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
			
 
				-    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
			
 
				-    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
			
 
				-    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
			
 
				-    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
			
 
				-    "pmaddubsw %%xmm2,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm2,%%xmm7                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm1                   \n"
			
 
				-    "phaddsw   %%xmm7,%%xmm0                   \n"
			
 
				-    "phaddsw   %%xmm1,%%xmm6                   \n"
			
 
				-    "psraw     $0x6,%%xmm0                     \n"
			
 
				-    "psraw     $0x6,%%xmm6                     \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm6,%%xmm6                   \n"
			
 
				-    "punpcklbw %%xmm6,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm4,%%xmm7                   \n"
			
 
				-    "phaddsw   %%xmm7,%%xmm1                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm6                   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm7                   \n"
			
 
				-    "phaddsw   %%xmm7,%%xmm6                   \n"
			
 
				-    "psraw     $0x6,%%xmm1                     \n"
			
 
				-    "psraw     $0x6,%%xmm6                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm6,%%xmm6                   \n"
			
 
				-    "punpcklbw %%xmm6,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm6                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm6                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),      // %0
			
 
				-    "+r"(dst_argb),      // %1
			
 
				-    "+r"(width)          // %2
			
 
				-  : "r"(matrix_argb)     // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBQUANTIZEROW_SSE2
			
 
				-// Quantize 4 ARGB pixels (16 bytes).
			
 
				-// aligned to 16 bytes
			
 
				-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width) {
			
 
				-  asm volatile (
			
 
				-    "movd      %2,%%xmm2                       \n"
			
 
				-    "movd      %3,%%xmm3                       \n"
			
 
				-    "movd      %4,%%xmm4                       \n"
			
 
				-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
			
 
				-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
			
 
				-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
			
 
				-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
			
 
				-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
			
 
				-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
			
 
				-    "pslld     $0x18,%%xmm6                    \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm1                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm1                   \n"
			
 
				-    "pmullw    %%xmm3,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"
			
 
				-    "pmullw    %%xmm3,%%xmm1                   \n"
			
 
				-    "pand      %%xmm6,%%xmm7                   \n"
			
 
				-    "paddw     %%xmm4,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm4,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "por       %%xmm7,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%1                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(dst_argb),       // %0
			
 
				-    "+r"(width)           // %1
			
 
				-  : "r"(scale),           // %2
			
 
				-    "r"(interval_size),   // %3
			
 
				-    "r"(interval_offset)  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBQUANTIZEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBSHADEROW_SSE2
			
 
				-// Shade 4 pixels at a time by specified value.
			
 
				-// Aligned to 16 bytes.
			
 
				-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                       uint32 value) {
			
 
				-  asm volatile (
			
 
				-    "movd      %3,%%xmm2                       \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm2                   \n"
			
 
				-    "punpcklqdq %%xmm2,%%xmm2                  \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm1                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm0                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm1                   \n"
			
 
				-    "psrlw     $0x8,%%xmm0                     \n"
			
 
				-    "psrlw     $0x8,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(width)      // %2
			
 
				-  : "r"(value)       // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHADEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_SSE2
			
 
				-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
			
 
				-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "movdqu    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm3                   \n"
			
 
				-    "pmulhuw   %%xmm2,%%xmm0                   \n"
			
 
				-    "pmulhuw   %%xmm3,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBMULTIPLYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBADDROW_SSE2
			
 
				-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
			
 
				-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBADDROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_SSE2
			
 
				-// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
			
 
				-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "psubusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb0),  // %0
			
 
				-    "+r"(src_argb1),  // %1
			
 
				-    "+r"(dst_argb),   // %2
			
 
				-    "+r"(width)       // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSUBTRACTROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELXROW_SSE2
			
 
				-// SobelX as a matrix is
			
 
				-// -1  0  1
			
 
				-// -2  0  2
			
 
				-// -1  0  1
			
 
				-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "sub       %0,%2                           \n"
			
 
				-    "sub       %0,%3                           \n"
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm1,%%xmm0                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
			
 
				-    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
			
 
				-    "punpcklbw %%xmm5,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "psubw     %%xmm2,%%xmm1                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
			
 
				-    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm3                   \n"
			
 
				-    "psubw     %%xmm3,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm1,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm1,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm1,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm0,%%xmm1                   \n"
			
 
				-    "pmaxsw    %%xmm1,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "sub       $0x8,%4                         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_y0),      // %0
			
 
				-    "+r"(src_y1),      // %1
			
 
				-    "+r"(src_y2),      // %2
			
 
				-    "+r"(dst_sobelx),  // %3
			
 
				-    "+r"(width)        // %4
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SOBELXROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELYROW_SSE2
			
 
				-// SobelY as a matrix is
			
 
				-// -1 -2 -1
			
 
				-//  0  0  0
			
 
				-//  1  2  1
			
 
				-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "sub       %0,%2                           \n"
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm1,%%xmm0                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
			
 
				-    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
			
 
				-    "punpcklbw %%xmm5,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "psubw     %%xmm2,%%xmm1                   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
			
 
				-    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
			
 
				-    "punpcklbw %%xmm5,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm3                   \n"
			
 
				-    "psubw     %%xmm3,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm1,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm1,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm1,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm0,%%xmm1                   \n"
			
 
				-    "pmaxsw    %%xmm1,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "sub       $0x8,%3                         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_y0),      // %0
			
 
				-    "+r"(src_y1),      // %1
			
 
				-    "+r"(dst_sobely),  // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SOBELYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELROW_SSE2
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel
			
 
				-// G = Sobel
			
 
				-// B = Sobel
			
 
				-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                   uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm2                   \n"
			
 
				-    "punpckhbw %%xmm0,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm1                   \n"
			
 
				-    "punpcklwd %%xmm2,%%xmm1                   \n"
			
 
				-    "punpckhwd %%xmm2,%%xmm2                   \n"
			
 
				-    "por       %%xmm5,%%xmm1                   \n"
			
 
				-    "por       %%xmm5,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm3                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm3                   \n"
			
 
				-    "punpckhwd %%xmm0,%%xmm0                   \n"
			
 
				-    "por       %%xmm5,%%xmm3                   \n"
			
 
				-    "por       %%xmm5,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS(2) "         \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_argb),    // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SOBELROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELTOPLANEROW_SSE2
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
			
 
				-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_y, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "pslld     $0x18,%%xmm5                    \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "paddusb   %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_y),       // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SOBELTOPLANEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELXYROW_SSE2
			
 
				-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel X
			
 
				-// G = Sobel
			
 
				-// B = Sobel Y
			
 
				-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-    // 8 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm2                   \n"
			
 
				-    "paddusb   %%xmm1,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm3                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm4                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm4                   \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm4,%%xmm6                   \n"
			
 
				-    "punpcklwd %%xmm3,%%xmm6                   \n"
			
 
				-    "punpckhwd %%xmm3,%%xmm4                   \n"
			
 
				-    "movdqa    %%xmm1,%%xmm7                   \n"
			
 
				-    "punpcklwd %%xmm0,%%xmm7                   \n"
			
 
				-    "punpckhwd %%xmm0,%%xmm1                   \n"
			
 
				-    "sub       $0x10,%3                        \n"
			
 
				-    "movdqa    %%xmm6," MEMACCESS(2) "         \n"
			
 
				-    "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
			
 
				-    "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,2) ",%2           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_sobelx),  // %0
			
 
				-    "+r"(src_sobely),  // %1
			
 
				-    "+r"(dst_argb),    // %2
			
 
				-    "+r"(width)        // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_SOBELXYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
			
 
				-// Creates a table of cumulative sums where each value is a sum of all values
			
 
				-// above and to the left of the value, inclusive of the value.
			
 
				-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
			
 
				-                                  const int32* previous_cumsum, int width) {
			
 
				-  asm volatile (
			
 
				-    "pxor      %%xmm0,%%xmm0                   \n"
			
 
				-    "pxor      %%xmm1,%%xmm1                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "jl        49f                             \n"
			
 
				-    "test      $0xf,%1                         \n"
			
 
				-    "jne       49f                             \n"
			
 
				-
			
 
				-  // 4 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "40:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm2,%%xmm4                   \n"
			
 
				-    "punpcklbw %%xmm1,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm2                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm3                   \n"
			
 
				-    "punpckhbw %%xmm1,%%xmm4                   \n"
			
 
				-    "movdqa    %%xmm4,%%xmm5                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm4                   \n"
			
 
				-    "punpckhwd %%xmm1,%%xmm5                   \n"
			
 
				-    "paddd     %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
			
 
				-    "paddd     %%xmm0,%%xmm2                   \n"
			
 
				-    "paddd     %%xmm3,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
			
 
				-    "paddd     %%xmm0,%%xmm3                   \n"
			
 
				-    "paddd     %%xmm4,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
			
 
				-    "paddd     %%xmm0,%%xmm4                   \n"
			
 
				-    "paddd     %%xmm5,%%xmm0                   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
			
 
				-    "lea       " MEMLEA(0x40,2) ",%2           \n"
			
 
				-    "paddd     %%xmm0,%%xmm5                   \n"
			
 
				-    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "jge       40b                             \n"
			
 
				-
			
 
				-  "49:                                         \n"
			
 
				-    "add       $0x3,%3                         \n"
			
 
				-    "jl        19f                             \n"
			
 
				-
			
 
				-  // 1 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "10:                                         \n"
			
 
				-    "movd      " MEMACCESS(0) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm1,%%xmm2                   \n"
			
 
				-    "punpcklwd %%xmm1,%%xmm2                   \n"
			
 
				-    "paddd     %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "paddd     %%xmm0,%%xmm2                   \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "jge       10b                             \n"
			
 
				-
			
 
				-  "19:                                         \n"
			
 
				-  : "+r"(row),  // %0
			
 
				-    "+r"(cumsum),  // %1
			
 
				-    "+r"(previous_cumsum),  // %2
			
 
				-    "+r"(width)  // %3
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
			
 
				-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
			
 
				-                                    int width, int area, uint8* dst,
			
 
				-                                    int count) {
			
 
				-  asm volatile (
			
 
				-    "movd      %5,%%xmm5                       \n"
			
 
				-    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
			
 
				-    "rcpss     %%xmm5,%%xmm4                   \n"
			
 
				-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "jl        49f                             \n"
			
 
				-    "cmpl      $0x80,%5                        \n"
			
 
				-    "ja        40f                             \n"
			
 
				-
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
			
 
				-    "psrld     $0x10,%%xmm6                    \n"
			
 
				-    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
			
 
				-    "addps     %%xmm6,%%xmm5                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm5                   \n"
			
 
				-    "cvtps2dq  %%xmm5,%%xmm5                   \n"
			
 
				-    "packssdw  %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-  // 4 pixel small loop                        \n"
			
 
				-    LABELALIGN
			
 
				-  "4:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
			
 
				-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
			
 
				-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
			
 
				-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
			
 
				-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
			
 
				-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
			
 
				-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
			
 
				-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
			
 
				-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "packssdw  %%xmm1,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm3,%%xmm2                   \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm0                   \n"
			
 
				-    "pmulhuw   %%xmm5,%%xmm2                   \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "jge       4b                              \n"
			
 
				-    "jmp       49f                             \n"
			
 
				-
			
 
				-  // 4 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "40:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
			
 
				-    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
			
 
				-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
			
 
				-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
			
 
				-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
			
 
				-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
			
 
				-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
			
 
				-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
			
 
				-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
			
 
				-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
			
 
				-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm0                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm1                   \n"
			
 
				-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
			
 
				-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm2                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm3                   \n"
			
 
				-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
			
 
				-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
			
 
				-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
			
 
				-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
			
 
				-    "packssdw  %%xmm1,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm3,%%xmm2                   \n"
			
 
				-    "packuswb  %%xmm2,%%xmm0                   \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "jge       40b                             \n"
			
 
				-
			
 
				-  "49:                                         \n"
			
 
				-    "add       $0x3,%3                         \n"
			
 
				-    "jl        19f                             \n"
			
 
				-
			
 
				-  // 1 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "10:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
			
 
				-    "mulps     %%xmm4,%%xmm0                   \n"
			
 
				-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm0,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x4,2) ",%2            \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "jge       10b                             \n"
			
 
				-  "19:                                         \n"
			
 
				-  : "+r"(topleft),  // %0
			
 
				-    "+r"(botleft),  // %1
			
 
				-    "+r"(dst),      // %2
			
 
				-    "+rm"(count)    // %3
			
 
				-  : "r"((intptr_t)(width)),  // %4
			
 
				-    "rm"(area)     // %5
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBAFFINEROW_SSE2
			
 
				-// Copy ARGB pixels from source image with slope to a row of destination.
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
			
 
				-                        uint8* dst_argb, const float* src_dudv, int width) {
			
 
				-  intptr_t src_argb_stride_temp = src_argb_stride;
			
 
				-  intptr_t temp = 0;
			
 
				-  asm volatile (
			
 
				-    "movq      " MEMACCESS(3) ",%%xmm2         \n"
			
 
				-    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
			
 
				-    "shl       $0x10,%1                        \n"
			
 
				-    "add       $0x4,%1                         \n"
			
 
				-    "movd      %1,%%xmm5                       \n"
			
 
				-    "sub       $0x4,%4                         \n"
			
 
				-    "jl        49f                             \n"
			
 
				-
			
 
				-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "movdqa    %%xmm2,%%xmm0                   \n"
			
 
				-    "addps     %%xmm7,%%xmm0                   \n"
			
 
				-    "movlhps   %%xmm0,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm7,%%xmm4                   \n"
			
 
				-    "addps     %%xmm4,%%xmm4                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm3                   \n"
			
 
				-    "addps     %%xmm4,%%xmm3                   \n"
			
 
				-    "addps     %%xmm4,%%xmm4                   \n"
			
 
				-
			
 
				-  // 4 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "40:                                         \n"
			
 
				-    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
			
 
				-    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
			
 
				-    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
			
 
				-    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
			
 
				-    "movd      %%xmm0,%k1                      \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-    "movd      %%xmm0,%k5                      \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
			
 
				-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
			
 
				-    "punpckldq %%xmm6,%%xmm1                   \n"
			
 
				-    "addps     %%xmm4,%%xmm2                   \n"
			
 
				-    "movq      %%xmm1," MEMACCESS(2) "         \n"
			
 
				-    "movd      %%xmm0,%k1                      \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-    "movd      %%xmm0,%k5                      \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
			
 
				-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
			
 
				-    "punpckldq %%xmm6,%%xmm0                   \n"
			
 
				-    "addps     %%xmm4,%%xmm3                   \n"
			
 
				-    "sub       $0x4,%4                         \n"
			
 
				-    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "jge       40b                             \n"
			
 
				-
			
 
				-  "49:                                         \n"
			
 
				-    "add       $0x3,%4                         \n"
			
 
				-    "jl        19f                             \n"
			
 
				-
			
 
				-  // 1 pixel loop                              \n"
			
 
				-    LABELALIGN
			
 
				-  "10:                                         \n"
			
 
				-    "cvttps2dq %%xmm2,%%xmm0                   \n"
			
 
				-    "packssdw  %%xmm0,%%xmm0                   \n"
			
 
				-    "pmaddwd   %%xmm5,%%xmm0                   \n"
			
 
				-    "addps     %%xmm7,%%xmm2                   \n"
			
 
				-    "movd      %%xmm0,%k1                      \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
			
 
				-    "sub       $0x1,%4                         \n"
			
 
				-    "movd      %%xmm0," MEMACCESS(2) "         \n"
			
 
				-    "lea       " MEMLEA(0x04,2) ",%2           \n"
			
 
				-    "jge       10b                             \n"
			
 
				-  "19:                                         \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(src_argb_stride_temp),  // %1
			
 
				-    "+r"(dst_argb),  // %2
			
 
				-    "+r"(src_dudv),  // %3
			
 
				-    "+rm"(width),    // %4
			
 
				-    "+r"(temp)   // %5
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBAFFINEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSSE3
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                          ptrdiff_t src_stride, int dst_width,
			
 
				-                          int source_y_fraction) {
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%0                           \n"
			
 
				-    "shr       %3                              \n"
			
 
				-    "cmp       $0x0,%3                         \n"
			
 
				-    "je        100f                            \n"
			
 
				-    "cmp       $0x20,%3                        \n"
			
 
				-    "je        75f                             \n"
			
 
				-    "cmp       $0x40,%3                        \n"
			
 
				-    "je        50f                             \n"
			
 
				-    "cmp       $0x60,%3                        \n"
			
 
				-    "je        25f                             \n"
			
 
				-
			
 
				-    "movd      %3,%%xmm0                       \n"
			
 
				-    "neg       %3                              \n"
			
 
				-    "add       $0x80,%3                        \n"
			
 
				-    "movd      %3,%%xmm5                       \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm5                   \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-
			
 
				-    // General purpose row blend.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm2)
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm1                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    LABELALIGN
			
 
				-  "25:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        25b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    LABELALIGN
			
 
				-  "50:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        50b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    LABELALIGN
			
 
				-  "75:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        75b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    LABELALIGN
			
 
				-  "100:                                        \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        100b                            \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(dst_ptr),    // %0
			
 
				-    "+r"(src_ptr),    // %1
			
 
				-    "+r"(dst_width),  // %2
			
 
				-    "+r"(source_y_fraction)  // %3
			
 
				-  : "r"((intptr_t)(src_stride))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSE2
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride, int dst_width,
			
 
				-                         int source_y_fraction) {
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%0                           \n"
			
 
				-    "shr       %3                              \n"
			
 
				-    "cmp       $0x0,%3                         \n"
			
 
				-    "je        100f                            \n"
			
 
				-    "cmp       $0x20,%3                        \n"
			
 
				-    "je        75f                             \n"
			
 
				-    "cmp       $0x40,%3                        \n"
			
 
				-    "je        50f                             \n"
			
 
				-    "cmp       $0x60,%3                        \n"
			
 
				-    "je        25f                             \n"
			
 
				-
			
 
				-    "movd      %3,%%xmm0                       \n"
			
 
				-    "neg       %3                              \n"
			
 
				-    "add       $0x80,%3                        \n"
			
 
				-    "movd      %3,%%xmm5                       \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm5                   \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-
			
 
				-    // General purpose row blend.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm4,%%xmm2                   \n"
			
 
				-    "punpckhbw %%xmm4,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm4,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm4,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm0,%%xmm2                   \n"
			
 
				-    "psubw     %%xmm1,%%xmm3                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm3,%%xmm3                   \n"
			
 
				-    "pmulhw    %%xmm5,%%xmm2                   \n"
			
 
				-    "pmulhw    %%xmm5,%%xmm3                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm3,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    LABELALIGN
			
 
				-  "25:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        25b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    LABELALIGN
			
 
				-  "50:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        50b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    LABELALIGN
			
 
				-  "75:                                         \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        75b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    LABELALIGN
			
 
				-  "100:                                        \n"
			
 
				-    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        100b                            \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(dst_ptr),    // %0
			
 
				-    "+r"(src_ptr),    // %1
			
 
				-    "+r"(dst_width),  // %2
			
 
				-    "+r"(source_y_fraction)  // %3
			
 
				-  : "r"((intptr_t)(src_stride))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSSE3
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                    ptrdiff_t src_stride, int dst_width,
			
 
				-                                    int source_y_fraction) {
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%0                           \n"
			
 
				-    "shr       %3                              \n"
			
 
				-    "cmp       $0x0,%3                         \n"
			
 
				-    "je        100f                            \n"
			
 
				-    "cmp       $0x20,%3                        \n"
			
 
				-    "je        75f                             \n"
			
 
				-    "cmp       $0x40,%3                        \n"
			
 
				-    "je        50f                             \n"
			
 
				-    "cmp       $0x60,%3                        \n"
			
 
				-    "je        25f                             \n"
			
 
				-
			
 
				-    "movd      %3,%%xmm0                       \n"
			
 
				-    "neg       %3                              \n"
			
 
				-    "add       $0x80,%3                        \n"
			
 
				-    "movd      %3,%%xmm5                       \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm5                   \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-
			
 
				-    // General purpose row blend.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
			
 
				-    "movdqu    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm1                   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm0                   \n"
			
 
				-    "pmaddubsw %%xmm5,%%xmm1                   \n"
			
 
				-    "psrlw     $0x7,%%xmm0                     \n"
			
 
				-    "psrlw     $0x7,%%xmm1                     \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    LABELALIGN
			
 
				-  "25:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        25b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    LABELALIGN
			
 
				-  "50:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        50b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    LABELALIGN
			
 
				-  "75:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        75b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    LABELALIGN
			
 
				-  "100:                                        \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        100b                            \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(dst_ptr),    // %0
			
 
				-    "+r"(src_ptr),    // %1
			
 
				-    "+r"(dst_width),  // %2
			
 
				-    "+r"(source_y_fraction)  // %3
			
 
				-  : "r"((intptr_t)(src_stride))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif   // HAS_INTERPOLATEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSE2
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                   ptrdiff_t src_stride, int dst_width,
			
 
				-                                   int source_y_fraction) {
			
 
				-  asm volatile (
			
 
				-    "sub       %1,%0                           \n"
			
 
				-    "shr       %3                              \n"
			
 
				-    "cmp       $0x0,%3                         \n"
			
 
				-    "je        100f                            \n"
			
 
				-    "cmp       $0x20,%3                        \n"
			
 
				-    "je        75f                             \n"
			
 
				-    "cmp       $0x40,%3                        \n"
			
 
				-    "je        50f                             \n"
			
 
				-    "cmp       $0x60,%3                        \n"
			
 
				-    "je        25f                             \n"
			
 
				-
			
 
				-    "movd      %3,%%xmm0                       \n"
			
 
				-    "neg       %3                              \n"
			
 
				-    "add       $0x80,%3                        \n"
			
 
				-    "movd      %3,%%xmm5                       \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm5                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm5                   \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    "pxor      %%xmm4,%%xmm4                   \n"
			
 
				-
			
 
				-    // General purpose row blend.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
			
 
				-    "movdqu    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqu    %%xmm2,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm4,%%xmm2                   \n"
			
 
				-    "punpckhbw %%xmm4,%%xmm3                   \n"
			
 
				-    "punpcklbw %%xmm4,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm4,%%xmm1                   \n"
			
 
				-    "psubw     %%xmm0,%%xmm2                   \n"
			
 
				-    "psubw     %%xmm1,%%xmm3                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm2                   \n"
			
 
				-    "paddw     %%xmm3,%%xmm3                   \n"
			
 
				-    "pmulhw    %%xmm5,%%xmm2                   \n"
			
 
				-    "pmulhw    %%xmm5,%%xmm3                   \n"
			
 
				-    "paddw     %%xmm2,%%xmm0                   \n"
			
 
				-    "paddw     %%xmm3,%%xmm1                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    LABELALIGN
			
 
				-  "25:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        25b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    LABELALIGN
			
 
				-  "50:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        50b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    LABELALIGN
			
 
				-  "75:                                         \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
			
 
				-    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "pavgb     %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    BUNDLEALIGN
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        75b                             \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    LABELALIGN
			
 
				-  "100:                                        \n"
			
 
				-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        100b                            \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(dst_ptr),    // %0
			
 
				-    "+r"(src_ptr),    // %1
			
 
				-    "+r"(dst_width),  // %2
			
 
				-    "+r"(source_y_fraction)  // %3
			
 
				-  : "r"((intptr_t)(src_stride))  // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_HALFROW_SSE2
			
 
				-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix) {
			
 
				-  asm volatile (
			
 
				-    "sub       %0,%1                           \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_uv),  // %0
			
 
				-    "+r"(dst_uv),  // %1
			
 
				-    "+r"(pix)      // %2
			
 
				-  : "r"((intptr_t)(src_uv_stride))  // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-      , "xmm0"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_HALFROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBTOBAYERROW_SSSE3
			
 
				-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                          uint32 selector, int pix) {
			
 
				-  asm volatile (
			
 
				-    // NaCL caveat - assumes movd is from GPR
			
 
				-    "movd      %3,%%xmm5                       \n"
			
 
				-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm5,%%xmm1                   \n"
			
 
				-    "punpckldq %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_bayer), // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "g"(selector)    // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOBAYERROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBTOBAYERGGROW_SSE2
			
 
				-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                           uint32 selector, int pix) {
			
 
				-  asm volatile (
			
 
				-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
			
 
				-    "psrld     $0x18,%%xmm5                    \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "psrld     $0x8,%%xmm0                     \n"
			
 
				-    "psrld     $0x8,%%xmm1                     \n"
			
 
				-    "pand      %%xmm5,%%xmm0                   \n"
			
 
				-    "pand      %%xmm5,%%xmm1                   \n"
			
 
				-    "packssdw  %%xmm1,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_bayer), // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  :
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOBAYERGGROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_SSSE3
			
 
				-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
			
 
				-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                          const uint8* shuffler, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm5,%%xmm1                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "r"(shuffler)    // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                    const uint8* shuffler, int pix) {
			
 
				-  asm volatile (
			
 
				-    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
			
 
				-    "lea       " MEMLEA(0x20,0) ",%0           \n"
			
 
				-    "pshufb    %%xmm5,%%xmm0                   \n"
			
 
				-    "pshufb    %%xmm5,%%xmm1                   \n"
			
 
				-    "sub       $0x8,%2                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x20,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "r"(shuffler)    // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHUFFLEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_AVX2
			
 
				-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
			
 
				-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) {
			
 
				-  asm volatile (
			
 
				-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
			
 
				-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
			
 
				-    "lea       " MEMLEA(0x40,0) ",%0           \n"
			
 
				-    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
			
 
				-    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
			
 
				-    "sub       $0x10,%2                        \n"
			
 
				-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
			
 
				-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
			
 
				-    "lea       " MEMLEA(0x40,1) ",%1           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(pix)        // %2
			
 
				-  : "r"(shuffler)    // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHUFFLEROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_SSE2
			
 
				-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
			
 
				-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) {
			
 
				-  uintptr_t pixel_temp = 0u;
			
 
				-  asm volatile (
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-    "mov       " MEMACCESS(4) ",%k2            \n"
			
 
				-    "cmp       $0x3000102,%k2                  \n"
			
 
				-    "je        3012f                           \n"
			
 
				-    "cmp       $0x10203,%k2                    \n"
			
 
				-    "je        123f                            \n"
			
 
				-    "cmp       $0x30201,%k2                    \n"
			
 
				-    "je        321f                            \n"
			
 
				-    "cmp       $0x2010003,%k2                  \n"
			
 
				-    "je        2103f                           \n"
			
 
				-
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movzb     " MEMACCESS(4) ",%2             \n"
			
 
				-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
			
 
				-    "mov       %b2," MEMACCESS(1) "            \n"
			
 
				-    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
			
 
				-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
			
 
				-    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
			
 
				-    BUNDLEALIGN
			
 
				-    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
			
 
				-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
			
 
				-    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
			
 
				-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
			
 
				-    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    "lea       " MEMLEA(0x4,1) ",%1            \n"
			
 
				-    "sub       $0x1,%3                         \n"
			
 
				-    "jg        1b                              \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    LABELALIGN
			
 
				-  "123:                                        \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm1                   \n"
			
 
				-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
			
 
				-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
			
 
				-    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        123b                            \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    LABELALIGN
			
 
				-  "321:                                        \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm1                   \n"
			
 
				-    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
			
 
				-    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
			
 
				-    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        321b                            \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    LABELALIGN
			
 
				-  "2103:                                       \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm1                   \n"
			
 
				-    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
			
 
				-    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
			
 
				-    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        2103b                           \n"
			
 
				-    "jmp       99f                             \n"
			
 
				-
			
 
				-    LABELALIGN
			
 
				-  "3012:                                       \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0           \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "punpcklbw %%xmm5,%%xmm0                   \n"
			
 
				-    "punpckhbw %%xmm5,%%xmm1                   \n"
			
 
				-    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
			
 
				-    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
			
 
				-    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
			
 
				-    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
			
 
				-    "packuswb  %%xmm1,%%xmm0                   \n"
			
 
				-    "sub       $0x4,%3                         \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x10,1) ",%1           \n"
			
 
				-    "jg        3012b                           \n"
			
 
				-
			
 
				-  "99:                                         \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+d"(pixel_temp),  // %2
			
 
				-    "+r"(pix)         // %3
			
 
				-  : "r"(shuffler)      // %4
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHUFFLEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_I422TOYUY2ROW_SSE2
			
 
				-void I422ToYUY2Row_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_frame, int width) {
			
 
				- asm volatile (
			
 
				-    "sub       %1,%2                             \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
			
 
				-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1              \n"
			
 
				-    "punpcklbw %%xmm3,%%xmm2                     \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0             \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                     \n"
			
 
				-    "punpcklbw %%xmm2,%%xmm0                     \n"
			
 
				-    "punpckhbw %%xmm2,%%xmm1                     \n"
			
 
				-    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
			
 
				-    "lea       " MEMLEA(0x20,3) ",%3             \n"
			
 
				-    "sub       $0x10,%4                          \n"
			
 
				-    "jg         1b                               \n"
			
 
				-    : "+r"(src_y),  // %0
			
 
				-      "+r"(src_u),  // %1
			
 
				-      "+r"(src_v),  // %2
			
 
				-      "+r"(dst_frame),  // %3
			
 
				-      "+rm"(width)  // %4
			
 
				-    :
			
 
				-    : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_I422TOYUY2ROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_I422TOUYVYROW_SSE2
			
 
				-void I422ToUYVYRow_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_frame, int width) {
			
 
				- asm volatile (
			
 
				-    "sub        %1,%2                            \n"
			
 
				-    LABELALIGN
			
 
				-  "1:                                            \n"
			
 
				-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
			
 
				-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1              \n"
			
 
				-    "punpcklbw %%xmm3,%%xmm2                     \n"
			
 
				-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
			
 
				-    "movdqa    %%xmm2,%%xmm1                     \n"
			
 
				-    "lea       " MEMLEA(0x10,0) ",%0             \n"
			
 
				-    "punpcklbw %%xmm0,%%xmm1                     \n"
			
 
				-    "punpckhbw %%xmm0,%%xmm2                     \n"
			
 
				-    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
			
 
				-    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
			
 
				-    "lea       " MEMLEA(0x20,3) ",%3             \n"
			
 
				-    "sub       $0x10,%4                          \n"
			
 
				-    "jg         1b                               \n"
			
 
				-    : "+r"(src_y),  // %0
			
 
				-      "+r"(src_u),  // %1
			
 
				-      "+r"(src_v),  // %2
			
 
				-      "+r"(dst_frame),  // %3
			
 
				-      "+rm"(width)  // %4
			
 
				-    :
			
 
				-    : "memory", "cc"
			
 
				-#if defined(__native_client__) && defined(__x86_64__)
			
 
				-    , "r14"
			
 
				-#endif
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_I422TOUYVYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
			
 
				-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width) {
			
 
				-  asm volatile (
			
 
				-    "pxor      %%xmm3,%%xmm3                   \n"
			
 
				-
			
 
				-    // 2 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
			
 
				-    "lea       " MEMLEA(0x8,0) ",%0            \n"
			
 
				-    "punpcklbw %%xmm3,%%xmm0                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm4                   \n"
			
 
				-    "punpcklwd %%xmm3,%%xmm0                   \n"
			
 
				-    "punpckhwd %%xmm3,%%xmm4                   \n"
			
 
				-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
			
 
				-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
			
 
				-    "movdqa    %%xmm0,%%xmm1                   \n"
			
 
				-    "movdqa    %%xmm4,%%xmm5                   \n"
			
 
				-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
			
 
				-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
			
 
				-    "addps     " MEMACCESS(3) ",%%xmm0         \n"
			
 
				-    "addps     " MEMACCESS(3) ",%%xmm4         \n"
			
 
				-    "movdqa    %%xmm1,%%xmm2                   \n"
			
 
				-    "movdqa    %%xmm5,%%xmm6                   \n"
			
 
				-    "mulps     %%xmm1,%%xmm2                   \n"
			
 
				-    "mulps     %%xmm5,%%xmm6                   \n"
			
 
				-    "mulps     %%xmm2,%%xmm1                   \n"
			
 
				-    "mulps     %%xmm6,%%xmm5                   \n"
			
 
				-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
			
 
				-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
			
 
				-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
			
 
				-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
			
 
				-    "addps     %%xmm2,%%xmm0                   \n"
			
 
				-    "addps     %%xmm6,%%xmm4                   \n"
			
 
				-    "addps     %%xmm1,%%xmm0                   \n"
			
 
				-    "addps     %%xmm5,%%xmm4                   \n"
			
 
				-    "cvttps2dq %%xmm0,%%xmm0                   \n"
			
 
				-    "cvttps2dq %%xmm4,%%xmm4                   \n"
			
 
				-    "packuswb  %%xmm4,%%xmm0                   \n"
			
 
				-    "packuswb  %%xmm0,%%xmm0                   \n"
			
 
				-    "sub       $0x2,%2                         \n"
			
 
				-    "movq      %%xmm0," MEMACCESS(1) "         \n"
			
 
				-    "lea       " MEMLEA(0x8,1) ",%1            \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(width)      // %2
			
 
				-  : "r"(poly)        // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
			
 
				-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width) {
			
 
				-  asm volatile (
			
 
				-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
			
 
				-    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
			
 
				-    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
			
 
				-    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
			
 
				-
			
 
				-    // 2 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
			
 
				-    "lea         " MEMLEA(0x8,0) ",%0          \n"
			
 
				-    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
			
 
				-    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
			
 
				-    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
			
 
				-    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
			
 
				-    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
			
 
				-    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
			
 
				-    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
			
 
				-    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
			
 
				-    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
			
 
				-    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
			
 
				-    "sub         $0x2,%2                       \n"
			
 
				-    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
			
 
				-    "lea         " MEMLEA(0x8,1) ",%1          \n"
			
 
				-    "jg          1b                            \n"
			
 
				-    "vzeroupper                                \n"
			
 
				-  : "+r"(src_argb),  // %0
			
 
				-    "+r"(dst_argb),  // %1
			
 
				-    "+r"(width)      // %2
			
 
				-  : "r"(poly)        // %3
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-// TODO(fbarchard): declare ymm usage when applicable.
			
 
				-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOLORTABLEROW_X86
			
 
				-// Tranform ARGB pixels with color table.
			
 
				-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
			
 
				-                           int width) {
			
 
				-  uintptr_t pixel_temp = 0u;
			
 
				-  asm volatile (
			
 
				-    // 1 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movzb     " MEMACCESS(0) ",%1             \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
			
 
				-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
			
 
				-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
			
 
				-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
			
 
				-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
			
 
				-    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
			
 
				-    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
			
 
				-    "dec       %2                              \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(dst_argb),   // %0
			
 
				-    "+d"(pixel_temp), // %1
			
 
				-    "+r"(width)       // %2
			
 
				-  : "r"(table_argb)   // %3
			
 
				-  : "memory", "cc");
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOLORTABLEROW_X86
			
 
				-
			
 
				-#ifdef HAS_RGBCOLORTABLEROW_X86
			
 
				-// Tranform RGB pixels with color table.
			
 
				-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
			
 
				-  uintptr_t pixel_temp = 0u;
			
 
				-  asm volatile (
			
 
				-    // 1 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movzb     " MEMACCESS(0) ",%1             \n"
			
 
				-    "lea       " MEMLEA(0x4,0) ",%0            \n"
			
 
				-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
			
 
				-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
			
 
				-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
			
 
				-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
			
 
				-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
			
 
				-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
			
 
				-    "dec       %2                              \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+r"(dst_argb),   // %0
			
 
				-    "+d"(pixel_temp), // %1
			
 
				-    "+r"(width)       // %2
			
 
				-  : "r"(table_argb)   // %3
			
 
				-  : "memory", "cc");
			
 
				-}
			
 
				-#endif  // HAS_RGBCOLORTABLEROW_X86
			
 
				-
			
 
				-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
			
 
				-// Tranform RGB pixels with luma table.
			
 
				-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                 int width,
			
 
				-                                 const uint8* luma, uint32 lumacoeff) {
			
 
				-  uintptr_t pixel_temp = 0u;
			
 
				-  uintptr_t table_temp = 0u;
			
 
				-  asm volatile (
			
 
				-    "movd      %6,%%xmm3                       \n"
			
 
				-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
			
 
				-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
			
 
				-    "psllw     $0x8,%%xmm4                     \n"
			
 
				-    "pxor      %%xmm5,%%xmm5                   \n"
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    LABELALIGN
			
 
				-  "1:                                          \n"
			
 
				-    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
			
 
				-    "pmaddubsw %%xmm3,%%xmm0                   \n"
			
 
				-    "phaddw    %%xmm0,%%xmm0                   \n"
			
 
				-    "pand      %%xmm4,%%xmm0                   \n"
			
 
				-    "punpcklwd %%xmm5,%%xmm0                   \n"
			
 
				-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
			
 
				-    "add       %5,%1                           \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-
			
 
				-    "movzb     " MEMACCESS(2) ",%0             \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS(3) "            \n"
			
 
				-    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
			
 
				-    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
			
 
				-
			
 
				-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
			
 
				-    "add       %5,%1                           \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-
			
 
				-    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
			
 
				-    BUNDLEALIGN
			
 
				-    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
			
 
				-    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
			
 
				-
			
 
				-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
			
 
				-    "add       %5,%1                           \n"
			
 
				-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
			
 
				-
			
 
				-    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
			
 
				-    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
			
 
				-
			
 
				-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
			
 
				-    "add       %5,%1                           \n"
			
 
				-
			
 
				-    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
			
 
				-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
			
 
				-    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
			
 
				-    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
			
 
				-    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
			
 
				-    "sub       $0x4,%4                         \n"
			
 
				-    "lea       " MEMLEA(0x10,2) ",%2           \n"
			
 
				-    "lea       " MEMLEA(0x10,3) ",%3           \n"
			
 
				-    "jg        1b                              \n"
			
 
				-  : "+d"(pixel_temp),  // %0
			
 
				-    "+a"(table_temp),  // %1
			
 
				-    "+r"(src_argb),    // %2
			
 
				-    "+r"(dst_argb),    // %3
			
 
				-    "+rm"(width)       // %4
			
 
				-  : "r"(luma),         // %5
			
 
				-    "rm"(lumacoeff)    // %6
			
 
				-  : "memory", "cc"
			
 
				-#if defined(__SSE2__)
			
 
				-    , "xmm0", "xmm3", "xmm4", "xmm5"
			
 
				-#endif
			
 
				-  );
			
 
				-}
			
 
				-#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
			
 
				-
			
 
				-#endif  // defined(__x86_64__) || defined(__i386__)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_win.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_win.cc
@@ -1,7284 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// This module is for Visual C x86.
			
 
				-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYROW_SSSE3
			
 
				-
			
 
				-// Constants for ARGB.
			
 
				-static const vec8 kARGBToY = {
			
 
				-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
			
 
				-};
			
 
				-
			
 
				-// JPeg full range.
			
 
				-static const vec8 kARGBToYJ = {
			
 
				-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToU = {
			
 
				-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToUJ = {
			
 
				-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToV = {
			
 
				-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToVJ = {
			
 
				-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
			
 
				-};
			
 
				-
			
 
				-// vpermd for vphaddw + vpackuswb vpermd.
			
 
				-static const lvec32 kPermdARGBToY_AVX = {
			
 
				-  0, 4, 1, 5, 2, 6, 3, 7
			
 
				-};
			
 
				-
			
 
				-// vpshufb for vphaddw + vpackuswb packed to shorts.
			
 
				-static const lvec8 kShufARGBToUV_AVX = {
			
 
				-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
			
 
				-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
			
 
				-};
			
 
				-
			
 
				-// Constants for BGRA.
			
 
				-static const vec8 kBGRAToY = {
			
 
				-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
			
 
				-};
			
 
				-
			
 
				-static const vec8 kBGRAToU = {
			
 
				-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
			
 
				-};
			
 
				-
			
 
				-static const vec8 kBGRAToV = {
			
 
				-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
			
 
				-};
			
 
				-
			
 
				-// Constants for ABGR.
			
 
				-static const vec8 kABGRToY = {
			
 
				-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kABGRToU = {
			
 
				-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kABGRToV = {
			
 
				-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
			
 
				-};
			
 
				-
			
 
				-// Constants for RGBA.
			
 
				-static const vec8 kRGBAToY = {
			
 
				-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
			
 
				-};
			
 
				-
			
 
				-static const vec8 kRGBAToU = {
			
 
				-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
			
 
				-};
			
 
				-
			
 
				-static const vec8 kRGBAToV = {
			
 
				-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
			
 
				-};
			
 
				-
			
 
				-static const uvec8 kAddY16 = {
			
 
				-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
			
 
				-};
			
 
				-
			
 
				-static const vec16 kAddYJ64 = {
			
 
				-  64, 64, 64, 64, 64, 64, 64, 64
			
 
				-};
			
 
				-
			
 
				-static const uvec8 kAddUV128 = {
			
 
				-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
			
 
				-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-static const uvec16 kAddUVJ128 = {
			
 
				-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting RGB24 to ARGB.
			
 
				-static const uvec8 kShuffleMaskRGB24ToARGB = {
			
 
				-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting RAW to ARGB.
			
 
				-static const uvec8 kShuffleMaskRAWToARGB = {
			
 
				-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RGB24.
			
 
				-static const uvec8 kShuffleMaskARGBToRGB24 = {
			
 
				-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RAW.
			
 
				-static const uvec8 kShuffleMaskARGBToRAW = {
			
 
				-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
			
 
				-static const uvec8 kShuffleMaskARGBToRGB24_0 = {
			
 
				-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
			
 
				-};
			
 
				-
			
 
				-// Shuffle table for converting ARGB to RAW.
			
 
				-static const uvec8 kShuffleMaskARGBToRAW_0 = {
			
 
				-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
			
 
				-};
			
 
				-
			
 
				-// Duplicates gray value 3 times and fills in alpha opaque.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]        // src_y
			
 
				-    mov        edx, [esp + 8]        // dst_argb
			
 
				-    mov        ecx, [esp + 12]       // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
			
 
				-    pslld      xmm5, 24
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movq       xmm0, qword ptr [eax]
			
 
				-    lea        eax,  [eax + 8]
			
 
				-    punpcklbw  xmm0, xmm0
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm0
			
 
				-    punpckhwd  xmm1, xmm1
			
 
				-    por        xmm0, xmm5
			
 
				-    por        xmm1, xmm5
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
			
 
				-                                  int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]        // src_y
			
 
				-    mov        edx, [esp + 8]        // dst_argb
			
 
				-    mov        ecx, [esp + 12]       // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
			
 
				-    pslld      xmm5, 24
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movq       xmm0, qword ptr [eax]
			
 
				-    lea        eax,  [eax + 8]
			
 
				-    punpcklbw  xmm0, xmm0
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm0
			
 
				-    punpckhwd  xmm1, xmm1
			
 
				-    por        xmm0, xmm5
			
 
				-    por        xmm1, xmm5
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_rgb24
			
 
				-    mov       edx, [esp + 8]   // dst_argb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
			
 
				-    pslld     xmm5, 24
			
 
				-    movdqa    xmm4, kShuffleMaskRGB24ToARGB
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]
			
 
				-    movdqu    xmm1, [eax + 16]
			
 
				-    movdqu    xmm3, [eax + 32]
			
 
				-    lea       eax, [eax + 48]
			
 
				-    movdqa    xmm2, xmm3
			
 
				-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
			
 
				-    pshufb    xmm2, xmm4
			
 
				-    por       xmm2, xmm5
			
 
				-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
			
 
				-    pshufb    xmm0, xmm4
			
 
				-    movdqa    [edx + 32], xmm2
			
 
				-    por       xmm0, xmm5
			
 
				-    pshufb    xmm1, xmm4
			
 
				-    movdqa    [edx], xmm0
			
 
				-    por       xmm1, xmm5
			
 
				-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
			
 
				-    pshufb    xmm3, xmm4
			
 
				-    movdqa    [edx + 16], xmm1
			
 
				-    por       xmm3, xmm5
			
 
				-    sub       ecx, 16
			
 
				-    movdqa    [edx + 48], xmm3
			
 
				-    lea       edx, [edx + 64]
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
			
 
				-                        int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_raw
			
 
				-    mov       edx, [esp + 8]   // dst_argb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
			
 
				-    pslld     xmm5, 24
			
 
				-    movdqa    xmm4, kShuffleMaskRAWToARGB
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]
			
 
				-    movdqu    xmm1, [eax + 16]
			
 
				-    movdqu    xmm3, [eax + 32]
			
 
				-    lea       eax, [eax + 48]
			
 
				-    movdqa    xmm2, xmm3
			
 
				-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
			
 
				-    pshufb    xmm2, xmm4
			
 
				-    por       xmm2, xmm5
			
 
				-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
			
 
				-    pshufb    xmm0, xmm4
			
 
				-    movdqa    [edx + 32], xmm2
			
 
				-    por       xmm0, xmm5
			
 
				-    pshufb    xmm1, xmm4
			
 
				-    movdqa    [edx], xmm0
			
 
				-    por       xmm1, xmm5
			
 
				-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
			
 
				-    pshufb    xmm3, xmm4
			
 
				-    movdqa    [edx + 16], xmm1
			
 
				-    por       xmm3, xmm5
			
 
				-    sub       ecx, 16
			
 
				-    movdqa    [edx + 48], xmm3
			
 
				-    lea       edx, [edx + 64]
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// pmul method to replicate bits.
			
 
				-// Math to replicate bits:
			
 
				-// (v << 8) | (v << 3)
			
 
				-// v * 256 + v * 8
			
 
				-// v * (256 + 8)
			
 
				-// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
			
 
				-// 20 instructions.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
			
 
				-                          int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
			
 
				-    movd      xmm5, eax
			
 
				-    pshufd    xmm5, xmm5, 0
			
 
				-    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
			
 
				-    movd      xmm6, eax
			
 
				-    pshufd    xmm6, xmm6, 0
			
 
				-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
			
 
				-    psllw     xmm3, 11
			
 
				-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
			
 
				-    psllw     xmm4, 10
			
 
				-    psrlw     xmm4, 5
			
 
				-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
			
 
				-    psllw     xmm7, 8
			
 
				-
			
 
				-    mov       eax, [esp + 4]   // src_rgb565
			
 
				-    mov       edx, [esp + 8]   // dst_argb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    sub       edx, eax
			
 
				-    sub       edx, eax
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    movdqa    xmm2, xmm0
			
 
				-    pand      xmm1, xmm3    // R in upper 5 bits
			
 
				-    psllw     xmm2, 11      // B in upper 5 bits
			
 
				-    pmulhuw   xmm1, xmm5    // * (256 + 8)
			
 
				-    pmulhuw   xmm2, xmm5    // * (256 + 8)
			
 
				-    psllw     xmm1, 8
			
 
				-    por       xmm1, xmm2    // RB
			
 
				-    pand      xmm0, xmm4    // G in middle 6 bits
			
 
				-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
			
 
				-    por       xmm0, xmm7    // AG
			
 
				-    movdqa    xmm2, xmm1
			
 
				-    punpcklbw xmm1, xmm0
			
 
				-    punpckhbw xmm2, xmm0
			
 
				-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
			
 
				-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
			
 
				-    lea       eax, [eax + 16]
			
 
				-    sub       ecx, 8
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 24 instructions
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
			
 
				-                            int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
			
 
				-    movd      xmm5, eax
			
 
				-    pshufd    xmm5, xmm5, 0
			
 
				-    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
			
 
				-    movd      xmm6, eax
			
 
				-    pshufd    xmm6, xmm6, 0
			
 
				-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
			
 
				-    psllw     xmm3, 11
			
 
				-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
			
 
				-    psrlw     xmm4, 6
			
 
				-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
			
 
				-    psllw     xmm7, 8
			
 
				-
			
 
				-    mov       eax, [esp + 4]   // src_argb1555
			
 
				-    mov       edx, [esp + 8]   // dst_argb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    sub       edx, eax
			
 
				-    sub       edx, eax
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    movdqa    xmm2, xmm0
			
 
				-    psllw     xmm1, 1       // R in upper 5 bits
			
 
				-    psllw     xmm2, 11      // B in upper 5 bits
			
 
				-    pand      xmm1, xmm3
			
 
				-    pmulhuw   xmm2, xmm5    // * (256 + 8)
			
 
				-    pmulhuw   xmm1, xmm5    // * (256 + 8)
			
 
				-    psllw     xmm1, 8
			
 
				-    por       xmm1, xmm2    // RB
			
 
				-    movdqa    xmm2, xmm0
			
 
				-    pand      xmm0, xmm4    // G in middle 5 bits
			
 
				-    psraw     xmm2, 8       // A
			
 
				-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
			
 
				-    pand      xmm2, xmm7
			
 
				-    por       xmm0, xmm2    // AG
			
 
				-    movdqa    xmm2, xmm1
			
 
				-    punpcklbw xmm1, xmm0
			
 
				-    punpckhbw xmm2, xmm0
			
 
				-    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
			
 
				-    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
			
 
				-    lea       eax, [eax + 16]
			
 
				-    sub       ecx, 8
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 18 instructions.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
			
 
				-                            int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
			
 
				-    movd      xmm4, eax
			
 
				-    pshufd    xmm4, xmm4, 0
			
 
				-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
			
 
				-    pslld     xmm5, 4
			
 
				-    mov       eax, [esp + 4]   // src_argb4444
			
 
				-    mov       edx, [esp + 8]   // dst_argb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    sub       edx, eax
			
 
				-    sub       edx, eax
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
			
 
				-    movdqa    xmm2, xmm0
			
 
				-    pand      xmm0, xmm4    // mask low nibbles
			
 
				-    pand      xmm2, xmm5    // mask high nibbles
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    movdqa    xmm3, xmm2
			
 
				-    psllw     xmm1, 4
			
 
				-    psrlw     xmm3, 4
			
 
				-    por       xmm0, xmm1
			
 
				-    por       xmm2, xmm3
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    punpcklbw xmm0, xmm2
			
 
				-    punpckhbw xmm1, xmm2
			
 
				-    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
			
 
				-    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
			
 
				-    lea       eax, [eax + 16]
			
 
				-    sub       ecx, 8
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_argb
			
 
				-    mov       edx, [esp + 8]   // dst_rgb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    movdqa    xmm6, kShuffleMaskARGBToRGB24
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
			
 
				-    movdqu    xmm1, [eax + 16]
			
 
				-    movdqu    xmm2, [eax + 32]
			
 
				-    movdqu    xmm3, [eax + 48]
			
 
				-    lea       eax, [eax + 64]
			
 
				-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
			
 
				-    pshufb    xmm1, xmm6
			
 
				-    pshufb    xmm2, xmm6
			
 
				-    pshufb    xmm3, xmm6
			
 
				-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
			
 
				-    psrldq    xmm1, 4      // 8 bytes from 1
			
 
				-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
			
 
				-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
			
 
				-    por       xmm0, xmm4   // 4 bytes from 1 for 0
			
 
				-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
			
 
				-    movdqu    [edx], xmm0  // store 0
			
 
				-    por       xmm1, xmm5   // 8 bytes from 2 for 1
			
 
				-    psrldq    xmm2, 8      // 4 bytes from 2
			
 
				-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
			
 
				-    por       xmm2, xmm3   // 12 bytes from 3 for 2
			
 
				-    movdqu    [edx + 16], xmm1   // store 1
			
 
				-    movdqu    [edx + 32], xmm2   // store 2
			
 
				-    lea       edx, [edx + 48]
			
 
				-    sub       ecx, 16
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_argb
			
 
				-    mov       edx, [esp + 8]   // dst_rgb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    movdqa    xmm6, kShuffleMaskARGBToRAW
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
			
 
				-    movdqu    xmm1, [eax + 16]
			
 
				-    movdqu    xmm2, [eax + 32]
			
 
				-    movdqu    xmm3, [eax + 48]
			
 
				-    lea       eax, [eax + 64]
			
 
				-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
			
 
				-    pshufb    xmm1, xmm6
			
 
				-    pshufb    xmm2, xmm6
			
 
				-    pshufb    xmm3, xmm6
			
 
				-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
			
 
				-    psrldq    xmm1, 4      // 8 bytes from 1
			
 
				-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
			
 
				-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
			
 
				-    por       xmm0, xmm4   // 4 bytes from 1 for 0
			
 
				-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
			
 
				-    movdqu    [edx], xmm0  // store 0
			
 
				-    por       xmm1, xmm5   // 8 bytes from 2 for 1
			
 
				-    psrldq    xmm2, 8      // 4 bytes from 2
			
 
				-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
			
 
				-    por       xmm2, xmm3   // 12 bytes from 3 for 2
			
 
				-    movdqu    [edx + 16], xmm1   // store 1
			
 
				-    movdqu    [edx + 32], xmm2   // store 2
			
 
				-    lea       edx, [edx + 48]
			
 
				-    sub       ecx, 16
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_argb
			
 
				-    mov       edx, [esp + 8]   // dst_rgb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
			
 
				-    psrld     xmm3, 27
			
 
				-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
			
 
				-    psrld     xmm4, 26
			
 
				-    pslld     xmm4, 5
			
 
				-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
			
 
				-    pslld     xmm5, 11
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
			
 
				-    movdqa    xmm1, xmm0    // B
			
 
				-    movdqa    xmm2, xmm0    // G
			
 
				-    pslld     xmm0, 8       // R
			
 
				-    psrld     xmm1, 3       // B
			
 
				-    psrld     xmm2, 5       // G
			
 
				-    psrad     xmm0, 16      // R
			
 
				-    pand      xmm1, xmm3    // B
			
 
				-    pand      xmm2, xmm4    // G
			
 
				-    pand      xmm0, xmm5    // R
			
 
				-    por       xmm1, xmm2    // BG
			
 
				-    por       xmm0, xmm1    // BGR
			
 
				-    packssdw  xmm0, xmm0
			
 
				-    lea       eax, [eax + 16]
			
 
				-    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
			
 
				-    lea       edx, [edx + 8]
			
 
				-    sub       ecx, 4
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// TODO(fbarchard): Improve sign extension/packing.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_argb
			
 
				-    mov       edx, [esp + 8]   // dst_rgb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
			
 
				-    psrld     xmm4, 27
			
 
				-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
			
 
				-    pslld     xmm5, 5
			
 
				-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
			
 
				-    pslld     xmm6, 10
			
 
				-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
			
 
				-    pslld     xmm7, 15
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
			
 
				-    movdqa    xmm1, xmm0    // B
			
 
				-    movdqa    xmm2, xmm0    // G
			
 
				-    movdqa    xmm3, xmm0    // R
			
 
				-    psrad     xmm0, 16      // A
			
 
				-    psrld     xmm1, 3       // B
			
 
				-    psrld     xmm2, 6       // G
			
 
				-    psrld     xmm3, 9       // R
			
 
				-    pand      xmm0, xmm7    // A
			
 
				-    pand      xmm1, xmm4    // B
			
 
				-    pand      xmm2, xmm5    // G
			
 
				-    pand      xmm3, xmm6    // R
			
 
				-    por       xmm0, xmm1    // BA
			
 
				-    por       xmm2, xmm3    // GR
			
 
				-    por       xmm0, xmm2    // BGRA
			
 
				-    packssdw  xmm0, xmm0
			
 
				-    lea       eax, [eax + 16]
			
 
				-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
			
 
				-    lea       edx, [edx + 8]
			
 
				-    sub       ecx, 4
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src_argb
			
 
				-    mov       edx, [esp + 8]   // dst_rgb
			
 
				-    mov       ecx, [esp + 12]  // pix
			
 
				-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
			
 
				-    psllw     xmm4, 12
			
 
				-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
			
 
				-    psrlw     xmm3, 8
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
			
 
				-    movdqa    xmm1, xmm0
			
 
				-    pand      xmm0, xmm3    // low nibble
			
 
				-    pand      xmm1, xmm4    // high nibble
			
 
				-    psrl      xmm0, 4
			
 
				-    psrl      xmm1, 8
			
 
				-    por       xmm0, xmm1
			
 
				-    packuswb  xmm0, xmm0
			
 
				-    lea       eax, [eax + 16]
			
 
				-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
			
 
				-    lea       edx, [edx + 8]
			
 
				-    sub       ecx, 4
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kARGBToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm4, kARGBToYJ
			
 
				-    movdqa     xmm5, kAddYJ64
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    paddw      xmm0, xmm5  // Add .5 for rounding.
			
 
				-    paddw      xmm2, xmm5
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYROW_AVX2
			
 
				-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
			
 
				-__declspec(naked) __declspec(align(32))
			
 
				-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    vbroadcastf128 ymm4, kARGBToY
			
 
				-    vbroadcastf128 ymm5, kAddY16
			
 
				-    vmovdqa    ymm6, kPermdARGBToY_AVX
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    vmovdqu    ymm2, [eax + 64]
			
 
				-    vmovdqu    ymm3, [eax + 96]
			
 
				-    vpmaddubsw ymm0, ymm0, ymm4
			
 
				-    vpmaddubsw ymm1, ymm1, ymm4
			
 
				-    vpmaddubsw ymm2, ymm2, ymm4
			
 
				-    vpmaddubsw ymm3, ymm3, ymm4
			
 
				-    lea        eax, [eax + 128]
			
 
				-    vphaddw    ymm0, ymm0, ymm1  // mutates.
			
 
				-    vphaddw    ymm2, ymm2, ymm3
			
 
				-    vpsrlw     ymm0, ymm0, 7
			
 
				-    vpsrlw     ymm2, ymm2, 7
			
 
				-    vpackuswb  ymm0, ymm0, ymm2  // mutates.
			
 
				-    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
			
 
				-    vpaddb     ymm0, ymm0, ymm5
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  //  HAS_ARGBTOYROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBTOYROW_AVX2
			
 
				-// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
			
 
				-__declspec(naked) __declspec(align(32))
			
 
				-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    vbroadcastf128 ymm4, kARGBToYJ
			
 
				-    vbroadcastf128 ymm5, kAddYJ64
			
 
				-    vmovdqa    ymm6, kPermdARGBToY_AVX
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    vmovdqu    ymm2, [eax + 64]
			
 
				-    vmovdqu    ymm3, [eax + 96]
			
 
				-    vpmaddubsw ymm0, ymm0, ymm4
			
 
				-    vpmaddubsw ymm1, ymm1, ymm4
			
 
				-    vpmaddubsw ymm2, ymm2, ymm4
			
 
				-    vpmaddubsw ymm3, ymm3, ymm4
			
 
				-    lea        eax, [eax + 128]
			
 
				-    vphaddw    ymm0, ymm0, ymm1  // mutates.
			
 
				-    vphaddw    ymm2, ymm2, ymm3
			
 
				-    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
			
 
				-    vpaddw     ymm2, ymm2, ymm5
			
 
				-    vpsrlw     ymm0, ymm0, 7
			
 
				-    vpsrlw     ymm2, ymm2, 7
			
 
				-    vpackuswb  ymm0, ymm0, ymm2  // mutates.
			
 
				-    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  //  HAS_ARGBTOYJROW_AVX2
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kARGBToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm4, kARGBToYJ
			
 
				-    movdqa     xmm5, kAddYJ64
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    paddw      xmm0, xmm5
			
 
				-    paddw      xmm2, xmm5
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kBGRAToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kBGRAToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kABGRToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kABGRToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kRGBAToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_y */
			
 
				-    mov        ecx, [esp + 12]  /* pix */
			
 
				-    movdqa     xmm5, kAddY16
			
 
				-    movdqa     xmm4, kRGBAToY
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm2, xmm4
			
 
				-    pmaddubsw  xmm3, xmm4
			
 
				-    lea        eax, [eax + 64]
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm2, 7
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pavgb      xmm0, [eax + esi]
			
 
				-    pavgb      xmm1, [eax + esi + 16]
			
 
				-    pavgb      xmm2, [eax + esi + 32]
			
 
				-    pavgb      xmm3, [eax + esi + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                        uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kARGBToUJ
			
 
				-    movdqa     xmm6, kARGBToVJ
			
 
				-    movdqa     xmm5, kAddUVJ128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pavgb      xmm0, [eax + esi]
			
 
				-    pavgb      xmm1, [eax + esi + 16]
			
 
				-    pavgb      xmm2, [eax + esi + 32]
			
 
				-    pavgb      xmm3, [eax + esi + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
			
 
				-    paddw      xmm1, xmm5
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_ARGBTOUVROW_AVX2
			
 
				-__declspec(naked) __declspec(align(32))
			
 
				-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
			
 
				-                      uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    vbroadcastf128 ymm5, kAddUV128
			
 
				-    vbroadcastf128 ymm6, kARGBToV
			
 
				-    vbroadcastf128 ymm7, kARGBToU
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    vmovdqu    ymm2, [eax + 64]
			
 
				-    vmovdqu    ymm3, [eax + 96]
			
 
				-    vpavgb     ymm0, ymm0, [eax + esi]
			
 
				-    vpavgb     ymm1, ymm1, [eax + esi + 32]
			
 
				-    vpavgb     ymm2, ymm2, [eax + esi + 64]
			
 
				-    vpavgb     ymm3, ymm3, [eax + esi + 96]
			
 
				-    lea        eax,  [eax + 128]
			
 
				-    vshufps    ymm4, ymm0, ymm1, 0x88
			
 
				-    vshufps    ymm0, ymm0, ymm1, 0xdd
			
 
				-    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
			
 
				-    vshufps    ymm4, ymm2, ymm3, 0x88
			
 
				-    vshufps    ymm2, ymm2, ymm3, 0xdd
			
 
				-    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
			
 
				-    vpmaddubsw ymm1, ymm0, ymm7  // U
			
 
				-    vpmaddubsw ymm3, ymm2, ymm7
			
 
				-    vpmaddubsw ymm0, ymm0, ymm6  // V
			
 
				-    vpmaddubsw ymm2, ymm2, ymm6
			
 
				-    vphaddw    ymm1, ymm1, ymm3  // mutates
			
 
				-    vphaddw    ymm0, ymm0, ymm2
			
 
				-    vpsraw     ymm1, ymm1, 8
			
 
				-    vpsraw     ymm0, ymm0, 8
			
 
				-    vpacksswb  ymm0, ymm1, ymm0  // mutates
			
 
				-    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
			
 
				-    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
			
 
				-    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 16 U and 16 V values
			
 
				-    sub         ecx, 32
			
 
				-    vextractf128 [edx], ymm0, 0 // U
			
 
				-    vextractf128 [edx + edi], ymm0, 1 // V
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOUVROW_AVX2
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    movdqu     xmm4, [eax + esi]
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 16]
			
 
				-    pavgb      xmm1, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 32]
			
 
				-    pavgb      xmm2, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 48]
			
 
				-    pavgb      xmm3, xmm4
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kARGBToUJ
			
 
				-    movdqa     xmm6, kARGBToVJ
			
 
				-    movdqa     xmm5, kAddUVJ128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    movdqu     xmm4, [eax + esi]
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 16]
			
 
				-    pavgb      xmm1, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 32]
			
 
				-    pavgb      xmm2, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 48]
			
 
				-    pavgb      xmm3, xmm4
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
			
 
				-    paddw      xmm1, xmm5
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb
			
 
				-    mov        edx, [esp + 4 + 8]   // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]  // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* convert to U and V */
			
 
				-    movdqa     xmm0, [eax]          // U
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm7
			
 
				-    pmaddubsw  xmm1, xmm7
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm3, xmm7
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm2, 8
			
 
				-    packsswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx,  16
			
 
				-    movdqa     [edx], xmm0
			
 
				-
			
 
				-    movdqa     xmm0, [eax]          // V
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm6
			
 
				-    pmaddubsw  xmm1, xmm6
			
 
				-    pmaddubsw  xmm2, xmm6
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm2, 8
			
 
				-    packsswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     [edx + edi], xmm0
			
 
				-    lea        edx,  [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
			
 
				-                                    uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb
			
 
				-    mov        edx, [esp + 4 + 8]   // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]  // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* convert to U and V */
			
 
				-    movdqu     xmm0, [eax]          // U
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm7
			
 
				-    pmaddubsw  xmm1, xmm7
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm3, xmm7
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm2, 8
			
 
				-    packsswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    sub        ecx,  16
			
 
				-    movdqu     [edx], xmm0
			
 
				-
			
 
				-    movdqu     xmm0, [eax]          // V
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    pmaddubsw  xmm0, xmm6
			
 
				-    pmaddubsw  xmm1, xmm6
			
 
				-    pmaddubsw  xmm2, xmm6
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    phaddw     xmm2, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm2, 8
			
 
				-    packsswb   xmm0, xmm2
			
 
				-    paddb      xmm0, xmm5
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqu     [edx + edi], xmm0
			
 
				-    lea        edx,  [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
			
 
				-                          uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb
			
 
				-    mov        edx, [esp + 4 + 8]   // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]  // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
			
 
				-                                    uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb
			
 
				-    mov        edx, [esp + 4 + 8]   // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]  // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]  // pix
			
 
				-    movdqa     xmm7, kARGBToU
			
 
				-    movdqa     xmm6, kARGBToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kBGRAToU
			
 
				-    movdqa     xmm6, kBGRAToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pavgb      xmm0, [eax + esi]
			
 
				-    pavgb      xmm1, [eax + esi + 16]
			
 
				-    pavgb      xmm2, [eax + esi + 32]
			
 
				-    pavgb      xmm3, [eax + esi + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kBGRAToU
			
 
				-    movdqa     xmm6, kBGRAToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    movdqu     xmm4, [eax + esi]
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 16]
			
 
				-    pavgb      xmm1, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 32]
			
 
				-    pavgb      xmm2, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 48]
			
 
				-    pavgb      xmm3, xmm4
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kABGRToU
			
 
				-    movdqa     xmm6, kABGRToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pavgb      xmm0, [eax + esi]
			
 
				-    pavgb      xmm1, [eax + esi + 16]
			
 
				-    pavgb      xmm2, [eax + esi + 32]
			
 
				-    pavgb      xmm3, [eax + esi + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kABGRToU
			
 
				-    movdqa     xmm6, kABGRToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    movdqu     xmm4, [eax + esi]
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 16]
			
 
				-    pavgb      xmm1, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 32]
			
 
				-    pavgb      xmm2, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 48]
			
 
				-    pavgb      xmm3, xmm4
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                       uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kRGBAToU
			
 
				-    movdqa     xmm6, kRGBAToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-    pavgb      xmm0, [eax + esi]
			
 
				-    pavgb      xmm1, [eax + esi + 16]
			
 
				-    pavgb      xmm2, [eax + esi + 32]
			
 
				-    pavgb      xmm3, [eax + esi + 48]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
			
 
				-                                 uint8* dst_u, uint8* dst_v, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb
			
 
				-    mov        esi, [esp + 8 + 8]   // src_stride_argb
			
 
				-    mov        edx, [esp + 8 + 12]  // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]  // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]  // pix
			
 
				-    movdqa     xmm7, kRGBAToU
			
 
				-    movdqa     xmm6, kRGBAToV
			
 
				-    movdqa     xmm5, kAddUV128
			
 
				-    sub        edi, edx             // stride from u to v
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + 32]
			
 
				-    movdqu     xmm3, [eax + 48]
			
 
				-    movdqu     xmm4, [eax + esi]
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 16]
			
 
				-    pavgb      xmm1, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 32]
			
 
				-    pavgb      xmm2, xmm4
			
 
				-    movdqu     xmm4, [eax + esi + 48]
			
 
				-    pavgb      xmm3, xmm4
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    shufps     xmm0, xmm1, 0x88
			
 
				-    shufps     xmm4, xmm1, 0xdd
			
 
				-    pavgb      xmm0, xmm4
			
 
				-    movdqa     xmm4, xmm2
			
 
				-    shufps     xmm2, xmm3, 0x88
			
 
				-    shufps     xmm4, xmm3, 0xdd
			
 
				-    pavgb      xmm2, xmm4
			
 
				-
			
 
				-    // step 2 - convert to U and V
			
 
				-    // from here down is very similar to Y code except
			
 
				-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    pmaddubsw  xmm0, xmm7  // U
			
 
				-    pmaddubsw  xmm2, xmm7
			
 
				-    pmaddubsw  xmm1, xmm6  // V
			
 
				-    pmaddubsw  xmm3, xmm6
			
 
				-    phaddw     xmm0, xmm2
			
 
				-    phaddw     xmm1, xmm3
			
 
				-    psraw      xmm0, 8
			
 
				-    psraw      xmm1, 8
			
 
				-    packsswb   xmm0, xmm1
			
 
				-    paddb      xmm0, xmm5            // -> unsigned
			
 
				-
			
 
				-    // step 3 - store 8 U and 8 V values
			
 
				-    sub        ecx, 16
			
 
				-    movlps     qword ptr [edx], xmm0 // U
			
 
				-    movhps     qword ptr [edx + edi], xmm0 // V
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBTOYROW_SSSE3
			
 
				-
			
 
				-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
			
 
				-
			
 
				-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
			
 
				-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
			
 
				-#define UR 0
			
 
				-
			
 
				-#define VB 0
			
 
				-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
			
 
				-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
			
 
				-
			
 
				-// Bias
			
 
				-#define BB UB * 128 + VB * 128
			
 
				-#define BG UG * 128 + VG * 128
			
 
				-#define BR UR * 128 + VR * 128
			
 
				-
			
 
				-#ifdef HAS_I422TOARGBROW_AVX2
			
 
				-
			
 
				-static const lvec8 kUVToB_AVX = {
			
 
				-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
			
 
				-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
			
 
				-};
			
 
				-static const lvec8 kUVToR_AVX = {
			
 
				-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
			
 
				-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
			
 
				-};
			
 
				-static const lvec8 kUVToG_AVX = {
			
 
				-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
			
 
				-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
			
 
				-};
			
 
				-static const lvec16 kYToRgb_AVX = {
			
 
				-  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
			
 
				-};
			
 
				-static const lvec16 kYSub16_AVX = {
			
 
				-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
			
 
				-};
			
 
				-static const lvec16 kUVBiasB_AVX = {
			
 
				-  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
			
 
				-};
			
 
				-static const lvec16 kUVBiasG_AVX = {
			
 
				-  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
			
 
				-};
			
 
				-static const lvec16 kUVBiasR_AVX = {
			
 
				-  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
			
 
				-};
			
 
				-
			
 
				-// 16 pixels
			
 
				-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToARGBRow_AVX2(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // argb
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
			
 
				-    vpxor      ymm4, ymm4, ymm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovq      xmm0, qword ptr [esi]          //  U
			
 
				-    vmovq      xmm1, qword ptr [esi + edi]    //  V
			
 
				-    lea        esi,  [esi + 8]
			
 
				-    vpunpcklbw ymm0, ymm0, ymm1               // UV
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
			
 
				-    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
			
 
				-    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
			
 
				-    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
			
 
				-    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
			
 
				-    vpsubw     ymm1, ymm1, kUVBiasG_AVX
			
 
				-    vpsubw     ymm0, ymm0, kUVBiasR_AVX
			
 
				-
			
 
				-    // Step 2: Find Y contribution to 16 R,G,B values
			
 
				-    vmovdqu    xmm3, [eax]                  // NOLINT
			
 
				-    lea        eax, [eax + 16]
			
 
				-    vpermq     ymm3, ymm3, 0xd8
			
 
				-    vpunpcklbw ymm3, ymm3, ymm4
			
 
				-    vpsubsw    ymm3, ymm3, kYSub16_AVX
			
 
				-    vpmullw    ymm3, ymm3, kYToRgb_AVX
			
 
				-    vpaddsw    ymm2, ymm2, ymm3           // B += Y
			
 
				-    vpaddsw    ymm1, ymm1, ymm3           // G += Y
			
 
				-    vpaddsw    ymm0, ymm0, ymm3           // R += Y
			
 
				-    vpsraw     ymm2, ymm2, 6
			
 
				-    vpsraw     ymm1, ymm1, 6
			
 
				-    vpsraw     ymm0, ymm0, 6
			
 
				-    vpackuswb  ymm2, ymm2, ymm2           // B
			
 
				-    vpackuswb  ymm1, ymm1, ymm1           // G
			
 
				-    vpackuswb  ymm0, ymm0, ymm0           // R
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    vpunpcklbw ymm2, ymm2, ymm1           // BG
			
 
				-    vpermq     ymm2, ymm2, 0xd8
			
 
				-    vpunpcklbw ymm0, ymm0, ymm5           // RA
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
			
 
				-    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
			
 
				-    vmovdqu    [edx], ymm1
			
 
				-    vmovdqu    [edx + 32], ymm2
			
 
				-    lea        edx,  [edx + 64]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-    vzeroupper
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_I422TOARGBROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_I422TOARGBROW_SSSE3
			
 
				-
			
 
				-static const vec8 kUVToB = {
			
 
				-  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
			
 
				-};
			
 
				-
			
 
				-static const vec8 kUVToR = {
			
 
				-  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
			
 
				-};
			
 
				-
			
 
				-static const vec8 kUVToG = {
			
 
				-  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
			
 
				-};
			
 
				-
			
 
				-static const vec8 kVUToB = {
			
 
				-  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
			
 
				-};
			
 
				-
			
 
				-static const vec8 kVUToR = {
			
 
				-  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
			
 
				-};
			
 
				-
			
 
				-static const vec8 kVUToG = {
			
 
				-  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
			
 
				-};
			
 
				-
			
 
				-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
			
 
				-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
			
 
				-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
			
 
				-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
			
 
				-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
			
 
				-
			
 
				-// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
			
 
				-
			
 
				-// Read 8 UV from 444.
			
 
				-#define READYUV444 __asm {                                                     \
			
 
				-    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
			
 
				-    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
			
 
				-    __asm lea        esi,  [esi + 8]                                           \
			
 
				-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
			
 
				-  }
			
 
				-
			
 
				-// Read 4 UV from 422, upsample to 8 UV.
			
 
				-#define READYUV422 __asm {                                                     \
			
 
				-    __asm movd       xmm0, [esi]          /* U */                              \
			
 
				-    __asm movd       xmm1, [esi + edi]    /* V */                              \
			
 
				-    __asm lea        esi,  [esi + 4]                                           \
			
 
				-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
			
 
				-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
			
 
				-  }
			
 
				-
			
 
				-// Read 2 UV from 411, upsample to 8 UV.
			
 
				-#define READYUV411 __asm {                                                     \
			
 
				-    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
			
 
				-    __asm movd       xmm0, ebx                                                 \
			
 
				-    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
			
 
				-    __asm movd       xmm1, ebx                                                 \
			
 
				-    __asm lea        esi,  [esi + 2]                                           \
			
 
				-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
			
 
				-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
			
 
				-    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
			
 
				-  }
			
 
				-
			
 
				-// Read 4 UV from NV12, upsample to 8 UV.
			
 
				-#define READNV12 __asm {                                                       \
			
 
				-    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
			
 
				-    __asm lea        esi,  [esi + 8]                                           \
			
 
				-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
			
 
				-  }
			
 
				-
			
 
				-// Convert 8 pixels: 8 UV and 8 Y.
			
 
				-#define YUVTORGB __asm {                                                       \
			
 
				-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
			
 
				-    __asm movdqa     xmm1, xmm0                                                \
			
 
				-    __asm movdqa     xmm2, xmm0                                                \
			
 
				-    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
			
 
				-    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
			
 
				-    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
			
 
				-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
			
 
				-    __asm psubw      xmm1, kUVBiasG                                            \
			
 
				-    __asm psubw      xmm2, kUVBiasR                                            \
			
 
				-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
			
 
				-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
			
 
				-    __asm lea        eax, [eax + 8]                                            \
			
 
				-    __asm punpcklbw  xmm3, xmm4                                                \
			
 
				-    __asm psubsw     xmm3, kYSub16                                             \
			
 
				-    __asm pmullw     xmm3, kYToRgb                                             \
			
 
				-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
			
 
				-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
			
 
				-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
			
 
				-    __asm psraw      xmm0, 6                                                   \
			
 
				-    __asm psraw      xmm1, 6                                                   \
			
 
				-    __asm psraw      xmm2, 6                                                   \
			
 
				-    __asm packuswb   xmm0, xmm0           /* B */                              \
			
 
				-    __asm packuswb   xmm1, xmm1           /* G */                              \
			
 
				-    __asm packuswb   xmm2, xmm2           /* R */                              \
			
 
				-  }
			
 
				-
			
 
				-// Convert 8 pixels: 8 VU and 8 Y.
			
 
				-#define YVUTORGB __asm {                                                       \
			
 
				-    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
			
 
				-    __asm movdqa     xmm1, xmm0                                                \
			
 
				-    __asm movdqa     xmm2, xmm0                                                \
			
 
				-    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
			
 
				-    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
			
 
				-    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
			
 
				-    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
			
 
				-    __asm psubw      xmm1, kUVBiasG                                            \
			
 
				-    __asm psubw      xmm2, kUVBiasR                                            \
			
 
				-    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
			
 
				-    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
			
 
				-    __asm lea        eax, [eax + 8]                                            \
			
 
				-    __asm punpcklbw  xmm3, xmm4                                                \
			
 
				-    __asm psubsw     xmm3, kYSub16                                             \
			
 
				-    __asm pmullw     xmm3, kYToRgb                                             \
			
 
				-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
			
 
				-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
			
 
				-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
			
 
				-    __asm psraw      xmm0, 6                                                   \
			
 
				-    __asm psraw      xmm1, 6                                                   \
			
 
				-    __asm psraw      xmm2, 6                                                   \
			
 
				-    __asm packuswb   xmm0, xmm0           /* B */                              \
			
 
				-    __asm packuswb   xmm1, xmm1           /* G */                              \
			
 
				-    __asm packuswb   xmm2, xmm2           /* R */                              \
			
 
				-  }
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I444ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // argb
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV444
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
			
 
				-                          const uint8* u_buf,
			
 
				-                          const uint8* v_buf,
			
 
				-                          uint8* dst_rgb24,
			
 
				-                          int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // rgb24
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
			
 
				-    movdqa     xmm6, kShuffleMaskARGBToRGB24
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into RRGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm2           // RR
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
			
 
				-    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
			
 
				-    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
			
 
				-    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
			
 
				-    movq       qword ptr [edx], xmm0  // First 8 bytes
			
 
				-    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
			
 
				-    lea        edx,  [edx + 24]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToRAWRow_SSSE3(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* dst_raw,
			
 
				-                        int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // raw
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-    movdqa     xmm5, kShuffleMaskARGBToRAW_0
			
 
				-    movdqa     xmm6, kShuffleMaskARGBToRAW
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into RRGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm2           // RR
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
			
 
				-    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
			
 
				-    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
			
 
				-    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
			
 
				-    movq       qword ptr [edx], xmm0  // First 8 bytes
			
 
				-    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
			
 
				-    lea        edx,  [edx + 24]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest unaligned.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
			
 
				-                           const uint8* u_buf,
			
 
				-                           const uint8* v_buf,
			
 
				-                           uint8* rgb565_buf,
			
 
				-                           int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // rgb565
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
			
 
				-    psrld      xmm5, 27
			
 
				-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
			
 
				-    psrld      xmm6, 26
			
 
				-    pslld      xmm6, 5
			
 
				-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
			
 
				-    pslld      xmm7, 11
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into RRGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm2           // RR
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
			
 
				-
			
 
				-    // Step 3b: RRGB -> RGB565
			
 
				-    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
			
 
				-    movdqa     xmm2, xmm0    // G
			
 
				-    pslld      xmm0, 8       // R
			
 
				-    psrld      xmm3, 3       // B
			
 
				-    psrld      xmm2, 5       // G
			
 
				-    psrad      xmm0, 16      // R
			
 
				-    pand       xmm3, xmm5    // B
			
 
				-    pand       xmm2, xmm6    // G
			
 
				-    pand       xmm0, xmm7    // R
			
 
				-    por        xmm3, xmm2    // BG
			
 
				-    por        xmm0, xmm3    // BGR
			
 
				-    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
			
 
				-    movdqa     xmm2, xmm1    // G
			
 
				-    pslld      xmm1, 8       // R
			
 
				-    psrld      xmm3, 3       // B
			
 
				-    psrld      xmm2, 5       // G
			
 
				-    psrad      xmm1, 16      // R
			
 
				-    pand       xmm3, xmm5    // B
			
 
				-    pand       xmm2, xmm6    // G
			
 
				-    pand       xmm1, xmm7    // R
			
 
				-    por        xmm3, xmm2    // BG
			
 
				-    por        xmm1, xmm3    // BGR
			
 
				-    packssdw   xmm0, xmm1
			
 
				-    sub        ecx, 8
			
 
				-    movdqu     [edx], xmm0   // store 8 pixels of RGB565
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // argb
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-// Similar to I420 but duplicate UV once more.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I411ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       ebx
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 12 + 4]   // Y
			
 
				-    mov        esi, [esp + 12 + 8]   // U
			
 
				-    mov        edi, [esp + 12 + 12]  // V
			
 
				-    mov        edx, [esp + 12 + 16]  // argb
			
 
				-    mov        ecx, [esp + 12 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV411  // modifies EBX
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    pop        ebx
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* uv_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // Y
			
 
				-    mov        esi, [esp + 4 + 8]   // UV
			
 
				-    mov        edx, [esp + 4 + 12]  // argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READNV12
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* uv_buf,
			
 
				-                         uint8* dst_argb,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // Y
			
 
				-    mov        esi, [esp + 4 + 8]   // VU
			
 
				-    mov        edx, [esp + 4 + 12]  // argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READNV12
			
 
				-    YVUTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, unaligned.
			
 
				-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // argb
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV444
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, unaligned.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // argb
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, unaligned.
			
 
				-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-// Similar to I420 but duplicate UV once more.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       ebx
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 12 + 4]   // Y
			
 
				-    mov        esi, [esp + 12 + 8]   // U
			
 
				-    mov        edi, [esp + 12 + 12]  // V
			
 
				-    mov        edx, [esp + 12 + 16]  // argb
			
 
				-    mov        ecx, [esp + 12 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV411  // modifies EBX
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    pop        ebx
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* uv_buf,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // Y
			
 
				-    mov        esi, [esp + 4 + 8]   // UV
			
 
				-    mov        edx, [esp + 4 + 12]  // argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READNV12
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8 pixels, dest aligned 16.
			
 
				-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* uv_buf,
			
 
				-                                   uint8* dst_argb,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // Y
			
 
				-    mov        esi, [esp + 4 + 8]   // VU
			
 
				-    mov        edx, [esp + 4 + 12]  // argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READNV12
			
 
				-    YVUTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm1           // BG
			
 
				-    punpcklbw  xmm2, xmm5           // RA
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToBGRARow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_bgra,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // bgra
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into BGRA
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    punpcklbw  xmm1, xmm0           // GB
			
 
				-    punpcklbw  xmm5, xmm2           // AR
			
 
				-    movdqa     xmm0, xmm5
			
 
				-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
			
 
				-    movdqa     [edx], xmm5
			
 
				-    movdqa     [edx + 16], xmm0
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_bgra,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // bgra
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into BGRA
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    punpcklbw  xmm1, xmm0           // GB
			
 
				-    punpcklbw  xmm5, xmm2           // AR
			
 
				-    movdqa     xmm0, xmm5
			
 
				-    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
			
 
				-    movdqu     [edx], xmm5
			
 
				-    movdqu     [edx + 16], xmm0
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToABGRRow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_abgr,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // abgr
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm2, xmm1           // RG
			
 
				-    punpcklbw  xmm0, xmm5           // BA
			
 
				-    movdqa     xmm1, xmm2
			
 
				-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
			
 
				-    movdqa     [edx], xmm2
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_abgr,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // abgr
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into ARGB
			
 
				-    punpcklbw  xmm2, xmm1           // RG
			
 
				-    punpcklbw  xmm0, xmm5           // BA
			
 
				-    movdqa     xmm1, xmm2
			
 
				-    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
			
 
				-    movdqu     [edx], xmm2
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToRGBARow_SSSE3(const uint8* y_buf,
			
 
				-                         const uint8* u_buf,
			
 
				-                         const uint8* v_buf,
			
 
				-                         uint8* dst_rgba,
			
 
				-                         int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // rgba
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into RGBA
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    punpcklbw  xmm1, xmm2           // GR
			
 
				-    punpcklbw  xmm5, xmm0           // AB
			
 
				-    movdqa     xmm0, xmm5
			
 
				-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
			
 
				-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
			
 
				-    movdqa     [edx], xmm5
			
 
				-    movdqa     [edx + 16], xmm0
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
			
 
				-                                   const uint8* u_buf,
			
 
				-                                   const uint8* v_buf,
			
 
				-                                   uint8* dst_rgba,
			
 
				-                                   int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // Y
			
 
				-    mov        esi, [esp + 8 + 8]   // U
			
 
				-    mov        edi, [esp + 8 + 12]  // V
			
 
				-    mov        edx, [esp + 8 + 16]  // rgba
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        edi, esi
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    READYUV422
			
 
				-    YUVTORGB
			
 
				-
			
 
				-    // Step 3: Weave into RGBA
			
 
				-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
			
 
				-    punpcklbw  xmm1, xmm2           // GR
			
 
				-    punpcklbw  xmm5, xmm0           // AB
			
 
				-    movdqa     xmm0, xmm5
			
 
				-    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
			
 
				-    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
			
 
				-    movdqu     [edx], xmm5
			
 
				-    movdqu     [edx + 16], xmm0
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#endif  // HAS_I422TOARGBROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_YTOARGBROW_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YToARGBRow_SSE2(const uint8* y_buf,
			
 
				-                     uint8* rgb_buf,
			
 
				-                     int width) {
			
 
				-  __asm {
			
 
				-    pxor       xmm5, xmm5
			
 
				-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
			
 
				-    pslld      xmm4, 24
			
 
				-    mov        eax, 0x00100010
			
 
				-    movd       xmm3, eax
			
 
				-    pshufd     xmm3, xmm3, 0
			
 
				-    mov        eax, 0x004a004a       // 74
			
 
				-    movd       xmm2, eax
			
 
				-    pshufd     xmm2, xmm2,0
			
 
				-    mov        eax, [esp + 4]       // Y
			
 
				-    mov        edx, [esp + 8]       // rgb
			
 
				-    mov        ecx, [esp + 12]      // width
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
			
 
				-    movq       xmm0, qword ptr [eax]
			
 
				-    lea        eax, [eax + 8]
			
 
				-    punpcklbw  xmm0, xmm5           // 0.Y
			
 
				-    psubusw    xmm0, xmm3
			
 
				-    pmullw     xmm0, xmm2
			
 
				-    psrlw      xmm0, 6
			
 
				-    packuswb   xmm0, xmm0           // G
			
 
				-
			
 
				-    // Step 2: Weave into ARGB
			
 
				-    punpcklbw  xmm0, xmm0           // GG
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
			
 
				-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
			
 
				-    por        xmm0, xmm4
			
 
				-    por        xmm1, xmm4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx,  [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_YTOARGBROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_SSSE3
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static const uvec8 kShuffleMirror = {
			
 
				-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
			
 
				-};
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src
			
 
				-    mov       edx, [esp + 8]   // dst
			
 
				-    mov       ecx, [esp + 12]  // width
			
 
				-    movdqa    xmm5, kShuffleMirror
			
 
				-    lea       eax, [eax - 16]
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax + ecx]
			
 
				-    pshufb    xmm0, xmm5
			
 
				-    sub       ecx, 16
			
 
				-    movdqa    [edx], xmm0
			
 
				-    lea       edx, [edx + 16]
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_AVX2
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static const ulvec8 kShuffleMirror_AVX2 = {
			
 
				-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
			
 
				-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
			
 
				-};
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src
			
 
				-    mov       edx, [esp + 8]   // dst
			
 
				-    mov       ecx, [esp + 12]  // width
			
 
				-    vmovdqa   ymm5, kShuffleMirror_AVX2
			
 
				-    lea       eax, [eax - 32]
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu   ymm0, [eax + ecx]
			
 
				-    vpshufb   ymm0, ymm0, ymm5
			
 
				-    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
			
 
				-    sub       ecx, 32
			
 
				-    vmovdqu   [edx], ymm0
			
 
				-    lea       edx, [edx + 32]
			
 
				-    jg        convertloop
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_SSE2
			
 
				-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
			
 
				-// version can not.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src
			
 
				-    mov       edx, [esp + 8]   // dst
			
 
				-    mov       ecx, [esp + 12]  // width
			
 
				-    lea       eax, [eax - 16]
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu    xmm0, [eax + ecx]
			
 
				-    movdqa    xmm1, xmm0        // swap bytes
			
 
				-    psllw     xmm0, 8
			
 
				-    psrlw     xmm1, 8
			
 
				-    por       xmm0, xmm1
			
 
				-    pshuflw   xmm0, xmm0, 0x1b  // swap words
			
 
				-    pshufhw   xmm0, xmm0, 0x1b
			
 
				-    pshufd    xmm0, xmm0, 0x4e  // swap qwords
			
 
				-    sub       ecx, 16
			
 
				-    movdqu    [edx], xmm0
			
 
				-    lea       edx, [edx + 16]
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MIRRORROW_UV_SSSE3
			
 
				-// Shuffle table for reversing the bytes of UV channels.
			
 
				-static const uvec8 kShuffleMirrorUV = {
			
 
				-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
			
 
				-};
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
			
 
				-                       int width) {
			
 
				-  __asm {
			
 
				-    push      edi
			
 
				-    mov       eax, [esp + 4 + 4]   // src
			
 
				-    mov       edx, [esp + 4 + 8]   // dst_u
			
 
				-    mov       edi, [esp + 4 + 12]  // dst_v
			
 
				-    mov       ecx, [esp + 4 + 16]  // width
			
 
				-    movdqa    xmm1, kShuffleMirrorUV
			
 
				-    lea       eax, [eax + ecx * 2 - 16]
			
 
				-    sub       edi, edx
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax]
			
 
				-    lea       eax, [eax - 16]
			
 
				-    pshufb    xmm0, xmm1
			
 
				-    sub       ecx, 8
			
 
				-    movlpd    qword ptr [edx], xmm0
			
 
				-    movhpd    qword ptr [edx + edi], xmm0
			
 
				-    lea       edx, [edx + 8]
			
 
				-    jg        convertloop
			
 
				-
			
 
				-    pop       edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_MIRRORROW_UV_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBMIRRORROW_SSSE3
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static const uvec8 kARGBShuffleMirror = {
			
 
				-  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
			
 
				-};
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src
			
 
				-    mov       edx, [esp + 8]   // dst
			
 
				-    mov       ecx, [esp + 12]  // width
			
 
				-    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
			
 
				-    movdqa    xmm5, kARGBShuffleMirror
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa    xmm0, [eax]
			
 
				-    lea       eax, [eax - 16]
			
 
				-    pshufb    xmm0, xmm5
			
 
				-    sub       ecx, 4
			
 
				-    movdqa    [edx], xmm0
			
 
				-    lea       edx, [edx + 16]
			
 
				-    jg        convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBMIRRORROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBMIRRORROW_AVX2
			
 
				-// Shuffle table for reversing the bytes.
			
 
				-static const ulvec32 kARGBShuffleMirror_AVX2 = {
			
 
				-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
			
 
				-};
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov       eax, [esp + 4]   // src
			
 
				-    mov       edx, [esp + 8]   // dst
			
 
				-    mov       ecx, [esp + 12]  // width
			
 
				-    lea       eax, [eax - 32]
			
 
				-    vmovdqa   ymm5, kARGBShuffleMirror_AVX2
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
			
 
				-    sub       ecx, 8
			
 
				-    vmovdqu   [edx], ymm0
			
 
				-    lea       edx, [edx + 32]
			
 
				-    jg        convertloop
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBMIRRORROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_SPLITUVROW_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_uv
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    movdqa     xmm2, xmm0
			
 
				-    movdqa     xmm3, xmm1
			
 
				-    pand       xmm0, xmm5   // even bytes
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    psrlw      xmm2, 8      // odd bytes
			
 
				-    psrlw      xmm3, 8
			
 
				-    packuswb   xmm2, xmm3
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + edi], xmm2
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
			
 
				-                               int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_uv
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    movdqa     xmm2, xmm0
			
 
				-    movdqa     xmm3, xmm1
			
 
				-    pand       xmm0, xmm5   // even bytes
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    psrlw      xmm2, 8      // odd bytes
			
 
				-    psrlw      xmm3, 8
			
 
				-    packuswb   xmm2, xmm3
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + edi], xmm2
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SPLITUVROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SPLITUVROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_uv
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpsrlw     ymm2, ymm0, 8      // odd bytes
			
 
				-    vpsrlw     ymm3, ymm1, 8
			
 
				-    vpand      ymm0, ymm0, ymm5   // even bytes
			
 
				-    vpand      ymm1, ymm1, ymm5
			
 
				-    vpackuswb  ymm0, ymm0, ymm1
			
 
				-    vpackuswb  ymm2, ymm2, ymm3
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpermq     ymm2, ymm2, 0xd8
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    vmovdqu    [edx + edi], ymm2
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SPLITUVROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_MERGEUVROW_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_u
			
 
				-    mov        edx, [esp + 4 + 8]    // src_v
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_uv
			
 
				-    mov        ecx, [esp + 4 + 16]   // width
			
 
				-    sub        edx, eax
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]      // read 16 U's
			
 
				-    movdqa     xmm1, [eax + edx]  // and 16 V's
			
 
				-    lea        eax,  [eax + 16]
			
 
				-    movdqa     xmm2, xmm0
			
 
				-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
			
 
				-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
			
 
				-    movdqa     [edi], xmm0
			
 
				-    movdqa     [edi + 16], xmm2
			
 
				-    lea        edi, [edi + 32]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
			
 
				-                               uint8* dst_uv, int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_u
			
 
				-    mov        edx, [esp + 4 + 8]    // src_v
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_uv
			
 
				-    mov        ecx, [esp + 4 + 16]   // width
			
 
				-    sub        edx, eax
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]      // read 16 U's
			
 
				-    movdqu     xmm1, [eax + edx]  // and 16 V's
			
 
				-    lea        eax,  [eax + 16]
			
 
				-    movdqa     xmm2, xmm0
			
 
				-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
			
 
				-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
			
 
				-    movdqu     [edi], xmm0
			
 
				-    movdqu     [edi + 16], xmm2
			
 
				-    lea        edi, [edi + 32]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  //  HAS_MERGEUVROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_MERGEUVROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-                     int width) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_u
			
 
				-    mov        edx, [esp + 4 + 8]    // src_v
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_uv
			
 
				-    mov        ecx, [esp + 4 + 16]   // width
			
 
				-    sub        edx, eax
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]           // read 32 U's
			
 
				-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
			
 
				-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
			
 
				-    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
			
 
				-    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
			
 
				-    vmovdqu    [edi], ymm1
			
 
				-    vmovdqu    [edi + 32], ymm2
			
 
				-    lea        edi, [edi + 64]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  //  HAS_MERGEUVROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_COPYROW_SSE2
			
 
				-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src
			
 
				-    mov        edx, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_SSE2
			
 
				-
			
 
				-// Unaligned Multiple of 1.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, esi
			
 
				-    mov        edx, edi
			
 
				-    mov        esi, [esp + 4]   // src
			
 
				-    mov        edi, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    rep movsb
			
 
				-    mov        edi, edx
			
 
				-    mov        esi, eax
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_COPYROW_X86
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, esi
			
 
				-    mov        edx, edi
			
 
				-    mov        esi, [esp + 4]   // src
			
 
				-    mov        edi, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    shr        ecx, 2
			
 
				-    rep movsd
			
 
				-    mov        edi, edx
			
 
				-    mov        esi, eax
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_COPYROW_X86
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYALPHAROW_SSE2
			
 
				-// width in pixels
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src
			
 
				-    mov        edx, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
			
 
				-    pslld      xmm0, 24
			
 
				-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
			
 
				-    psrld      xmm1, 8
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm2, [eax]
			
 
				-    movdqa     xmm3, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    movdqa     xmm4, [edx]
			
 
				-    movdqa     xmm5, [edx + 16]
			
 
				-    pand       xmm2, xmm0
			
 
				-    pand       xmm3, xmm0
			
 
				-    pand       xmm4, xmm1
			
 
				-    pand       xmm5, xmm1
			
 
				-    por        xmm2, xmm4
			
 
				-    por        xmm3, xmm5
			
 
				-    movdqa     [edx], xmm2
			
 
				-    movdqa     [edx + 16], xmm3
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYALPHAROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYALPHAROW_AVX2
			
 
				-// width in pixels
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src
			
 
				-    mov        edx, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    vpcmpeqb   ymm0, ymm0, ymm0
			
 
				-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm1, [eax]
			
 
				-    vmovdqu    ymm2, [eax + 32]
			
 
				-    lea        eax, [eax + 64]
			
 
				-    vpblendvb  ymm1, ymm1, [edx], ymm0
			
 
				-    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
			
 
				-    vmovdqu    [edx], ymm1
			
 
				-    vmovdqu    [edx + 32], ymm2
			
 
				-    lea        edx, [edx + 64]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYALPHAROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
			
 
				-// width in pixels
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src
			
 
				-    mov        edx, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
			
 
				-    pslld      xmm0, 24
			
 
				-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
			
 
				-    psrld      xmm1, 8
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movq       xmm2, qword ptr [eax]  // 8 Y's
			
 
				-    lea        eax, [eax + 8]
			
 
				-    punpcklbw  xmm2, xmm2
			
 
				-    punpckhwd  xmm3, xmm2
			
 
				-    punpcklwd  xmm2, xmm2
			
 
				-    movdqa     xmm4, [edx]
			
 
				-    movdqa     xmm5, [edx + 16]
			
 
				-    pand       xmm2, xmm0
			
 
				-    pand       xmm3, xmm0
			
 
				-    pand       xmm4, xmm1
			
 
				-    pand       xmm5, xmm1
			
 
				-    por        xmm2, xmm4
			
 
				-    por        xmm3, xmm5
			
 
				-    movdqa     [edx], xmm2
			
 
				-    movdqa     [edx + 16], xmm3
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
			
 
				-// width in pixels
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src
			
 
				-    mov        edx, [esp + 8]   // dst
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    vpcmpeqb   ymm0, ymm0, ymm0
			
 
				-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vpmovzxbd  ymm1, qword ptr [eax]
			
 
				-    vpmovzxbd  ymm2, qword ptr [eax + 8]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    vpslld     ymm1, ymm1, 24
			
 
				-    vpslld     ymm2, ymm2, 24
			
 
				-    vpblendvb  ymm1, ymm1, [edx], ymm0
			
 
				-    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
			
 
				-    vmovdqu    [edx], ymm1
			
 
				-    vmovdqu    [edx + 32], ymm2
			
 
				-    lea        edx, [edx + 64]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_SETROW_X86
			
 
				-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SetRow_X86(uint8* dst, uint32 v32, int count) {
			
 
				-  __asm {
			
 
				-    mov        edx, edi
			
 
				-    mov        edi, [esp + 4]   // dst
			
 
				-    mov        eax, [esp + 8]   // v32
			
 
				-    mov        ecx, [esp + 12]  // count
			
 
				-    shr        ecx, 2
			
 
				-    rep stosd
			
 
				-    mov        edi, edx
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// SetRow32 writes 'count' words using a 32 bit value repeated.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
			
 
				-                   int dst_stride, int height) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    push       ebp
			
 
				-    mov        edi, [esp + 12 + 4]   // dst
			
 
				-    mov        eax, [esp + 12 + 8]   // v32
			
 
				-    mov        ebp, [esp + 12 + 12]  // width
			
 
				-    mov        edx, [esp + 12 + 16]  // dst_stride
			
 
				-    mov        esi, [esp + 12 + 20]  // height
			
 
				-    lea        ecx, [ebp * 4]
			
 
				-    sub        edx, ecx             // stride - width * 4
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    mov        ecx, ebp
			
 
				-    rep stosd
			
 
				-    add        edi, edx
			
 
				-    sub        esi, 1
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        ebp
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SETROW_X86
			
 
				-
			
 
				-#ifdef HAS_YUY2TOYROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToYRow_AVX2(const uint8* src_yuy2,
			
 
				-                     uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
			
 
				-    vpand      ymm1, ymm1, ymm5
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    vpavgb     ymm0, ymm0, [eax + esi]
			
 
				-    vpavgb     ymm1, ymm1, [eax + esi + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
			
 
				-    vpsrlw     ymm1, ymm1, 8
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpand      ymm1, ymm0, ymm5  // U
			
 
				-    vpsrlw     ymm0, ymm0, 8     // V
			
 
				-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
			
 
				-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
			
 
				-    vpermq     ymm1, ymm1, 0xd8
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vextractf128 [edx], ymm1, 0  // U
			
 
				-    vextractf128 [edx + edi], ymm0, 0 // V
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
			
 
				-    vpsrlw     ymm1, ymm1, 8
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpand      ymm1, ymm0, ymm5  // U
			
 
				-    vpsrlw     ymm0, ymm0, 8     // V
			
 
				-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
			
 
				-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
			
 
				-    vpermq     ymm1, ymm1, 0xd8
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vextractf128 [edx], ymm1, 0  // U
			
 
				-    vextractf128 [edx + edi], ymm0, 0 // V
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToYRow_AVX2(const uint8* src_uyvy,
			
 
				-                     uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_uyvy
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
			
 
				-    vpsrlw     ymm1, ymm1, 8
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-    vzeroupper
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    vpavgb     ymm0, ymm0, [eax + esi]
			
 
				-    vpavgb     ymm1, ymm1, [eax + esi + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
			
 
				-    vpand      ymm1, ymm1, ymm5
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpand      ymm1, ymm0, ymm5  // U
			
 
				-    vpsrlw     ymm0, ymm0, 8     // V
			
 
				-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
			
 
				-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
			
 
				-    vpermq     ymm1, ymm1, 0xd8
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vextractf128 [edx], ymm1, 0  // U
			
 
				-    vextractf128 [edx + edi], ymm0, 0 // V
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
			
 
				-    vpsrlw     ymm5, ymm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax,  [eax + 64]
			
 
				-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
			
 
				-    vpand      ymm1, ymm1, ymm5
			
 
				-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vpand      ymm1, ymm0, ymm5  // U
			
 
				-    vpsrlw     ymm0, ymm0, 8     // V
			
 
				-    vpackuswb  ymm1, ymm1, ymm1  // mutates.
			
 
				-    vpackuswb  ymm0, ymm0, ymm0  // mutates.
			
 
				-    vpermq     ymm1, ymm1, 0xd8
			
 
				-    vpermq     ymm0, ymm0, 0xd8
			
 
				-    vextractf128 [edx], ymm1, 0  // U
			
 
				-    vextractf128 [edx + edi], ymm0, 0 // V
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 32
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_YUY2TOYROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_YUY2TOYROW_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
			
 
				-                     uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pand       xmm0, xmm5   // even bytes are Y
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + esi]
			
 
				-    movdqa     xmm3, [eax + esi + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pavgb      xmm0, xmm2
			
 
				-    pavgb      xmm1, xmm3
			
 
				-    psrlw      xmm0, 8      // YUYV -> UVUV
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    psrlw      xmm0, 8      // YUYV -> UVUV
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                               uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pand       xmm0, xmm5   // even bytes are Y
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + esi]
			
 
				-    movdqu     xmm3, [eax + esi + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pavgb      xmm0, xmm2
			
 
				-    pavgb      xmm1, xmm3
			
 
				-    psrlw      xmm0, 8      // YUYV -> UVUV
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    psrlw      xmm0, 8      // YUYV -> UVUV
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToYRow_SSE2(const uint8* src_uyvy,
			
 
				-                     uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_uyvy
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    psrlw      xmm0, 8    // odd bytes are Y
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                      uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + esi]
			
 
				-    movdqa     xmm3, [eax + esi + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pavgb      xmm0, xmm2
			
 
				-    pavgb      xmm1, xmm3
			
 
				-    pand       xmm0, xmm5   // UYVY -> UVUV
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
			
 
				-                         uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pand       xmm0, xmm5   // UYVY -> UVUV
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                               uint8* dst_y, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_uyvy
			
 
				-    mov        edx, [esp + 8]    // dst_y
			
 
				-    mov        ecx, [esp + 12]   // pix
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    psrlw      xmm0, 8    // odd bytes are Y
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
			
 
				-                                uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_yuy2
			
 
				-    mov        esi, [esp + 8 + 8]    // stride_yuy2
			
 
				-    mov        edx, [esp + 8 + 12]   // dst_u
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_v
			
 
				-    mov        ecx, [esp + 8 + 20]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    movdqu     xmm2, [eax + esi]
			
 
				-    movdqu     xmm3, [eax + esi + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pavgb      xmm0, xmm2
			
 
				-    pavgb      xmm1, xmm3
			
 
				-    pand       xmm0, xmm5   // UYVY -> UVUV
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
			
 
				-                                   uint8* dst_u, uint8* dst_v, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_yuy2
			
 
				-    mov        edx, [esp + 4 + 8]    // dst_u
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm5, 8
			
 
				-    sub        edi, edx
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    pand       xmm0, xmm5   // UYVY -> UVUV
			
 
				-    pand       xmm1, xmm5
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    pand       xmm0, xmm5  // U
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    psrlw      xmm1, 8     // V
			
 
				-    packuswb   xmm1, xmm1
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    movq       qword ptr [edx + edi], xmm1
			
 
				-    lea        edx, [edx + 8]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_YUY2TOYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBBLENDROW_SSE2
			
 
				-// Blend 8 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                       uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm7, xmm7       // generate constant 1
			
 
				-    psrlw      xmm7, 15
			
 
				-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm6, 8
			
 
				-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
			
 
				-    psllw      xmm5, 8
			
 
				-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
			
 
				-    pslld      xmm4, 24
			
 
				-
			
 
				-    sub        ecx, 1
			
 
				-    je         convertloop1     // only 1 pixel?
			
 
				-    jl         convertloop1b
			
 
				-
			
 
				-    // 1 pixel loop until destination pointer is aligned.
			
 
				-  alignloop1:
			
 
				-    test       edx, 15          // aligned?
			
 
				-    je         alignloop1b
			
 
				-    movd       xmm3, [eax]
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movd       xmm2, [esi]      // _r_b
			
 
				-    psrlw      xmm3, 8          // alpha
			
 
				-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
			
 
				-    pshuflw    xmm3, xmm3, 0F5h
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movd       xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 4]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        alignloop1
			
 
				-
			
 
				-  alignloop1b:
			
 
				-    add        ecx, 1 - 4
			
 
				-    jl         convertloop4b
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-  convertloop4:
			
 
				-    movdqu     xmm3, [eax]      // src argb
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movdqu     xmm2, [esi]      // _r_b
			
 
				-    psrlw      xmm3, 8          // alpha
			
 
				-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
			
 
				-    pshuflw    xmm3, xmm3, 0F5h
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movdqu     xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 16]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jge        convertloop4
			
 
				-
			
 
				-  convertloop4b:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         convertloop1b
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-  convertloop1:
			
 
				-    movd       xmm3, [eax]      // src argb
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movd       xmm2, [esi]      // _r_b
			
 
				-    psrlw      xmm3, 8          // alpha
			
 
				-    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
			
 
				-    pshuflw    xmm3, xmm3, 0F5h
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movd       xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 4]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        convertloop1
			
 
				-
			
 
				-  convertloop1b:
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBBLENDROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBBLENDROW_SSSE3
			
 
				-// Shuffle table for isolating alpha.
			
 
				-static const uvec8 kShuffleAlpha = {
			
 
				-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
			
 
				-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
			
 
				-};
			
 
				-// Same as SSE2, but replaces:
			
 
				-//    psrlw      xmm3, 8          // alpha
			
 
				-//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
			
 
				-//    pshuflw    xmm3, xmm3, 0F5h
			
 
				-// with..
			
 
				-//    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-// Blend 8 pixels at a time.
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                        uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
			
 
				-    psrlw      xmm7, 15
			
 
				-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
			
 
				-    psrlw      xmm6, 8
			
 
				-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
			
 
				-    psllw      xmm5, 8
			
 
				-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
			
 
				-    pslld      xmm4, 24
			
 
				-
			
 
				-    sub        ecx, 1
			
 
				-    je         convertloop1     // only 1 pixel?
			
 
				-    jl         convertloop1b
			
 
				-
			
 
				-    // 1 pixel loop until destination pointer is aligned.
			
 
				-  alignloop1:
			
 
				-    test       edx, 15          // aligned?
			
 
				-    je         alignloop1b
			
 
				-    movd       xmm3, [eax]
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movd       xmm2, [esi]      // _r_b
			
 
				-    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movd       xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 4]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        alignloop1
			
 
				-
			
 
				-  alignloop1b:
			
 
				-    add        ecx, 1 - 4
			
 
				-    jl         convertloop4b
			
 
				-
			
 
				-    test       eax, 15          // unaligned?
			
 
				-    jne        convertuloop4
			
 
				-    test       esi, 15          // unaligned?
			
 
				-    jne        convertuloop4
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-  convertloop4:
			
 
				-    movdqa     xmm3, [eax]      // src argb
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movdqa     xmm2, [esi]      // _r_b
			
 
				-    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movdqa     xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 16]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jge        convertloop4
			
 
				-    jmp        convertloop4b
			
 
				-
			
 
				-    // 4 pixel unaligned loop.
			
 
				-  convertuloop4:
			
 
				-    movdqu     xmm3, [eax]      // src argb
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movdqu     xmm2, [esi]      // _r_b
			
 
				-    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movdqu     xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 16]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jge        convertuloop4
			
 
				-
			
 
				-  convertloop4b:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         convertloop1b
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-  convertloop1:
			
 
				-    movd       xmm3, [eax]      // src argb
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movdqa     xmm0, xmm3       // src argb
			
 
				-    pxor       xmm3, xmm4       // ~alpha
			
 
				-    movd       xmm2, [esi]      // _r_b
			
 
				-    pshufb     xmm3, kShuffleAlpha // alpha
			
 
				-    pand       xmm2, xmm6       // _r_b
			
 
				-    paddw      xmm3, xmm7       // 256 - alpha
			
 
				-    pmullw     xmm2, xmm3       // _r_b * alpha
			
 
				-    movd       xmm1, [esi]      // _a_g
			
 
				-    lea        esi, [esi + 4]
			
 
				-    psrlw      xmm1, 8          // _a_g
			
 
				-    por        xmm0, xmm4       // set alpha to 255
			
 
				-    pmullw     xmm1, xmm3       // _a_g * alpha
			
 
				-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm2       // + src argb
			
 
				-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
			
 
				-    paddusb    xmm0, xmm1       // + src argb
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        convertloop1
			
 
				-
			
 
				-  convertloop1b:
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBBLENDROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSE2
			
 
				-// Attenuate 4 pixels at a time.
			
 
				-// Aligned to 16 bytes.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
			
 
				-    pslld      xmm4, 24
			
 
				-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
			
 
				-    psrld      xmm5, 8
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]      // read 4 pixels
			
 
				-    punpcklbw  xmm0, xmm0       // first 2
			
 
				-    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
			
 
				-    pshuflw    xmm2, xmm2, 0FFh
			
 
				-    pmulhuw    xmm0, xmm2       // rgb * a
			
 
				-    movdqa     xmm1, [eax]      // read 4 pixels
			
 
				-    punpckhbw  xmm1, xmm1       // next 2 pixels
			
 
				-    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
			
 
				-    pshuflw    xmm2, xmm2, 0FFh
			
 
				-    pmulhuw    xmm1, xmm2       // rgb * a
			
 
				-    movdqa     xmm2, [eax]      // alphas
			
 
				-    lea        eax, [eax + 16]
			
 
				-    psrlw      xmm0, 8
			
 
				-    pand       xmm2, xmm4
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    pand       xmm0, xmm5       // keep original alphas
			
 
				-    por        xmm0, xmm2
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBATTENUATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBATTENUATEROW_SSSE3
			
 
				-// Shuffle table duplicating alpha.
			
 
				-static const uvec8 kShuffleAlpha0 = {
			
 
				-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
			
 
				-};
			
 
				-static const uvec8 kShuffleAlpha1 = {
			
 
				-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
			
 
				-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
			
 
				-};
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
			
 
				-    pslld      xmm3, 24
			
 
				-    movdqa     xmm4, kShuffleAlpha0
			
 
				-    movdqa     xmm5, kShuffleAlpha1
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]      // read 4 pixels
			
 
				-    pshufb     xmm0, xmm4       // isolate first 2 alphas
			
 
				-    movdqu     xmm1, [eax]      // read 4 pixels
			
 
				-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
			
 
				-    pmulhuw    xmm0, xmm1       // rgb * a
			
 
				-    movdqu     xmm1, [eax]      // read 4 pixels
			
 
				-    pshufb     xmm1, xmm5       // isolate next 2 alphas
			
 
				-    movdqu     xmm2, [eax]      // read 4 pixels
			
 
				-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
			
 
				-    pmulhuw    xmm1, xmm2       // rgb * a
			
 
				-    movdqu     xmm2, [eax]      // mask original alpha
			
 
				-    lea        eax, [eax + 16]
			
 
				-    pand       xmm2, xmm3
			
 
				-    psrlw      xmm0, 8
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    por        xmm0, xmm2       // copy original alpha
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBATTENUATEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBATTENUATEROW_AVX2
			
 
				-// Shuffle table duplicating alpha.
			
 
				-static const ulvec8 kShuffleAlpha_AVX2 = {
			
 
				-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
			
 
				-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
			
 
				-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
			
 
				-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
			
 
				-};
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    sub        edx, eax
			
 
				-    vmovdqa    ymm4, kShuffleAlpha_AVX2
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
			
 
				-    vpslld     ymm5, ymm5, 24
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm6, [eax]       // read 8 pixels.
			
 
				-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
			
 
				-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
			
 
				-    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
			
 
				-    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
			
 
				-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
			
 
				-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
			
 
				-    vpand      ymm6, ymm6, ymm5  // isolate alpha
			
 
				-    vpsrlw     ymm0, ymm0, 8
			
 
				-    vpsrlw     ymm1, ymm1, 8
			
 
				-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
			
 
				-    vpor       ymm0, ymm0, ymm6  // copy original alpha
			
 
				-    sub        ecx, 8
			
 
				-    vmovdqu    [eax + edx], ymm0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBATTENUATEROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-// Unattenuate 4 pixels at a time.
			
 
				-// Aligned to 16 bytes.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8 + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 8 + 12]  // width
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]      // read 4 pixels
			
 
				-    movzx      esi, byte ptr [eax + 3]  // first alpha
			
 
				-    movzx      edi, byte ptr [eax + 7]  // second alpha
			
 
				-    punpcklbw  xmm0, xmm0       // first 2
			
 
				-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
			
 
				-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
			
 
				-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
			
 
				-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
			
 
				-    movlhps    xmm2, xmm3
			
 
				-    pmulhuw    xmm0, xmm2       // rgb * a
			
 
				-
			
 
				-    movdqu     xmm1, [eax]      // read 4 pixels
			
 
				-    movzx      esi, byte ptr [eax + 11]  // third alpha
			
 
				-    movzx      edi, byte ptr [eax + 15]  // forth alpha
			
 
				-    punpckhbw  xmm1, xmm1       // next 2
			
 
				-    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
			
 
				-    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
			
 
				-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
			
 
				-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
			
 
				-    movlhps    xmm2, xmm3
			
 
				-    pmulhuw    xmm1, xmm2       // rgb * a
			
 
				-    lea        eax, [eax + 16]
			
 
				-
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBUNATTENUATEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBUNATTENUATEROW_AVX2
			
 
				-// Shuffle table duplicating alpha.
			
 
				-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
			
 
				-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
			
 
				-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
			
 
				-};
			
 
				-// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
			
 
				-// USE_GATHER is not on by default, due to being a slow instruction.
			
 
				-#ifdef USE_GATHER
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    sub        edx, eax
			
 
				-    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm6, [eax]       // read 8 pixels.
			
 
				-    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
			
 
				-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
			
 
				-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
			
 
				-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
			
 
				-    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
			
 
				-    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
			
 
				-    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
			
 
				-    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
			
 
				-    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
			
 
				-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
			
 
				-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
			
 
				-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
			
 
				-    sub        ecx, 8
			
 
				-    vmovdqu    [eax + edx], ymm0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#else  // USE_GATHER
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                             int width) {
			
 
				-  __asm {
			
 
				-
			
 
				-    mov        eax, [esp + 4]   // src_argb0
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    sub        edx, eax
			
 
				-    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
			
 
				-
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    // replace VPGATHER
			
 
				-    movzx      esi, byte ptr [eax + 3]                 // alpha0
			
 
				-    movzx      edi, byte ptr [eax + 7]                 // alpha1
			
 
				-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
			
 
				-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
			
 
				-    movzx      esi, byte ptr [eax + 11]                // alpha2
			
 
				-    movzx      edi, byte ptr [eax + 15]                // alpha3
			
 
				-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
			
 
				-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
			
 
				-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
			
 
				-    movzx      esi, byte ptr [eax + 19]                // alpha4
			
 
				-    movzx      edi, byte ptr [eax + 23]                // alpha5
			
 
				-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
			
 
				-    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
			
 
				-    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
			
 
				-    movzx      esi, byte ptr [eax + 27]                // alpha6
			
 
				-    movzx      edi, byte ptr [eax + 31]                // alpha7
			
 
				-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
			
 
				-    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
			
 
				-    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
			
 
				-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
			
 
				-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
			
 
				-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
			
 
				-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
			
 
				-    // end of VPGATHER
			
 
				-
			
 
				-    vmovdqu    ymm6, [eax]       // read 8 pixels.
			
 
				-    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
			
 
				-    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
			
 
				-    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
			
 
				-    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
			
 
				-    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
			
 
				-    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
			
 
				-    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
			
 
				-    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
			
 
				-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
			
 
				-    sub        ecx, 8
			
 
				-    vmovdqu    [eax + edx], ymm0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // USE_GATHER
			
 
				-#endif  // HAS_ARGBATTENUATEROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBGRAYROW_SSSE3
			
 
				-// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_argb */
			
 
				-    mov        ecx, [esp + 12]  /* width */
			
 
				-    movdqa     xmm4, kARGBToYJ
			
 
				-    movdqa     xmm5, kAddYJ64
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]  // G
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    pmaddubsw  xmm0, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    phaddw     xmm0, xmm1
			
 
				-    paddw      xmm0, xmm5  // Add .5 for rounding.
			
 
				-    psrlw      xmm0, 7
			
 
				-    packuswb   xmm0, xmm0   // 8 G bytes
			
 
				-    movdqa     xmm2, [eax]  // A
			
 
				-    movdqa     xmm3, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    psrld      xmm2, 24
			
 
				-    psrld      xmm3, 24
			
 
				-    packuswb   xmm2, xmm3
			
 
				-    packuswb   xmm2, xmm2   // 8 A bytes
			
 
				-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
			
 
				-    punpcklbw  xmm0, xmm0   // 8 GG words
			
 
				-    punpcklbw  xmm3, xmm2   // 8 GA words
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklwd  xmm0, xmm3   // GGGA first 4
			
 
				-    punpckhwd  xmm1, xmm3   // GGGA next 4
			
 
				-    sub        ecx, 8
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBGRAYROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBSEPIAROW_SSSE3
			
 
				-//    b = (r * 35 + g * 68 + b * 17) >> 7
			
 
				-//    g = (r * 45 + g * 88 + b * 22) >> 7
			
 
				-//    r = (r * 50 + g * 98 + b * 24) >> 7
			
 
				-// Constant for ARGB color to sepia tone.
			
 
				-static const vec8 kARGBToSepiaB = {
			
 
				-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToSepiaG = {
			
 
				-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
			
 
				-};
			
 
				-
			
 
				-static const vec8 kARGBToSepiaR = {
			
 
				-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
			
 
				-};
			
 
				-
			
 
				-// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* dst_argb */
			
 
				-    mov        ecx, [esp + 8]   /* width */
			
 
				-    movdqa     xmm2, kARGBToSepiaB
			
 
				-    movdqa     xmm3, kARGBToSepiaG
			
 
				-    movdqa     xmm4, kARGBToSepiaR
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]  // B
			
 
				-    movdqa     xmm6, [eax + 16]
			
 
				-    pmaddubsw  xmm0, xmm2
			
 
				-    pmaddubsw  xmm6, xmm2
			
 
				-    phaddw     xmm0, xmm6
			
 
				-    psrlw      xmm0, 7
			
 
				-    packuswb   xmm0, xmm0   // 8 B values
			
 
				-    movdqa     xmm5, [eax]  // G
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    pmaddubsw  xmm5, xmm3
			
 
				-    pmaddubsw  xmm1, xmm3
			
 
				-    phaddw     xmm5, xmm1
			
 
				-    psrlw      xmm5, 7
			
 
				-    packuswb   xmm5, xmm5   // 8 G values
			
 
				-    punpcklbw  xmm0, xmm5   // 8 BG values
			
 
				-    movdqa     xmm5, [eax]  // R
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    pmaddubsw  xmm5, xmm4
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    phaddw     xmm5, xmm1
			
 
				-    psrlw      xmm5, 7
			
 
				-    packuswb   xmm5, xmm5   // 8 R values
			
 
				-    movdqa     xmm6, [eax]  // A
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    psrld      xmm6, 24
			
 
				-    psrld      xmm1, 24
			
 
				-    packuswb   xmm6, xmm1
			
 
				-    packuswb   xmm6, xmm6   // 8 A values
			
 
				-    punpcklbw  xmm5, xmm6   // 8 RA values
			
 
				-    movdqa     xmm1, xmm0   // Weave BG, RA together
			
 
				-    punpcklwd  xmm0, xmm5   // BGRA first 4
			
 
				-    punpckhwd  xmm1, xmm5   // BGRA next 4
			
 
				-    sub        ecx, 8
			
 
				-    movdqa     [eax], xmm0
			
 
				-    movdqa     [eax + 16], xmm1
			
 
				-    lea        eax, [eax + 32]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBSEPIAROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
			
 
				-// Tranform 8 ARGB pixels (32 bytes) with color matrix.
			
 
				-// Same as Sepia except matrix is provided.
			
 
				-// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
			
 
				-// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                              const int8* matrix_argb, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_argb */
			
 
				-    mov        ecx, [esp + 12]  /* matrix_argb */
			
 
				-    movdqu     xmm5, [ecx]
			
 
				-    pshufd     xmm2, xmm5, 0x00
			
 
				-    pshufd     xmm3, xmm5, 0x55
			
 
				-    pshufd     xmm4, xmm5, 0xaa
			
 
				-    pshufd     xmm5, xmm5, 0xff
			
 
				-    mov        ecx, [esp + 16]  /* width */
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]  // B
			
 
				-    movdqa     xmm7, [eax + 16]
			
 
				-    pmaddubsw  xmm0, xmm2
			
 
				-    pmaddubsw  xmm7, xmm2
			
 
				-    movdqa     xmm6, [eax]  // G
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    pmaddubsw  xmm6, xmm3
			
 
				-    pmaddubsw  xmm1, xmm3
			
 
				-    phaddsw    xmm0, xmm7   // B
			
 
				-    phaddsw    xmm6, xmm1   // G
			
 
				-    psraw      xmm0, 6      // B
			
 
				-    psraw      xmm6, 6      // G
			
 
				-    packuswb   xmm0, xmm0   // 8 B values
			
 
				-    packuswb   xmm6, xmm6   // 8 G values
			
 
				-    punpcklbw  xmm0, xmm6   // 8 BG values
			
 
				-    movdqa     xmm1, [eax]  // R
			
 
				-    movdqa     xmm7, [eax + 16]
			
 
				-    pmaddubsw  xmm1, xmm4
			
 
				-    pmaddubsw  xmm7, xmm4
			
 
				-    phaddsw    xmm1, xmm7   // R
			
 
				-    movdqa     xmm6, [eax]  // A
			
 
				-    movdqa     xmm7, [eax + 16]
			
 
				-    pmaddubsw  xmm6, xmm5
			
 
				-    pmaddubsw  xmm7, xmm5
			
 
				-    phaddsw    xmm6, xmm7   // A
			
 
				-    psraw      xmm1, 6      // R
			
 
				-    psraw      xmm6, 6      // A
			
 
				-    packuswb   xmm1, xmm1   // 8 R values
			
 
				-    packuswb   xmm6, xmm6   // 8 A values
			
 
				-    punpcklbw  xmm1, xmm6   // 8 RA values
			
 
				-    movdqa     xmm6, xmm0   // Weave BG, RA together
			
 
				-    punpcklwd  xmm0, xmm1   // BGRA first 4
			
 
				-    punpckhwd  xmm6, xmm1   // BGRA next 4
			
 
				-    sub        ecx, 8
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm6
			
 
				-    lea        eax, [eax + 32]
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_ARGBQUANTIZEROW_SSE2
			
 
				-// Quantize 4 ARGB pixels (16 bytes).
			
 
				-// Aligned to 16 bytes.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
			
 
				-                          int interval_offset, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    /* dst_argb */
			
 
				-    movd       xmm2, [esp + 8]   /* scale */
			
 
				-    movd       xmm3, [esp + 12]  /* interval_size */
			
 
				-    movd       xmm4, [esp + 16]  /* interval_offset */
			
 
				-    mov        ecx, [esp + 20]   /* width */
			
 
				-    pshuflw    xmm2, xmm2, 040h
			
 
				-    pshufd     xmm2, xmm2, 044h
			
 
				-    pshuflw    xmm3, xmm3, 040h
			
 
				-    pshufd     xmm3, xmm3, 044h
			
 
				-    pshuflw    xmm4, xmm4, 040h
			
 
				-    pshufd     xmm4, xmm4, 044h
			
 
				-    pxor       xmm5, xmm5  // constant 0
			
 
				-    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
			
 
				-    pslld      xmm6, 24
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]  // read 4 pixels
			
 
				-    punpcklbw  xmm0, xmm5   // first 2 pixels
			
 
				-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
			
 
				-    movdqa     xmm1, [eax]  // read 4 pixels
			
 
				-    punpckhbw  xmm1, xmm5   // next 2 pixels
			
 
				-    pmulhuw    xmm1, xmm2
			
 
				-    pmullw     xmm0, xmm3   // * interval_size
			
 
				-    movdqa     xmm7, [eax]  // read 4 pixels
			
 
				-    pmullw     xmm1, xmm3
			
 
				-    pand       xmm7, xmm6   // mask alpha
			
 
				-    paddw      xmm0, xmm4   // + interval_size / 2
			
 
				-    paddw      xmm1, xmm4
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    por        xmm0, xmm7
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [eax], xmm0
			
 
				-    lea        eax, [eax + 16]
			
 
				-    jg         convertloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBQUANTIZEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBSHADEROW_SSE2
			
 
				-// Shade 4 pixels at a time by specified value.
			
 
				-// Aligned to 16 bytes.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
			
 
				-                       uint32 value) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   // src_argb
			
 
				-    mov        edx, [esp + 8]   // dst_argb
			
 
				-    mov        ecx, [esp + 12]  // width
			
 
				-    movd       xmm2, [esp + 16]  // value
			
 
				-    punpcklbw  xmm2, xmm2
			
 
				-    punpcklqdq xmm2, xmm2
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]      // read 4 pixels
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm0       // first 2
			
 
				-    punpckhbw  xmm1, xmm1       // next 2
			
 
				-    pmulhuw    xmm0, xmm2       // argb * value
			
 
				-    pmulhuw    xmm1, xmm2       // argb * value
			
 
				-    psrlw      xmm0, 8
			
 
				-    psrlw      xmm1, 8
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHADEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_SSE2
			
 
				-// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    pxor       xmm5, xmm5  // constant 0
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
			
 
				-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
			
 
				-    movdqu     xmm1, xmm0
			
 
				-    movdqu     xmm3, xmm2
			
 
				-    punpcklbw  xmm0, xmm0         // first 2
			
 
				-    punpckhbw  xmm1, xmm1         // next 2
			
 
				-    punpcklbw  xmm2, xmm5         // first 2
			
 
				-    punpckhbw  xmm3, xmm5         // next 2
			
 
				-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
			
 
				-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
			
 
				-    lea        eax, [eax + 16]
			
 
				-    lea        esi, [esi + 16]
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBMULTIPLYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBADDROW_SSE2
			
 
				-// Add 2 rows of ARGB pixels together, 4 pixels at a time.
			
 
				-// TODO(fbarchard): Port this to posix, neon and other math functions.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-
			
 
				-    sub        ecx, 4
			
 
				-    jl         convertloop49
			
 
				-
			
 
				-    align      4
			
 
				- convertloop4:
			
 
				-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
			
 
				-    lea        esi, [esi + 16]
			
 
				-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jge        convertloop4
			
 
				-
			
 
				- convertloop49:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         convertloop19
			
 
				-
			
 
				- convertloop1:
			
 
				-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
			
 
				-    lea        esi, [esi + 4]
			
 
				-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        convertloop1
			
 
				-
			
 
				- convertloop19:
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBADDROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_SSE2
			
 
				-// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
			
 
				-    lea        esi, [esi + 16]
			
 
				-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBSUBTRACTROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBMULTIPLYROW_AVX2
			
 
				-// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    vpxor      ymm5, ymm5, ymm5     // constant 0
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
			
 
				-    lea        esi, [esi + 32]
			
 
				-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
			
 
				-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
			
 
				-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
			
 
				-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
			
 
				-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
			
 
				-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
			
 
				-    vpackuswb  ymm0, ymm0, ymm1
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBMULTIPLYROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBADDROW_AVX2
			
 
				-// Add 2 rows of ARGB pixels together, 8 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
			
 
				-    lea        esi, [esi + 32]
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBADDROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBSUBTRACTROW_AVX2
			
 
				-// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
			
 
				-                          uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_argb0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_argb1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
			
 
				-    lea        eax, [eax + 32]
			
 
				-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
			
 
				-    lea        esi, [esi + 32]
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    lea        edx, [edx + 32]
			
 
				-    sub        ecx, 8
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBSUBTRACTROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_SOBELXROW_SSE2
			
 
				-// SobelX as a matrix is
			
 
				-// -1  0  1
			
 
				-// -2  0  2
			
 
				-// -1  0  1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   // src_y0
			
 
				-    mov        esi, [esp + 8 + 8]   // src_y1
			
 
				-    mov        edi, [esp + 8 + 12]  // src_y2
			
 
				-    mov        edx, [esp + 8 + 16]  // dst_sobelx
			
 
				-    mov        ecx, [esp + 8 + 20]  // width
			
 
				-    sub        esi, eax
			
 
				-    sub        edi, eax
			
 
				-    sub        edx, eax
			
 
				-    pxor       xmm5, xmm5  // constant 0
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
			
 
				-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpcklbw  xmm1, xmm5
			
 
				-    psubw      xmm0, xmm1
			
 
				-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
			
 
				-    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
			
 
				-    punpcklbw  xmm1, xmm5
			
 
				-    punpcklbw  xmm2, xmm5
			
 
				-    psubw      xmm1, xmm2
			
 
				-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
			
 
				-    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
			
 
				-    punpcklbw  xmm2, xmm5
			
 
				-    punpcklbw  xmm3, xmm5
			
 
				-    psubw      xmm2, xmm3
			
 
				-    paddw      xmm0, xmm2
			
 
				-    paddw      xmm0, xmm1
			
 
				-    paddw      xmm0, xmm1
			
 
				-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
			
 
				-    psubw      xmm1, xmm0
			
 
				-    pmaxsw     xmm0, xmm1
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    sub        ecx, 8
			
 
				-    movq       qword ptr [eax + edx], xmm0
			
 
				-    lea        eax, [eax + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SOBELXROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELYROW_SSE2
			
 
				-// SobelY as a matrix is
			
 
				-// -1 -2 -1
			
 
				-//  0  0  0
			
 
				-//  1  2  1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
			
 
				-                    uint8* dst_sobely, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_y0
			
 
				-    mov        esi, [esp + 4 + 8]   // src_y1
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_sobely
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    sub        esi, eax
			
 
				-    sub        edx, eax
			
 
				-    pxor       xmm5, xmm5  // constant 0
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
			
 
				-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpcklbw  xmm1, xmm5
			
 
				-    psubw      xmm0, xmm1
			
 
				-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
			
 
				-    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
			
 
				-    punpcklbw  xmm1, xmm5
			
 
				-    punpcklbw  xmm2, xmm5
			
 
				-    psubw      xmm1, xmm2
			
 
				-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
			
 
				-    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
			
 
				-    punpcklbw  xmm2, xmm5
			
 
				-    punpcklbw  xmm3, xmm5
			
 
				-    psubw      xmm2, xmm3
			
 
				-    paddw      xmm0, xmm2
			
 
				-    paddw      xmm0, xmm1
			
 
				-    paddw      xmm0, xmm1
			
 
				-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
			
 
				-    psubw      xmm1, xmm0
			
 
				-    pmaxsw     xmm0, xmm1
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    sub        ecx, 8
			
 
				-    movq       qword ptr [eax + edx], xmm0
			
 
				-    lea        eax, [eax + 8]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SOBELYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELROW_SSE2
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel
			
 
				-// G = Sobel
			
 
				-// B = Sobel
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                   uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_sobelx
			
 
				-    mov        esi, [esp + 4 + 8]   // src_sobely
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    sub        esi, eax
			
 
				-    pcmpeqb    xmm5, xmm5           // alpha 255
			
 
				-    pslld      xmm5, 24             // 0xff000000
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
			
 
				-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
			
 
				-    lea        eax, [eax + 16]
			
 
				-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
			
 
				-    movdqa     xmm2, xmm0             // GG
			
 
				-    punpcklbw  xmm2, xmm0             // First 8
			
 
				-    punpckhbw  xmm0, xmm0             // Next 8
			
 
				-    movdqa     xmm1, xmm2             // GGGG
			
 
				-    punpcklwd  xmm1, xmm2             // First 4
			
 
				-    punpckhwd  xmm2, xmm2             // Next 4
			
 
				-    por        xmm1, xmm5             // GGGA
			
 
				-    por        xmm2, xmm5
			
 
				-    movdqa     xmm3, xmm0             // GGGG
			
 
				-    punpcklwd  xmm3, xmm0             // Next 4
			
 
				-    punpckhwd  xmm0, xmm0             // Last 4
			
 
				-    por        xmm3, xmm5             // GGGA
			
 
				-    por        xmm0, xmm5
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm1
			
 
				-    movdqa     [edx + 16], xmm2
			
 
				-    movdqa     [edx + 32], xmm3
			
 
				-    movdqa     [edx + 48], xmm0
			
 
				-    lea        edx, [edx + 64]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SOBELROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELTOPLANEROW_SSE2
			
 
				-// Adds Sobel X and Sobel Y and stores Sobel into a plane.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                          uint8* dst_y, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_sobelx
			
 
				-    mov        esi, [esp + 4 + 8]   // src_sobely
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    sub        esi, eax
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
			
 
				-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
			
 
				-    lea        eax, [eax + 16]
			
 
				-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SOBELTOPLANEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_SOBELXYROW_SSE2
			
 
				-// Mixes Sobel X, Sobel Y and Sobel into ARGB.
			
 
				-// A = 255
			
 
				-// R = Sobel X
			
 
				-// G = Sobel
			
 
				-// B = Sobel Y
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
			
 
				-                     uint8* dst_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   // src_sobelx
			
 
				-    mov        esi, [esp + 4 + 8]   // src_sobely
			
 
				-    mov        edx, [esp + 4 + 12]  // dst_argb
			
 
				-    mov        ecx, [esp + 4 + 16]  // width
			
 
				-    sub        esi, eax
			
 
				-    pcmpeqb    xmm5, xmm5           // alpha 255
			
 
				-
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
			
 
				-    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm2, xmm0
			
 
				-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
			
 
				-    movdqa     xmm3, xmm0             // XA
			
 
				-    punpcklbw  xmm3, xmm5
			
 
				-    punpckhbw  xmm0, xmm5
			
 
				-    movdqa     xmm4, xmm1             // YS
			
 
				-    punpcklbw  xmm4, xmm2
			
 
				-    punpckhbw  xmm1, xmm2
			
 
				-    movdqa     xmm6, xmm4             // YSXA
			
 
				-    punpcklwd  xmm6, xmm3             // First 4
			
 
				-    punpckhwd  xmm4, xmm3             // Next 4
			
 
				-    movdqa     xmm7, xmm1             // YSXA
			
 
				-    punpcklwd  xmm7, xmm0             // Next 4
			
 
				-    punpckhwd  xmm1, xmm0             // Last 4
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [edx], xmm6
			
 
				-    movdqa     [edx + 16], xmm4
			
 
				-    movdqa     [edx + 32], xmm7
			
 
				-    movdqa     [edx + 48], xmm1
			
 
				-    lea        edx, [edx + 64]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_SOBELXYROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
			
 
				-// Consider float CumulativeSum.
			
 
				-// Consider calling CumulativeSum one row at time as needed.
			
 
				-// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
			
 
				-// Convert cumulative sum for an area to an average for 1 pixel.
			
 
				-// topleft is pointer to top left of CumulativeSum buffer for area.
			
 
				-// botleft is pointer to bottom left of CumulativeSum buffer.
			
 
				-// width is offset from left to right of area in CumulativeSum buffer measured
			
 
				-//   in number of ints.
			
 
				-// area is the number of pixels in the area being averaged.
			
 
				-// dst points to pixel to store result to.
			
 
				-// count is number of averaged pixels to produce.
			
 
				-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
			
 
				-// aligned.
			
 
				-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
			
 
				-                                    int width, int area, uint8* dst,
			
 
				-                                    int count) {
			
 
				-  __asm {
			
 
				-    mov        eax, topleft  // eax topleft
			
 
				-    mov        esi, botleft  // esi botleft
			
 
				-    mov        edx, width
			
 
				-    movd       xmm5, area
			
 
				-    mov        edi, dst
			
 
				-    mov        ecx, count
			
 
				-    cvtdq2ps   xmm5, xmm5
			
 
				-    rcpss      xmm4, xmm5  // 1.0f / area
			
 
				-    pshufd     xmm4, xmm4, 0
			
 
				-    sub        ecx, 4
			
 
				-    jl         l4b
			
 
				-
			
 
				-    cmp        area, 128  // 128 pixels will not overflow 15 bits.
			
 
				-    ja         l4
			
 
				-
			
 
				-    pshufd     xmm5, xmm5, 0        // area
			
 
				-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
			
 
				-    psrld      xmm6, 16
			
 
				-    cvtdq2ps   xmm6, xmm6
			
 
				-    addps      xmm5, xmm6           // (65536.0 + area - 1)
			
 
				-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
			
 
				-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
			
 
				-    packssdw   xmm5, xmm5           // 16 bit shorts
			
 
				-
			
 
				-    // 4 pixel loop small blocks.
			
 
				-    align      4
			
 
				-  s4:
			
 
				-    // top left
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-
			
 
				-    // - top right
			
 
				-    psubd      xmm0, [eax + edx * 4]
			
 
				-    psubd      xmm1, [eax + edx * 4 + 16]
			
 
				-    psubd      xmm2, [eax + edx * 4 + 32]
			
 
				-    psubd      xmm3, [eax + edx * 4 + 48]
			
 
				-    lea        eax, [eax + 64]
			
 
				-
			
 
				-    // - bottom left
			
 
				-    psubd      xmm0, [esi]
			
 
				-    psubd      xmm1, [esi + 16]
			
 
				-    psubd      xmm2, [esi + 32]
			
 
				-    psubd      xmm3, [esi + 48]
			
 
				-
			
 
				-    // + bottom right
			
 
				-    paddd      xmm0, [esi + edx * 4]
			
 
				-    paddd      xmm1, [esi + edx * 4 + 16]
			
 
				-    paddd      xmm2, [esi + edx * 4 + 32]
			
 
				-    paddd      xmm3, [esi + edx * 4 + 48]
			
 
				-    lea        esi, [esi + 64]
			
 
				-
			
 
				-    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
			
 
				-    packssdw   xmm2, xmm3
			
 
				-
			
 
				-    pmulhuw    xmm0, xmm5
			
 
				-    pmulhuw    xmm2, xmm5
			
 
				-
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    movdqu     [edi], xmm0
			
 
				-    lea        edi, [edi + 16]
			
 
				-    sub        ecx, 4
			
 
				-    jge        s4
			
 
				-
			
 
				-    jmp        l4b
			
 
				-
			
 
				-    // 4 pixel loop
			
 
				-    align      4
			
 
				-  l4:
			
 
				-    // top left
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    movdqa     xmm2, [eax + 32]
			
 
				-    movdqa     xmm3, [eax + 48]
			
 
				-
			
 
				-    // - top right
			
 
				-    psubd      xmm0, [eax + edx * 4]
			
 
				-    psubd      xmm1, [eax + edx * 4 + 16]
			
 
				-    psubd      xmm2, [eax + edx * 4 + 32]
			
 
				-    psubd      xmm3, [eax + edx * 4 + 48]
			
 
				-    lea        eax, [eax + 64]
			
 
				-
			
 
				-    // - bottom left
			
 
				-    psubd      xmm0, [esi]
			
 
				-    psubd      xmm1, [esi + 16]
			
 
				-    psubd      xmm2, [esi + 32]
			
 
				-    psubd      xmm3, [esi + 48]
			
 
				-
			
 
				-    // + bottom right
			
 
				-    paddd      xmm0, [esi + edx * 4]
			
 
				-    paddd      xmm1, [esi + edx * 4 + 16]
			
 
				-    paddd      xmm2, [esi + edx * 4 + 32]
			
 
				-    paddd      xmm3, [esi + edx * 4 + 48]
			
 
				-    lea        esi, [esi + 64]
			
 
				-
			
 
				-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
			
 
				-    cvtdq2ps   xmm1, xmm1
			
 
				-    mulps      xmm0, xmm4
			
 
				-    mulps      xmm1, xmm4
			
 
				-    cvtdq2ps   xmm2, xmm2
			
 
				-    cvtdq2ps   xmm3, xmm3
			
 
				-    mulps      xmm2, xmm4
			
 
				-    mulps      xmm3, xmm4
			
 
				-    cvtps2dq   xmm0, xmm0
			
 
				-    cvtps2dq   xmm1, xmm1
			
 
				-    cvtps2dq   xmm2, xmm2
			
 
				-    cvtps2dq   xmm3, xmm3
			
 
				-    packssdw   xmm0, xmm1
			
 
				-    packssdw   xmm2, xmm3
			
 
				-    packuswb   xmm0, xmm2
			
 
				-    movdqu     [edi], xmm0
			
 
				-    lea        edi, [edi + 16]
			
 
				-    sub        ecx, 4
			
 
				-    jge        l4
			
 
				-
			
 
				-  l4b:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         l1b
			
 
				-
			
 
				-    // 1 pixel loop
			
 
				-    align      4
			
 
				-  l1:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    psubd      xmm0, [eax + edx * 4]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    psubd      xmm0, [esi]
			
 
				-    paddd      xmm0, [esi + edx * 4]
			
 
				-    lea        esi, [esi + 16]
			
 
				-    cvtdq2ps   xmm0, xmm0
			
 
				-    mulps      xmm0, xmm4
			
 
				-    cvtps2dq   xmm0, xmm0
			
 
				-    packssdw   xmm0, xmm0
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    movd       dword ptr [edi], xmm0
			
 
				-    lea        edi, [edi + 4]
			
 
				-    sub        ecx, 1
			
 
				-    jge        l1
			
 
				-  l1b:
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
			
 
				-// Creates a table of cumulative sums where each value is a sum of all values
			
 
				-// above and to the left of the value.
			
 
				-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
			
 
				-                                  const int32* previous_cumsum, int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, row
			
 
				-    mov        edx, cumsum
			
 
				-    mov        esi, previous_cumsum
			
 
				-    mov        ecx, width
			
 
				-    pxor       xmm0, xmm0
			
 
				-    pxor       xmm1, xmm1
			
 
				-
			
 
				-    sub        ecx, 4
			
 
				-    jl         l4b
			
 
				-    test       edx, 15
			
 
				-    jne        l4b
			
 
				-
			
 
				-    // 4 pixel loop
			
 
				-    align      4
			
 
				-  l4:
			
 
				-    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm4, xmm2
			
 
				-
			
 
				-    punpcklbw  xmm2, xmm1
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    punpcklwd  xmm2, xmm1
			
 
				-    punpckhwd  xmm3, xmm1
			
 
				-
			
 
				-    punpckhbw  xmm4, xmm1
			
 
				-    movdqa     xmm5, xmm4
			
 
				-    punpcklwd  xmm4, xmm1
			
 
				-    punpckhwd  xmm5, xmm1
			
 
				-
			
 
				-    paddd      xmm0, xmm2
			
 
				-    movdqa     xmm2, [esi]  // previous row above.
			
 
				-    paddd      xmm2, xmm0
			
 
				-
			
 
				-    paddd      xmm0, xmm3
			
 
				-    movdqa     xmm3, [esi + 16]
			
 
				-    paddd      xmm3, xmm0
			
 
				-
			
 
				-    paddd      xmm0, xmm4
			
 
				-    movdqa     xmm4, [esi + 32]
			
 
				-    paddd      xmm4, xmm0
			
 
				-
			
 
				-    paddd      xmm0, xmm5
			
 
				-    movdqa     xmm5, [esi + 48]
			
 
				-    lea        esi, [esi + 64]
			
 
				-    paddd      xmm5, xmm0
			
 
				-
			
 
				-    movdqa     [edx], xmm2
			
 
				-    movdqa     [edx + 16], xmm3
			
 
				-    movdqa     [edx + 32], xmm4
			
 
				-    movdqa     [edx + 48], xmm5
			
 
				-
			
 
				-    lea        edx, [edx + 64]
			
 
				-    sub        ecx, 4
			
 
				-    jge        l4
			
 
				-
			
 
				-  l4b:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         l1b
			
 
				-
			
 
				-    // 1 pixel loop
			
 
				-    align      4
			
 
				-  l1:
			
 
				-    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
			
 
				-    lea        eax, [eax + 4]
			
 
				-    punpcklbw  xmm2, xmm1
			
 
				-    punpcklwd  xmm2, xmm1
			
 
				-    paddd      xmm0, xmm2
			
 
				-    movdqu     xmm2, [esi]
			
 
				-    lea        esi, [esi + 16]
			
 
				-    paddd      xmm2, xmm0
			
 
				-    movdqu     [edx], xmm2
			
 
				-    lea        edx, [edx + 16]
			
 
				-    sub        ecx, 1
			
 
				-    jge        l1
			
 
				-
			
 
				- l1b:
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBAFFINEROW_SSE2
			
 
				-// Copy ARGB pixels from source image with slope to a row of destination.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-LIBYUV_API
			
 
				-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
			
 
				-                        uint8* dst_argb, const float* uv_dudv, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 12]  // src_argb
			
 
				-    mov        esi, [esp + 16]  // stride
			
 
				-    mov        edx, [esp + 20]  // dst_argb
			
 
				-    mov        ecx, [esp + 24]  // pointer to uv_dudv
			
 
				-    movq       xmm2, qword ptr [ecx]  // uv
			
 
				-    movq       xmm7, qword ptr [ecx + 8]  // dudv
			
 
				-    mov        ecx, [esp + 28]  // width
			
 
				-    shl        esi, 16          // 4, stride
			
 
				-    add        esi, 4
			
 
				-    movd       xmm5, esi
			
 
				-    sub        ecx, 4
			
 
				-    jl         l4b
			
 
				-
			
 
				-    // setup for 4 pixel loop
			
 
				-    pshufd     xmm7, xmm7, 0x44  // dup dudv
			
 
				-    pshufd     xmm5, xmm5, 0  // dup 4, stride
			
 
				-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
			
 
				-    addps      xmm0, xmm7
			
 
				-    movlhps    xmm2, xmm0
			
 
				-    movdqa     xmm4, xmm7
			
 
				-    addps      xmm4, xmm4    // dudv *= 2
			
 
				-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
			
 
				-    addps      xmm3, xmm4
			
 
				-    addps      xmm4, xmm4    // dudv *= 4
			
 
				-
			
 
				-    // 4 pixel loop
			
 
				-    align      4
			
 
				-  l4:
			
 
				-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
			
 
				-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
			
 
				-    packssdw   xmm0, xmm1    // x, y as 8 shorts
			
 
				-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
			
 
				-    movd       esi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // shift right
			
 
				-    movd       edi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // shift right
			
 
				-    movd       xmm1, [eax + esi]  // read pixel 0
			
 
				-    movd       xmm6, [eax + edi]  // read pixel 1
			
 
				-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
			
 
				-    addps      xmm2, xmm4    // x, y += dx, dy first 2
			
 
				-    movq       qword ptr [edx], xmm1
			
 
				-    movd       esi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // shift right
			
 
				-    movd       edi, xmm0
			
 
				-    movd       xmm6, [eax + esi]  // read pixel 2
			
 
				-    movd       xmm0, [eax + edi]  // read pixel 3
			
 
				-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
			
 
				-    addps      xmm3, xmm4    // x, y += dx, dy next 2
			
 
				-    sub        ecx, 4
			
 
				-    movq       qword ptr 8[edx], xmm6
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jge        l4
			
 
				-
			
 
				-  l4b:
			
 
				-    add        ecx, 4 - 1
			
 
				-    jl         l1b
			
 
				-
			
 
				-    // 1 pixel loop
			
 
				-    align      4
			
 
				-  l1:
			
 
				-    cvttps2dq  xmm0, xmm2    // x, y float to int
			
 
				-    packssdw   xmm0, xmm0    // x, y as shorts
			
 
				-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
			
 
				-    addps      xmm2, xmm7    // x, y += dx, dy
			
 
				-    movd       esi, xmm0
			
 
				-    movd       xmm0, [eax + esi]  // copy a pixel
			
 
				-    sub        ecx, 1
			
 
				-    movd       [edx], xmm0
			
 
				-    lea        edx, [edx + 4]
			
 
				-    jge        l1
			
 
				-  l1b:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBAFFINEROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_AVX2
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                          ptrdiff_t src_stride, int dst_width,
			
 
				-                          int source_y_fraction) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        edi, [esp + 8 + 4]   // dst_ptr
			
 
				-    mov        esi, [esp + 8 + 8]   // src_ptr
			
 
				-    mov        edx, [esp + 8 + 12]  // src_stride
			
 
				-    mov        ecx, [esp + 8 + 16]  // dst_width
			
 
				-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
			
 
				-    shr        eax, 1
			
 
				-    // Dispatch to specialized filters if applicable.
			
 
				-    cmp        eax, 0
			
 
				-    je         xloop100  // 0 / 128.  Blend 100 / 0.
			
 
				-    sub        edi, esi
			
 
				-    cmp        eax, 32
			
 
				-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
			
 
				-    cmp        eax, 64
			
 
				-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
			
 
				-    cmp        eax, 96
			
 
				-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
			
 
				-
			
 
				-    vmovd      xmm0, eax  // high fraction 0..127
			
 
				-    neg        eax
			
 
				-    add        eax, 128
			
 
				-    vmovd      xmm5, eax  // low fraction 128..1
			
 
				-    vpunpcklbw xmm5, xmm5, xmm0
			
 
				-    vpunpcklwd xmm5, xmm5, xmm5
			
 
				-    vpxor      ymm0, ymm0, ymm0
			
 
				-    vpermd     ymm5, ymm0, ymm5
			
 
				-
			
 
				-    align      4
			
 
				-  xloop:
			
 
				-    vmovdqu    ymm0, [esi]
			
 
				-    vmovdqu    ymm2, [esi + edx]
			
 
				-    vpunpckhbw ymm1, ymm0, ymm2  // mutates
			
 
				-    vpunpcklbw ymm0, ymm0, ymm2  // mutates
			
 
				-    vpmaddubsw ymm0, ymm0, ymm5
			
 
				-    vpmaddubsw ymm1, ymm1, ymm5
			
 
				-    vpsrlw     ymm0, ymm0, 7
			
 
				-    vpsrlw     ymm1, ymm1, 7
			
 
				-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [esi + edi], ymm0
			
 
				-    lea        esi, [esi + 32]
			
 
				-    jg         xloop
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    align      4
			
 
				-  xloop25:
			
 
				-    vmovdqu    ymm0, [esi]
			
 
				-    vpavgb     ymm0, ymm0, [esi + edx]
			
 
				-    vpavgb     ymm0, ymm0, [esi + edx]
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [esi + edi], ymm0
			
 
				-    lea        esi, [esi + 32]
			
 
				-    jg         xloop25
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    align      4
			
 
				-  xloop50:
			
 
				-    vmovdqu    ymm0, [esi]
			
 
				-    vpavgb     ymm0, ymm0, [esi + edx]
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [esi + edi], ymm0
			
 
				-    lea        esi, [esi + 32]
			
 
				-    jg         xloop50
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    align      4
			
 
				-  xloop75:
			
 
				-    vmovdqu    ymm0, [esi + edx]
			
 
				-    vpavgb     ymm0, ymm0, [esi]
			
 
				-    vpavgb     ymm0, ymm0, [esi]
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu     [esi + edi], ymm0
			
 
				-    lea        esi, [esi + 32]
			
 
				-    jg         xloop75
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    align      4
			
 
				-  xloop100:
			
 
				-    rep movsb
			
 
				-
			
 
				-  xloop99:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSSE3
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                          ptrdiff_t src_stride, int dst_width,
			
 
				-                          int source_y_fraction) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        edi, [esp + 8 + 4]   // dst_ptr
			
 
				-    mov        esi, [esp + 8 + 8]   // src_ptr
			
 
				-    mov        edx, [esp + 8 + 12]  // src_stride
			
 
				-    mov        ecx, [esp + 8 + 16]  // dst_width
			
 
				-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
			
 
				-    sub        edi, esi
			
 
				-    shr        eax, 1
			
 
				-    // Dispatch to specialized filters if applicable.
			
 
				-    cmp        eax, 0
			
 
				-    je         xloop100  // 0 / 128.  Blend 100 / 0.
			
 
				-    cmp        eax, 32
			
 
				-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
			
 
				-    cmp        eax, 64
			
 
				-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
			
 
				-    cmp        eax, 96
			
 
				-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
			
 
				-
			
 
				-    movd       xmm0, eax  // high fraction 0..127
			
 
				-    neg        eax
			
 
				-    add        eax, 128
			
 
				-    movd       xmm5, eax  // low fraction 128..1
			
 
				-    punpcklbw  xmm5, xmm0
			
 
				-    punpcklwd  xmm5, xmm5
			
 
				-    pshufd     xmm5, xmm5, 0
			
 
				-
			
 
				-    align      4
			
 
				-  xloop:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    movdqa     xmm2, [esi + edx]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm2
			
 
				-    punpckhbw  xmm1, xmm2
			
 
				-    pmaddubsw  xmm0, xmm5
			
 
				-    pmaddubsw  xmm1, xmm5
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm1, 7
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    align      4
			
 
				-  xloop25:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    movdqa     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop25
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    align      4
			
 
				-  xloop50:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    movdqa     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop50
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    align      4
			
 
				-  xloop75:
			
 
				-    movdqa     xmm1, [esi]
			
 
				-    movdqa     xmm0, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop75
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    align      4
			
 
				-  xloop100:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop100
			
 
				-
			
 
				-  xloop99:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSSE3
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSE2
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         ptrdiff_t src_stride, int dst_width,
			
 
				-                         int source_y_fraction) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        edi, [esp + 8 + 4]   // dst_ptr
			
 
				-    mov        esi, [esp + 8 + 8]   // src_ptr
			
 
				-    mov        edx, [esp + 8 + 12]  // src_stride
			
 
				-    mov        ecx, [esp + 8 + 16]  // dst_width
			
 
				-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
			
 
				-    sub        edi, esi
			
 
				-    // Dispatch to specialized filters if applicable.
			
 
				-    cmp        eax, 0
			
 
				-    je         xloop100  // 0 / 256.  Blend 100 / 0.
			
 
				-    cmp        eax, 64
			
 
				-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
			
 
				-    cmp        eax, 128
			
 
				-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
			
 
				-    cmp        eax, 192
			
 
				-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
			
 
				-
			
 
				-    movd       xmm5, eax            // xmm5 = y fraction
			
 
				-    punpcklbw  xmm5, xmm5
			
 
				-    psrlw      xmm5, 1
			
 
				-    punpcklwd  xmm5, xmm5
			
 
				-    punpckldq  xmm5, xmm5
			
 
				-    punpcklqdq xmm5, xmm5
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				-  xloop:
			
 
				-    movdqa     xmm0, [esi]  // row0
			
 
				-    movdqa     xmm2, [esi + edx]  // row1
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    movdqa     xmm3, xmm2
			
 
				-    punpcklbw  xmm2, xmm4
			
 
				-    punpckhbw  xmm3, xmm4
			
 
				-    punpcklbw  xmm0, xmm4
			
 
				-    punpckhbw  xmm1, xmm4
			
 
				-    psubw      xmm2, xmm0  // row1 - row0
			
 
				-    psubw      xmm3, xmm1
			
 
				-    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
			
 
				-    paddw      xmm3, xmm3
			
 
				-    pmulhw     xmm2, xmm5  // scale diff
			
 
				-    pmulhw     xmm3, xmm5
			
 
				-    paddw      xmm0, xmm2  // sum rows
			
 
				-    paddw      xmm1, xmm3
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    align      4
			
 
				-  xloop25:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    movdqa     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop25
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    align      4
			
 
				-  xloop50:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    movdqa     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop50
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    align      4
			
 
				-  xloop75:
			
 
				-    movdqa     xmm1, [esi]
			
 
				-    movdqa     xmm0, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop75
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    align      4
			
 
				-  xloop100:
			
 
				-    movdqa     xmm0, [esi]
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop100
			
 
				-
			
 
				-  xloop99:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSE2
			
 
				-
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                    ptrdiff_t src_stride, int dst_width,
			
 
				-                                    int source_y_fraction) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        edi, [esp + 8 + 4]   // dst_ptr
			
 
				-    mov        esi, [esp + 8 + 8]   // src_ptr
			
 
				-    mov        edx, [esp + 8 + 12]  // src_stride
			
 
				-    mov        ecx, [esp + 8 + 16]  // dst_width
			
 
				-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
			
 
				-    sub        edi, esi
			
 
				-    shr        eax, 1
			
 
				-    // Dispatch to specialized filters if applicable.
			
 
				-    cmp        eax, 0
			
 
				-    je         xloop100  // 0 / 128.  Blend 100 / 0.
			
 
				-    cmp        eax, 32
			
 
				-    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
			
 
				-    cmp        eax, 64
			
 
				-    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
			
 
				-    cmp        eax, 96
			
 
				-    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
			
 
				-
			
 
				-    movd       xmm0, eax  // high fraction 0..127
			
 
				-    neg        eax
			
 
				-    add        eax, 128
			
 
				-    movd       xmm5, eax  // low fraction 128..1
			
 
				-    punpcklbw  xmm5, xmm0
			
 
				-    punpcklwd  xmm5, xmm5
			
 
				-    pshufd     xmm5, xmm5, 0
			
 
				-
			
 
				-    align      4
			
 
				-  xloop:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    movdqu     xmm2, [esi + edx]
			
 
				-    movdqu     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm2
			
 
				-    punpckhbw  xmm1, xmm2
			
 
				-    pmaddubsw  xmm0, xmm5
			
 
				-    pmaddubsw  xmm1, xmm5
			
 
				-    psrlw      xmm0, 7
			
 
				-    psrlw      xmm1, 7
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    align      4
			
 
				-  xloop25:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    movdqu     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop25
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    align      4
			
 
				-  xloop50:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    movdqu     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop50
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    align      4
			
 
				-  xloop75:
			
 
				-    movdqu     xmm1, [esi]
			
 
				-    movdqu     xmm0, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop75
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    align      4
			
 
				-  xloop100:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop100
			
 
				-
			
 
				-  xloop99:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_INTERPOLATEROW_SSE2
			
 
				-// Bilinear filter 16x2 -> 16x1
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                                   ptrdiff_t src_stride, int dst_width,
			
 
				-                                   int source_y_fraction) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        edi, [esp + 8 + 4]   // dst_ptr
			
 
				-    mov        esi, [esp + 8 + 8]   // src_ptr
			
 
				-    mov        edx, [esp + 8 + 12]  // src_stride
			
 
				-    mov        ecx, [esp + 8 + 16]  // dst_width
			
 
				-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
			
 
				-    sub        edi, esi
			
 
				-    // Dispatch to specialized filters if applicable.
			
 
				-    cmp        eax, 0
			
 
				-    je         xloop100  // 0 / 256.  Blend 100 / 0.
			
 
				-    cmp        eax, 64
			
 
				-    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
			
 
				-    cmp        eax, 128
			
 
				-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
			
 
				-    cmp        eax, 192
			
 
				-    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
			
 
				-
			
 
				-    movd       xmm5, eax            // xmm5 = y fraction
			
 
				-    punpcklbw  xmm5, xmm5
			
 
				-    psrlw      xmm5, 1
			
 
				-    punpcklwd  xmm5, xmm5
			
 
				-    punpckldq  xmm5, xmm5
			
 
				-    punpcklqdq xmm5, xmm5
			
 
				-    pxor       xmm4, xmm4
			
 
				-
			
 
				-    align      4
			
 
				-  xloop:
			
 
				-    movdqu     xmm0, [esi]  // row0
			
 
				-    movdqu     xmm2, [esi + edx]  // row1
			
 
				-    movdqu     xmm1, xmm0
			
 
				-    movdqu     xmm3, xmm2
			
 
				-    punpcklbw  xmm2, xmm4
			
 
				-    punpckhbw  xmm3, xmm4
			
 
				-    punpcklbw  xmm0, xmm4
			
 
				-    punpckhbw  xmm1, xmm4
			
 
				-    psubw      xmm2, xmm0  // row1 - row0
			
 
				-    psubw      xmm3, xmm1
			
 
				-    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
			
 
				-    paddw      xmm3, xmm3
			
 
				-    pmulhw     xmm2, xmm5  // scale diff
			
 
				-    pmulhw     xmm3, xmm5
			
 
				-    paddw      xmm0, xmm2  // sum rows
			
 
				-    paddw      xmm1, xmm3
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 25 / 75.
			
 
				-    align      4
			
 
				-  xloop25:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    movdqu     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop25
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 50 / 50.
			
 
				-    align      4
			
 
				-  xloop50:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    movdqu     xmm1, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop50
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 75 / 25.
			
 
				-    align      4
			
 
				-  xloop75:
			
 
				-    movdqu     xmm1, [esi]
			
 
				-    movdqu     xmm0, [esi + edx]
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    pavgb      xmm0, xmm1
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop75
			
 
				-    jmp        xloop99
			
 
				-
			
 
				-    // Blend 100 / 0 - Copy row unchanged.
			
 
				-    align      4
			
 
				-  xloop100:
			
 
				-    movdqu     xmm0, [esi]
			
 
				-    sub        ecx, 16
			
 
				-    movdqu     [esi + edi], xmm0
			
 
				-    lea        esi, [esi + 16]
			
 
				-    jg         xloop100
			
 
				-
			
 
				-  xloop99:
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_INTERPOLATEROW_SSE2
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_uv
			
 
				-    mov        edx, [esp + 4 + 8]    // src_uv_stride
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    sub        edi, eax
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    pavgb      xmm0, [eax + edx]
			
 
				-    sub        ecx, 16
			
 
				-    movdqa     [eax + edi], xmm0
			
 
				-    lea        eax,  [eax + 16]
			
 
				-    jg         convertloop
			
 
				-    pop        edi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_HALFROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
			
 
				-                  uint8* dst_uv, int pix) {
			
 
				-  __asm {
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 4 + 4]    // src_uv
			
 
				-    mov        edx, [esp + 4 + 8]    // src_uv_stride
			
 
				-    mov        edi, [esp + 4 + 12]   // dst_v
			
 
				-    mov        ecx, [esp + 4 + 16]   // pix
			
 
				-    sub        edi, eax
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vpavgb     ymm0, ymm0, [eax + edx]
			
 
				-    sub        ecx, 32
			
 
				-    vmovdqu    [eax + edi], ymm0
			
 
				-    lea        eax,  [eax + 32]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_HALFROW_AVX2
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                          uint32 selector, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_argb
			
 
				-    mov        edx, [esp + 8]    // dst_bayer
			
 
				-    movd       xmm5, [esp + 12]  // selector
			
 
				-    mov        ecx, [esp + 16]   // pix
			
 
				-    pshufd     xmm5, xmm5, 0
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    pshufb     xmm0, xmm5
			
 
				-    pshufb     xmm1, xmm5
			
 
				-    punpckldq  xmm0, xmm1
			
 
				-    sub        ecx, 8
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         wloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Specialized ARGB to Bayer that just isolates G channel.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
			
 
				-                           uint32 selector, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_argb
			
 
				-    mov        edx, [esp + 8]    // dst_bayer
			
 
				-                                 // selector
			
 
				-    mov        ecx, [esp + 16]   // pix
			
 
				-    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
			
 
				-    psrld      xmm5, 24
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    psrld      xmm0, 8  // Move green to bottom.
			
 
				-    psrld      xmm1, 8
			
 
				-    pand       xmm0, xmm5
			
 
				-    pand       xmm1, xmm5
			
 
				-    packssdw   xmm0, xmm1
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 8
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         wloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                          const uint8* shuffler, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_argb
			
 
				-    mov        edx, [esp + 8]    // dst_argb
			
 
				-    mov        ecx, [esp + 12]   // shuffler
			
 
				-    movdqa     xmm5, [ecx]
			
 
				-    mov        ecx, [esp + 16]   // pix
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqa     xmm0, [eax]
			
 
				-    movdqa     xmm1, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    pshufb     xmm0, xmm5
			
 
				-    pshufb     xmm1, xmm5
			
 
				-    sub        ecx, 8
			
 
				-    movdqa     [edx], xmm0
			
 
				-    movdqa     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         wloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                    const uint8* shuffler, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]    // src_argb
			
 
				-    mov        edx, [esp + 8]    // dst_argb
			
 
				-    mov        ecx, [esp + 12]   // shuffler
			
 
				-    movdqa     xmm5, [ecx]
			
 
				-    mov        ecx, [esp + 16]   // pix
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    movdqu     xmm1, [eax + 16]
			
 
				-    lea        eax, [eax + 32]
			
 
				-    pshufb     xmm0, xmm5
			
 
				-    pshufb     xmm1, xmm5
			
 
				-    sub        ecx, 8
			
 
				-    movdqu     [edx], xmm0
			
 
				-    movdqu     [edx + 16], xmm1
			
 
				-    lea        edx, [edx + 32]
			
 
				-    jg         wloop
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_ARGBSHUFFLEROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]     // src_argb
			
 
				-    mov        edx, [esp + 8]     // dst_argb
			
 
				-    mov        ecx, [esp + 12]    // shuffler
			
 
				-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
			
 
				-    mov        ecx, [esp + 16]    // pix
			
 
				-
			
 
				-    align      4
			
 
				-  wloop:
			
 
				-    vmovdqu    ymm0, [eax]
			
 
				-    vmovdqu    ymm1, [eax + 32]
			
 
				-    lea        eax, [eax + 64]
			
 
				-    vpshufb    ymm0, ymm0, ymm5
			
 
				-    vpshufb    ymm1, ymm1, ymm5
			
 
				-    sub        ecx, 16
			
 
				-    vmovdqu    [edx], ymm0
			
 
				-    vmovdqu    [edx + 32], ymm1
			
 
				-    lea        edx, [edx + 64]
			
 
				-    jg         wloop
			
 
				-
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBSHUFFLEROW_AVX2
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
			
 
				-                         const uint8* shuffler, int pix) {
			
 
				-  __asm {
			
 
				-    push       ebx
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_argb
			
 
				-    mov        edx, [esp + 8 + 8]    // dst_argb
			
 
				-    mov        esi, [esp + 8 + 12]   // shuffler
			
 
				-    mov        ecx, [esp + 8 + 16]   // pix
			
 
				-    pxor       xmm5, xmm5
			
 
				-
			
 
				-    mov        ebx, [esi]   // shuffler
			
 
				-    cmp        ebx, 0x03000102
			
 
				-    je         shuf_3012
			
 
				-    cmp        ebx, 0x00010203
			
 
				-    je         shuf_0123
			
 
				-    cmp        ebx, 0x00030201
			
 
				-    je         shuf_0321
			
 
				-    cmp        ebx, 0x02010003
			
 
				-    je         shuf_2103
			
 
				-
			
 
				-  // TODO(fbarchard): Use one source pointer and 3 offsets.
			
 
				-  shuf_any1:
			
 
				-    movzx      ebx, byte ptr [esi]
			
 
				-    movzx      ebx, byte ptr [eax + ebx]
			
 
				-    mov        [edx], bl
			
 
				-    movzx      ebx, byte ptr [esi + 1]
			
 
				-    movzx      ebx, byte ptr [eax + ebx]
			
 
				-    mov        [edx + 1], bl
			
 
				-    movzx      ebx, byte ptr [esi + 2]
			
 
				-    movzx      ebx, byte ptr [eax + ebx]
			
 
				-    mov        [edx + 2], bl
			
 
				-    movzx      ebx, byte ptr [esi + 3]
			
 
				-    movzx      ebx, byte ptr [eax + ebx]
			
 
				-    mov        [edx + 3], bl
			
 
				-    lea        eax, [eax + 4]
			
 
				-    lea        edx, [edx + 4]
			
 
				-    sub        ecx, 1
			
 
				-    jg         shuf_any1
			
 
				-    jmp        shuf99
			
 
				-
			
 
				-    align      4
			
 
				-  shuf_0123:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpckhbw  xmm1, xmm5
			
 
				-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
			
 
				-    pshuflw    xmm0, xmm0, 01Bh
			
 
				-    pshufhw    xmm1, xmm1, 01Bh
			
 
				-    pshuflw    xmm1, xmm1, 01Bh
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         shuf_0123
			
 
				-    jmp        shuf99
			
 
				-
			
 
				-    align      4
			
 
				-  shuf_0321:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpckhbw  xmm1, xmm5
			
 
				-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
			
 
				-    pshuflw    xmm0, xmm0, 039h
			
 
				-    pshufhw    xmm1, xmm1, 039h
			
 
				-    pshuflw    xmm1, xmm1, 039h
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         shuf_0321
			
 
				-    jmp        shuf99
			
 
				-
			
 
				-    align      4
			
 
				-  shuf_2103:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpckhbw  xmm1, xmm5
			
 
				-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
			
 
				-    pshuflw    xmm0, xmm0, 093h
			
 
				-    pshufhw    xmm1, xmm1, 093h
			
 
				-    pshuflw    xmm1, xmm1, 093h
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         shuf_2103
			
 
				-    jmp        shuf99
			
 
				-
			
 
				-    align      4
			
 
				-  shuf_3012:
			
 
				-    movdqu     xmm0, [eax]
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm5
			
 
				-    punpckhbw  xmm1, xmm5
			
 
				-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
			
 
				-    pshuflw    xmm0, xmm0, 0C6h
			
 
				-    pshufhw    xmm1, xmm1, 0C6h
			
 
				-    pshuflw    xmm1, xmm1, 0C6h
			
 
				-    packuswb   xmm0, xmm1
			
 
				-    sub        ecx, 4
			
 
				-    movdqu     [edx], xmm0
			
 
				-    lea        edx, [edx + 16]
			
 
				-    jg         shuf_3012
			
 
				-
			
 
				-  shuf99:
			
 
				-    pop        esi
			
 
				-    pop        ebx
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// YUY2 - Macro-pixel = 2 image pixels
			
 
				-// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
			
 
				-
			
 
				-// UYVY - Macro-pixel = 2 image pixels
			
 
				-// U0Y0V0Y1
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToYUY2Row_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_frame, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_y
			
 
				-    mov        esi, [esp + 8 + 8]    // src_u
			
 
				-    mov        edx, [esp + 8 + 12]   // src_v
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_frame
			
 
				-    mov        ecx, [esp + 8 + 20]   // width
			
 
				-    sub        edx, esi
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movq       xmm2, qword ptr [esi] // U
			
 
				-    movq       xmm3, qword ptr [esi + edx] // V
			
 
				-    lea        esi, [esi + 8]
			
 
				-    punpcklbw  xmm2, xmm3 // UV
			
 
				-    movdqu     xmm0, [eax] // Y
			
 
				-    lea        eax, [eax + 16]
			
 
				-    movdqa     xmm1, xmm0
			
 
				-    punpcklbw  xmm0, xmm2 // YUYV
			
 
				-    punpckhbw  xmm1, xmm2
			
 
				-    movdqu     [edi], xmm0
			
 
				-    movdqu     [edi + 16], xmm1
			
 
				-    lea        edi, [edi + 32]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void I422ToUYVYRow_SSE2(const uint8* src_y,
			
 
				-                        const uint8* src_u,
			
 
				-                        const uint8* src_v,
			
 
				-                        uint8* dst_frame, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]    // src_y
			
 
				-    mov        esi, [esp + 8 + 8]    // src_u
			
 
				-    mov        edx, [esp + 8 + 12]   // src_v
			
 
				-    mov        edi, [esp + 8 + 16]   // dst_frame
			
 
				-    mov        ecx, [esp + 8 + 20]   // width
			
 
				-    sub        edx, esi
			
 
				-
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movq       xmm2, qword ptr [esi] // U
			
 
				-    movq       xmm3, qword ptr [esi + edx] // V
			
 
				-    lea        esi, [esi + 8]
			
 
				-    punpcklbw  xmm2, xmm3 // UV
			
 
				-    movdqu     xmm0, [eax] // Y
			
 
				-    movdqa     xmm1, xmm2
			
 
				-    lea        eax, [eax + 16]
			
 
				-    punpcklbw  xmm1, xmm0 // UYVY
			
 
				-    punpckhbw  xmm2, xmm0
			
 
				-    movdqu     [edi], xmm1
			
 
				-    movdqu     [edi + 16], xmm2
			
 
				-    lea        edi, [edi + 32]
			
 
				-    sub        ecx, 16
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 4 + 8]   /* dst_argb */
			
 
				-    mov        esi, [esp + 4 + 12]  /* poly */
			
 
				-    mov        ecx, [esp + 4 + 16]  /* width */
			
 
				-    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
			
 
				-
			
 
				-    // 2 pixel loop.
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
			
 
				-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
			
 
				-    movq       xmm0, qword ptr [eax]  // BGRABGRA
			
 
				-    lea        eax, [eax + 8]
			
 
				-    punpcklbw  xmm0, xmm3
			
 
				-    movdqa     xmm4, xmm0
			
 
				-    punpcklwd  xmm0, xmm3  // pixel 0
			
 
				-    punpckhwd  xmm4, xmm3  // pixel 1
			
 
				-    cvtdq2ps   xmm0, xmm0  // 4 floats
			
 
				-    cvtdq2ps   xmm4, xmm4
			
 
				-    movdqa     xmm1, xmm0  // X
			
 
				-    movdqa     xmm5, xmm4
			
 
				-    mulps      xmm0, [esi + 16]  // C1 * X
			
 
				-    mulps      xmm4, [esi + 16]
			
 
				-    addps      xmm0, [esi]  // result = C0 + C1 * X
			
 
				-    addps      xmm4, [esi]
			
 
				-    movdqa     xmm2, xmm1
			
 
				-    movdqa     xmm6, xmm5
			
 
				-    mulps      xmm2, xmm1  // X * X
			
 
				-    mulps      xmm6, xmm5
			
 
				-    mulps      xmm1, xmm2  // X * X * X
			
 
				-    mulps      xmm5, xmm6
			
 
				-    mulps      xmm2, [esi + 32]  // C2 * X * X
			
 
				-    mulps      xmm6, [esi + 32]
			
 
				-    mulps      xmm1, [esi + 48]  // C3 * X * X * X
			
 
				-    mulps      xmm5, [esi + 48]
			
 
				-    addps      xmm0, xmm2  // result += C2 * X * X
			
 
				-    addps      xmm4, xmm6
			
 
				-    addps      xmm0, xmm1  // result += C3 * X * X * X
			
 
				-    addps      xmm4, xmm5
			
 
				-    cvttps2dq  xmm0, xmm0
			
 
				-    cvttps2dq  xmm4, xmm4
			
 
				-    packuswb   xmm0, xmm4
			
 
				-    packuswb   xmm0, xmm0
			
 
				-    sub        ecx, 2
			
 
				-    movq       qword ptr [edx], xmm0
			
 
				-    lea        edx, [edx + 8]
			
 
				-    jg         convertloop
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
			
 
				-
			
 
				-#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
			
 
				-                            uint8* dst_argb, const float* poly,
			
 
				-                            int width) {
			
 
				-  __asm {
			
 
				-    mov        eax, [esp + 4]   /* src_argb */
			
 
				-    mov        edx, [esp + 8]   /* dst_argb */
			
 
				-    mov        ecx, [esp + 12]   /* poly */
			
 
				-    vbroadcastf128 ymm4, [ecx]       // C0
			
 
				-    vbroadcastf128 ymm5, [ecx + 16]  // C1
			
 
				-    vbroadcastf128 ymm6, [ecx + 32]  // C2
			
 
				-    vbroadcastf128 ymm7, [ecx + 48]  // C3
			
 
				-    mov        ecx, [esp + 16]  /* width */
			
 
				-
			
 
				-    // 2 pixel loop.
			
 
				-    align      4
			
 
				- convertloop:
			
 
				-    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
			
 
				-    lea         eax, [eax + 8]
			
 
				-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
			
 
				-    vmulps      ymm2, ymm0, ymm0  // X * X
			
 
				-    vmulps      ymm3, ymm0, ymm7  // C3 * X
			
 
				-    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
			
 
				-    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
			
 
				-    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
			
 
				-    vcvttps2dq  ymm0, ymm0
			
 
				-    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
			
 
				-    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
			
 
				-    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
			
 
				-    sub         ecx, 2
			
 
				-    vmovq       qword ptr [edx], xmm0
			
 
				-    lea         edx, [edx + 8]
			
 
				-    jg          convertloop
			
 
				-    vzeroupper
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
			
 
				-
			
 
				-#ifdef HAS_ARGBCOLORTABLEROW_X86
			
 
				-// Tranform ARGB pixels with color table.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
			
 
				-                           int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   /* dst_argb */
			
 
				-    mov        esi, [esp + 4 + 8]   /* table_argb */
			
 
				-    mov        ecx, [esp + 4 + 12]  /* width */
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movzx      edx, byte ptr [eax]
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4]
			
 
				-    mov        byte ptr [eax - 4], dl
			
 
				-    movzx      edx, byte ptr [eax - 4 + 1]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4 + 1]
			
 
				-    mov        byte ptr [eax - 4 + 1], dl
			
 
				-    movzx      edx, byte ptr [eax - 4 + 2]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4 + 2]
			
 
				-    mov        byte ptr [eax - 4 + 2], dl
			
 
				-    movzx      edx, byte ptr [eax - 4 + 3]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4 + 3]
			
 
				-    mov        byte ptr [eax - 4 + 3], dl
			
 
				-    dec        ecx
			
 
				-    jg         convertloop
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBCOLORTABLEROW_X86
			
 
				-
			
 
				-#ifdef HAS_RGBCOLORTABLEROW_X86
			
 
				-// Tranform RGB pixels with color table.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    mov        eax, [esp + 4 + 4]   /* dst_argb */
			
 
				-    mov        esi, [esp + 4 + 8]   /* table_argb */
			
 
				-    mov        ecx, [esp + 4 + 12]  /* width */
			
 
				-
			
 
				-    // 1 pixel loop.
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movzx      edx, byte ptr [eax]
			
 
				-    lea        eax, [eax + 4]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4]
			
 
				-    mov        byte ptr [eax - 4], dl
			
 
				-    movzx      edx, byte ptr [eax - 4 + 1]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4 + 1]
			
 
				-    mov        byte ptr [eax - 4 + 1], dl
			
 
				-    movzx      edx, byte ptr [eax - 4 + 2]
			
 
				-    movzx      edx, byte ptr [esi + edx * 4 + 2]
			
 
				-    mov        byte ptr [eax - 4 + 2], dl
			
 
				-    dec        ecx
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_RGBCOLORTABLEROW_X86
			
 
				-
			
 
				-#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
			
 
				-// Tranform RGB pixels with luma table.
			
 
				-__declspec(naked) __declspec(align(16))
			
 
				-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
			
 
				-                                 int width,
			
 
				-                                 const uint8* luma, uint32 lumacoeff) {
			
 
				-  __asm {
			
 
				-    push       esi
			
 
				-    push       edi
			
 
				-    mov        eax, [esp + 8 + 4]   /* src_argb */
			
 
				-    mov        edi, [esp + 8 + 8]   /* dst_argb */
			
 
				-    mov        ecx, [esp + 8 + 12]  /* width */
			
 
				-    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
			
 
				-    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
			
 
				-    pshufd     xmm2, xmm2, 0
			
 
				-    pshufd     xmm3, xmm3, 0
			
 
				-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
			
 
				-    psllw      xmm4, 8
			
 
				-    pxor       xmm5, xmm5
			
 
				-
			
 
				-    // 4 pixel loop.
			
 
				-    align      4
			
 
				-  convertloop:
			
 
				-    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
			
 
				-    pmaddubsw  xmm0, xmm3
			
 
				-    phaddw     xmm0, xmm0
			
 
				-    pand       xmm0, xmm4  // mask out low bits
			
 
				-    punpcklwd  xmm0, xmm5
			
 
				-    paddd      xmm0, xmm2  // add table base
			
 
				-    movd       esi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
			
 
				-
			
 
				-    movzx      edx, byte ptr [eax]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi], dl
			
 
				-    movzx      edx, byte ptr [eax + 1]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 1], dl
			
 
				-    movzx      edx, byte ptr [eax + 2]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 2], dl
			
 
				-    movzx      edx, byte ptr [eax + 3]  // copy alpha.
			
 
				-    mov        byte ptr [edi + 3], dl
			
 
				-
			
 
				-    movd       esi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
			
 
				-
			
 
				-    movzx      edx, byte ptr [eax + 4]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 4], dl
			
 
				-    movzx      edx, byte ptr [eax + 5]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 5], dl
			
 
				-    movzx      edx, byte ptr [eax + 6]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 6], dl
			
 
				-    movzx      edx, byte ptr [eax + 7]  // copy alpha.
			
 
				-    mov        byte ptr [edi + 7], dl
			
 
				-
			
 
				-    movd       esi, xmm0
			
 
				-    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
			
 
				-
			
 
				-    movzx      edx, byte ptr [eax + 8]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 8], dl
			
 
				-    movzx      edx, byte ptr [eax + 9]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 9], dl
			
 
				-    movzx      edx, byte ptr [eax + 10]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 10], dl
			
 
				-    movzx      edx, byte ptr [eax + 11]  // copy alpha.
			
 
				-    mov        byte ptr [edi + 11], dl
			
 
				-
			
 
				-    movd       esi, xmm0
			
 
				-
			
 
				-    movzx      edx, byte ptr [eax + 12]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 12], dl
			
 
				-    movzx      edx, byte ptr [eax + 13]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 13], dl
			
 
				-    movzx      edx, byte ptr [eax + 14]
			
 
				-    movzx      edx, byte ptr [esi + edx]
			
 
				-    mov        byte ptr [edi + 14], dl
			
 
				-    movzx      edx, byte ptr [eax + 15]  // copy alpha.
			
 
				-    mov        byte ptr [edi + 15], dl
			
 
				-
			
 
				-    sub        ecx, 4
			
 
				-    lea        eax, [eax + 16]
			
 
				-    lea        edi, [edi + 16]
			
 
				-    jg         convertloop
			
 
				-
			
 
				-    pop        edi
			
 
				-    pop        esi
			
 
				-    ret
			
 
				-  }
			
 
				-}
			
 
				-#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
			
 
				-
			
 
				-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/row_x86.asm
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/row_x86.asm
@@ -1,146 +0,0 @@
 
				-;
			
 
				-; Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				-;
			
 
				-; Use of this source code is governed by a BSD-style license
			
 
				-; that can be found in the LICENSE file in the root of the source
			
 
				-; tree. An additional intellectual property rights grant can be found
			
 
				-; in the file PATENTS. All contributing project authors may
			
 
				-; be found in the AUTHORS file in the root of the source tree.
			
 
				-;
			
 
				-
			
 
				-%ifdef __YASM_VERSION_ID__
			
 
				-%if __YASM_VERSION_ID__ < 01020000h
			
 
				-%error AVX2 is supported only by yasm 1.2.0 or later.
			
 
				-%endif
			
 
				-%endif
			
 
				-%include "x86inc.asm"
			
 
				-
			
 
				-SECTION .text
			
 
				-
			
 
				-; cglobal numeric constants are parameters, gpr regs, mm regs
			
 
				-
			
 
				-; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
			
 
				-
			
 
				-%macro YUY2TOYROW 2-3
			
 
				-cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
			
 
				-%ifidn %1,YUY2
			
 
				-    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
			
 
				-    psrlw      m2, m2, 8
			
 
				-%endif
			
 
				-
			
 
				-    ALIGN      4
			
 
				-.convertloop:
			
 
				-    mov%2      m0, [src_yuy2q]
			
 
				-    mov%2      m1, [src_yuy2q + mmsize]
			
 
				-    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
			
 
				-%ifidn %1,YUY2
			
 
				-    pand       m0, m0, m2   ; YUY2 even bytes are Y
			
 
				-    pand       m1, m1, m2
			
 
				-%else
			
 
				-    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
			
 
				-    psrlw      m1, m1, 8
			
 
				-%endif
			
 
				-    packuswb   m0, m0, m1
			
 
				-%if cpuflag(AVX2)
			
 
				-    vpermq     m0, m0, 0xd8
			
 
				-%endif
			
 
				-    sub        pixd, mmsize
			
 
				-    mov%2      [dst_yq], m0
			
 
				-    lea        dst_yq, [dst_yq + mmsize]
			
 
				-    jg         .convertloop
			
 
				-    REP_RET
			
 
				-%endmacro
			
 
				-
			
 
				-; TODO(fbarchard): Remove MMX.  Add SSSE3 pshufb version.
			
 
				-INIT_MMX MMX
			
 
				-YUY2TOYROW YUY2,a,
			
 
				-YUY2TOYROW YUY2,u,_Unaligned
			
 
				-YUY2TOYROW UYVY,a,
			
 
				-YUY2TOYROW UYVY,u,_Unaligned
			
 
				-INIT_XMM SSE2
			
 
				-YUY2TOYROW YUY2,a,
			
 
				-YUY2TOYROW YUY2,u,_Unaligned
			
 
				-YUY2TOYROW UYVY,a,
			
 
				-YUY2TOYROW UYVY,u,_Unaligned
			
 
				-INIT_YMM AVX2
			
 
				-YUY2TOYROW YUY2,a,
			
 
				-YUY2TOYROW UYVY,a,
			
 
				-
			
 
				-; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
			
 
				-
			
 
				-%macro SplitUVRow 1-2
			
 
				-cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
			
 
				-    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
			
 
				-    psrlw      m4, m4, 8
			
 
				-    sub        dst_vq, dst_uq
			
 
				-
			
 
				-    ALIGN      4
			
 
				-.convertloop:
			
 
				-    mov%1      m0, [src_uvq]
			
 
				-    mov%1      m1, [src_uvq + mmsize]
			
 
				-    lea        src_uvq, [src_uvq + mmsize * 2]
			
 
				-    psrlw      m2, m0, 8         ; odd bytes
			
 
				-    psrlw      m3, m1, 8
			
 
				-    pand       m0, m0, m4        ; even bytes
			
 
				-    pand       m1, m1, m4
			
 
				-    packuswb   m0, m0, m1
			
 
				-    packuswb   m2, m2, m3
			
 
				-%if cpuflag(AVX2)
			
 
				-    vpermq     m0, m0, 0xd8
			
 
				-    vpermq     m2, m2, 0xd8
			
 
				-%endif
			
 
				-    mov%1      [dst_uq], m0
			
 
				-    mov%1      [dst_uq + dst_vq], m2
			
 
				-    lea        dst_uq, [dst_uq + mmsize]
			
 
				-    sub        pixd, mmsize
			
 
				-    jg         .convertloop
			
 
				-    REP_RET
			
 
				-%endmacro
			
 
				-
			
 
				-INIT_MMX MMX
			
 
				-SplitUVRow a,
			
 
				-SplitUVRow u,_Unaligned
			
 
				-INIT_XMM SSE2
			
 
				-SplitUVRow a,
			
 
				-SplitUVRow u,_Unaligned
			
 
				-INIT_YMM AVX2
			
 
				-SplitUVRow a,
			
 
				-
			
 
				-; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
			
 
				-;                      int width);
			
 
				-
			
 
				-%macro MergeUVRow_ 1-2
			
 
				-cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
			
 
				-    sub        src_vq, src_uq
			
 
				-
			
 
				-    ALIGN      4
			
 
				-.convertloop:
			
 
				-    mov%1      m0, [src_uq]
			
 
				-    mov%1      m1, [src_vq]
			
 
				-    lea        src_uq, [src_uq + mmsize]
			
 
				-    punpcklbw  m2, m0, m1       // first 8 UV pairs
			
 
				-    punpckhbw  m0, m0, m1       // next 8 UV pairs
			
 
				-%if cpuflag(AVX2)
			
 
				-    vperm2i128 m1, m2, m0, 0x20  // low 128 of ymm2 and low 128 of ymm0
			
 
				-    vperm2i128 m2, m2, m0, 0x31  // high 128 of ymm2 and high 128 of ymm0
			
 
				-    mov%1      [dst_uvq], m1
			
 
				-    mov%1      [dst_uvq + mmsize], m2
			
 
				-%else
			
 
				-    mov%1      [dst_uvq], m2
			
 
				-    mov%1      [dst_uvq + mmsize], m0
			
 
				-%endif
			
 
				-    lea        dst_uvq, [dst_uvq + mmsize * 2]
			
 
				-    sub        pixd, mmsize
			
 
				-    jg         .convertloop
			
 
				-    REP_RET
			
 
				-%endmacro
			
 
				-
			
 
				-INIT_MMX MMX
			
 
				-MergeUVRow_ a,
			
 
				-MergeUVRow_ u,_Unaligned
			
 
				-INIT_XMM SSE2
			
 
				-MergeUVRow_ a,
			
 
				-MergeUVRow_ u,_Unaligned
			
 
				-INIT_YMM AVX2
			
 
				-MergeUVRow_ a,
			
 
				-
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/scale.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/scale.cc
@@ -1,926 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/scale.h"
			
 
				-
			
 
				-#include <assert.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/planar_functions.h"  // For CopyPlane
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/scale_row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// Remove this macro if OVERREAD is safe.
			
 
				-#define AVOID_OVERREAD 1
			
 
				-
			
 
				-static __inline int Abs(int v) {
			
 
				-  return v >= 0 ? v : -v;
			
 
				-}
			
 
				-
			
 
				-#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
			
 
				-
			
 
				-// Scale plane, 1/2
			
 
				-// This is an optimized version for scaling down a plane to 1/2 of
			
 
				-// its original size.
			
 
				-
			
 
				-static void ScalePlaneDown2(int src_width, int src_height,
			
 
				-                            int dst_width, int dst_height,
			
 
				-                            int src_stride, int dst_stride,
			
 
				-                            const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                            enum FilterMode filtering) {
			
 
				-  int y;
			
 
				-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst_ptr, int dst_width) =
			
 
				-    filtering == kFilterNone ? ScaleRowDown2_C :
			
 
				-        (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
			
 
				-        ScaleRowDown2Box_C);
			
 
				-  int row_stride = src_stride << 1;
			
 
				-  if (!filtering) {
			
 
				-    src_ptr += src_stride;  // Point to odd rows.
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_SCALEROWDOWN2_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
			
 
				-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN2_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
			
 
				-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
			
 
				-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
			
 
				-        ScaleRowDown2Box_Unaligned_SSE2);
			
 
				-    if (IS_ALIGNED(src_ptr, 16) &&
			
 
				-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
			
 
				-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
			
 
				-          ScaleRowDown2Box_SSE2);
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
			
 
				-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
			
 
				-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    ScaleRowDown2 = filtering ?
			
 
				-        ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (filtering == kFilterLinear) {
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-  // TODO(fbarchard): Loop through source height to allow odd height.
			
 
				-  for (y = 0; y < dst_height; ++y) {
			
 
				-    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += row_stride;
			
 
				-    dst_ptr += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale plane, 1/4
			
 
				-// This is an optimized version for scaling down a plane to 1/4 of
			
 
				-// its original size.
			
 
				-
			
 
				-static void ScalePlaneDown4(int src_width, int src_height,
			
 
				-                            int dst_width, int dst_height,
			
 
				-                            int src_stride, int dst_stride,
			
 
				-                            const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                            enum FilterMode filtering) {
			
 
				-  int y;
			
 
				-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst_ptr, int dst_width) =
			
 
				-      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
			
 
				-  int row_stride = src_stride << 2;
			
 
				-  if (!filtering) {
			
 
				-    src_ptr += src_stride * 2;  // Point to row 2.
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEROWDOWN4_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
			
 
				-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN4_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
			
 
				-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
			
 
				-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
			
 
				-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    ScaleRowDown4 = filtering ?
			
 
				-        ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (filtering == kFilterLinear) {
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-  for (y = 0; y < dst_height; ++y) {
			
 
				-    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += row_stride;
			
 
				-    dst_ptr += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale plane down, 3/4
			
 
				-
			
 
				-static void ScalePlaneDown34(int src_width, int src_height,
			
 
				-                             int dst_width, int dst_height,
			
 
				-                             int src_stride, int dst_stride,
			
 
				-                             const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                             enum FilterMode filtering) {
			
 
				-  int y;
			
 
				-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
			
 
				-  assert(dst_width % 3 == 0);
			
 
				-  if (!filtering) {
			
 
				-    ScaleRowDown34_0 = ScaleRowDown34_C;
			
 
				-    ScaleRowDown34_1 = ScaleRowDown34_C;
			
 
				-  } else {
			
 
				-    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
			
 
				-    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEROWDOWN34_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_NEON;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_NEON;
			
 
				-    } else {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SCALEROWDOWN34_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
			
 
				-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
			
 
				-    } else {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
			
 
				-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
			
 
				-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
			
 
				-    } else {
			
 
				-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
			
 
				-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < dst_height - 2; y += 3) {
			
 
				-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
			
 
				-                     dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride * 2;
			
 
				-    dst_ptr += dst_stride;
			
 
				-  }
			
 
				-
			
 
				-  // Remainder 1 or 2 rows with last row vertically unfiltered
			
 
				-  if ((dst_height % 3) == 2) {
			
 
				-    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
			
 
				-  } else if ((dst_height % 3) == 1) {
			
 
				-    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-
			
 
				-// Scale plane, 3/8
			
 
				-// This is an optimized version for scaling down a plane to 3/8
			
 
				-// of its original size.
			
 
				-//
			
 
				-// Uses box filter arranges like this
			
 
				-// aaabbbcc -> abc
			
 
				-// aaabbbcc    def
			
 
				-// aaabbbcc    ghi
			
 
				-// dddeeeff
			
 
				-// dddeeeff
			
 
				-// dddeeeff
			
 
				-// ggghhhii
			
 
				-// ggghhhii
			
 
				-// Boxes are 3x3, 2x3, 3x2 and 2x2
			
 
				-
			
 
				-static void ScalePlaneDown38(int src_width, int src_height,
			
 
				-                             int dst_width, int dst_height,
			
 
				-                             int src_stride, int dst_stride,
			
 
				-                             const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                             enum FilterMode filtering) {
			
 
				-  int y;
			
 
				-  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst_ptr, int dst_width);
			
 
				-  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
			
 
				-  assert(dst_width % 3 == 0);
			
 
				-  if (!filtering) {
			
 
				-    ScaleRowDown38_3 = ScaleRowDown38_C;
			
 
				-    ScaleRowDown38_2 = ScaleRowDown38_C;
			
 
				-  } else {
			
 
				-    ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
			
 
				-    ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEROWDOWN38_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_NEON;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_NEON;
			
 
				-    } else {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
			
 
				-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
			
 
				-    } else {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
			
 
				-    }
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
			
 
				-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
			
 
				-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    if (!filtering) {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
			
 
				-    } else {
			
 
				-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
			
 
				-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  for (y = 0; y < dst_height - 2; y += 3) {
			
 
				-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride * 3;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride * 3;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride * 2;
			
 
				-    dst_ptr += dst_stride;
			
 
				-  }
			
 
				-
			
 
				-  // Remainder 1 or 2 rows with last row vertically unfiltered
			
 
				-  if ((dst_height % 3) == 2) {
			
 
				-    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
			
 
				-    src_ptr += src_stride * 3;
			
 
				-    dst_ptr += dst_stride;
			
 
				-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
			
 
				-  } else if ((dst_height % 3) == 1) {
			
 
				-    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 SumBox(int iboxwidth, int iboxheight,
			
 
				-                              ptrdiff_t src_stride, const uint8* src_ptr) {
			
 
				-  uint32 sum = 0u;
			
 
				-  int y;
			
 
				-  assert(iboxwidth > 0);
			
 
				-  assert(iboxheight > 0);
			
 
				-  for (y = 0; y < iboxheight; ++y) {
			
 
				-    int x;
			
 
				-    for (x = 0; x < iboxwidth; ++x) {
			
 
				-      sum += src_ptr[x];
			
 
				-    }
			
 
				-    src_ptr += src_stride;
			
 
				-  }
			
 
				-  return sum;
			
 
				-}
			
 
				-
			
 
				-static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
			
 
				-                               int x, int dx, ptrdiff_t src_stride,
			
 
				-                               const uint8* src_ptr, uint8* dst_ptr) {
			
 
				-  int i;
			
 
				-  int boxwidth;
			
 
				-  for (i = 0; i < dst_width; ++i) {
			
 
				-    int ix = x >> 16;
			
 
				-    x += dx;
			
 
				-    boxwidth = (x >> 16) - ix;
			
 
				-    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
			
 
				-        (boxwidth * boxheight);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
			
 
				-  uint32 sum = 0u;
			
 
				-  int x;
			
 
				-  assert(iboxwidth > 0);
			
 
				-  for (x = 0; x < iboxwidth; ++x) {
			
 
				-    sum += src_ptr[x];
			
 
				-  }
			
 
				-  return sum;
			
 
				-}
			
 
				-
			
 
				-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
			
 
				-                            const uint16* src_ptr, uint8* dst_ptr) {
			
 
				-  int i;
			
 
				-  int scaletbl[2];
			
 
				-  int minboxwidth = (dx >> 16);
			
 
				-  int* scaleptr = scaletbl - minboxwidth;
			
 
				-  int boxwidth;
			
 
				-  scaletbl[0] = 65536 / (minboxwidth * boxheight);
			
 
				-  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
			
 
				-  for (i = 0; i < dst_width; ++i) {
			
 
				-    int ix = x >> 16;
			
 
				-    x += dx;
			
 
				-    boxwidth = (x >> 16) - ix;
			
 
				-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
			
 
				-                            const uint16* src_ptr, uint8* dst_ptr) {
			
 
				-  int boxwidth = (dx >> 16);
			
 
				-  int scaleval = 65536 / (boxwidth * boxheight);
			
 
				-  int i;
			
 
				-  for (i = 0; i < dst_width; ++i) {
			
 
				-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
			
 
				-    x += boxwidth;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale plane down to any dimensions, with interpolation.
			
 
				-// (boxfilter).
			
 
				-//
			
 
				-// Same method as SimpleScale, which is fixed point, outputting
			
 
				-// one pixel of destination using fixed point (16.16) to step
			
 
				-// through source, sampling a box of pixel with simple
			
 
				-// averaging.
			
 
				-static void ScalePlaneBox(int src_width, int src_height,
			
 
				-                          int dst_width, int dst_height,
			
 
				-                          int src_stride, int dst_stride,
			
 
				-                          const uint8* src_ptr, uint8* dst_ptr) {
			
 
				-  int j;
			
 
				-  // Initial source x/y coordinate and step values as 16.16 fixed point.
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  int dx = 0;
			
 
				-  int dy = 0;
			
 
				-  const int max_y = (src_height << 16);
			
 
				-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
			
 
				-             &x, &y, &dx, &dy);
			
 
				-  src_width = Abs(src_width);
			
 
				-  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
			
 
				-  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
			
 
				-    uint8* dst = dst_ptr;
			
 
				-    int j;
			
 
				-    for (j = 0; j < dst_height; ++j) {
			
 
				-      int boxheight;
			
 
				-      int iy = y >> 16;
			
 
				-      const uint8* src = src_ptr + iy * src_stride;
			
 
				-      y += dy;
			
 
				-      if (y > max_y) {
			
 
				-        y = max_y;
			
 
				-      }
			
 
				-      boxheight = (y >> 16) - iy;
			
 
				-      ScalePlaneBoxRow_C(dst_width, boxheight,
			
 
				-                         x, dx, src_stride,
			
 
				-                         src, dst);
			
 
				-      dst += dst_stride;
			
 
				-    }
			
 
				-    return;
			
 
				-  }
			
 
				-  {
			
 
				-    // Allocate a row buffer of uint16.
			
 
				-    align_buffer_64(row16, src_width * 2);
			
 
				-    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
			
 
				-        const uint16* src_ptr, uint8* dst_ptr) =
			
 
				-        (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
			
 
				-    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-        uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
			
 
				-
			
 
				-#if defined(HAS_SCALEADDROWS_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) &&
			
 
				-#ifdef AVOID_OVERREAD
			
 
				-        IS_ALIGNED(src_width, 16) &&
			
 
				-#endif
			
 
				-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-      ScaleAddRows = ScaleAddRows_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-
			
 
				-    for (j = 0; j < dst_height; ++j) {
			
 
				-      int boxheight;
			
 
				-      int iy = y >> 16;
			
 
				-      const uint8* src = src_ptr + iy * src_stride;
			
 
				-      y += dy;
			
 
				-      if (y > (src_height << 16)) {
			
 
				-        y = (src_height << 16);
			
 
				-      }
			
 
				-      boxheight = (y >> 16) - iy;
			
 
				-      ScaleAddRows(src, src_stride, (uint16*)(row16),
			
 
				-                 src_width, boxheight);
			
 
				-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
			
 
				-                 dst_ptr);
			
 
				-      dst_ptr += dst_stride;
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row16);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale plane down with bilinear interpolation.
			
 
				-void ScalePlaneBilinearDown(int src_width, int src_height,
			
 
				-                            int dst_width, int dst_height,
			
 
				-                            int src_stride, int dst_stride,
			
 
				-                            const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                            enum FilterMode filtering) {
			
 
				-  // Initial source x/y coordinate and step values as 16.16 fixed point.
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  int dx = 0;
			
 
				-  int dy = 0;
			
 
				-  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
			
 
				-  // Allocate a row buffer.
			
 
				-  align_buffer_64(row, src_width);
			
 
				-
			
 
				-  const int max_y = (src_height - 1) << 16;
			
 
				-  int j;
			
 
				-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-      int dst_width, int x, int dx) =
			
 
				-      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
			
 
				-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
			
 
				-             &x, &y, &dx, &dy);
			
 
				-  src_width = Abs(src_width);
			
 
				-
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(src_width, 32)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
			
 
				-    if (IS_ALIGNED(src_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-
			
 
				-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
			
 
				-    ScaleFilterCols = ScaleFilterCols_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (y > max_y) {
			
 
				-    y = max_y;
			
 
				-  }
			
 
				-
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    int yi = y >> 16;
			
 
				-    const uint8* src = src_ptr + yi * src_stride;
			
 
				-    if (filtering == kFilterLinear) {
			
 
				-      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
			
 
				-    } else {
			
 
				-      int yf = (y >> 8) & 255;
			
 
				-      InterpolateRow(row, src, src_stride, src_width, yf);
			
 
				-      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
			
 
				-    }
			
 
				-    dst_ptr += dst_stride;
			
 
				-    y += dy;
			
 
				-    if (y > max_y) {
			
 
				-      y = max_y;
			
 
				-    }
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-// Scale up down with bilinear interpolation.
			
 
				-void ScalePlaneBilinearUp(int src_width, int src_height,
			
 
				-                          int dst_width, int dst_height,
			
 
				-                          int src_stride, int dst_stride,
			
 
				-                          const uint8* src_ptr, uint8* dst_ptr,
			
 
				-                          enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  // Initial source x/y coordinate and step values as 16.16 fixed point.
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  int dx = 0;
			
 
				-  int dy = 0;
			
 
				-  const int max_y = (src_height - 1) << 16;
			
 
				-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-       int dst_width, int x, int dx) =
			
 
				-       filtering ? ScaleFilterCols_C : ScaleCols_C;
			
 
				-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
			
 
				-             &x, &y, &dx, &dy);
			
 
				-  src_width = Abs(src_width);
			
 
				-
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(dst_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(dst_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(dst_width, 32)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(dst_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (filtering && src_width >= 32768) {
			
 
				-    ScaleFilterCols = ScaleFilterCols64_C;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEFILTERCOLS_SSSE3)
			
 
				-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
			
 
				-    ScaleFilterCols = ScaleFilterCols_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
			
 
				-    ScaleFilterCols = ScaleColsUp2_C;
			
 
				-#if defined(HAS_SCALECOLS_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
			
 
				-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleFilterCols = ScaleColsUp2_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-  }
			
 
				-
			
 
				-  if (y > max_y) {
			
 
				-    y = max_y;
			
 
				-  }
			
 
				-  {
			
 
				-    int yi = y >> 16;
			
 
				-    const uint8* src = src_ptr + yi * src_stride;
			
 
				-
			
 
				-    // Allocate 2 row buffers.
			
 
				-    const int kRowSize = (dst_width + 15) & ~15;
			
 
				-    align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-    uint8* rowptr = row;
			
 
				-    int rowstride = kRowSize;
			
 
				-    int lasty = yi;
			
 
				-
			
 
				-    ScaleFilterCols(rowptr, src, dst_width, x, dx);
			
 
				-    if (src_height > 1) {
			
 
				-      src += src_stride;
			
 
				-    }
			
 
				-    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
			
 
				-    src += src_stride;
			
 
				-
			
 
				-    for (j = 0; j < dst_height; ++j) {
			
 
				-      yi = y >> 16;
			
 
				-      if (yi != lasty) {
			
 
				-        if (y > max_y) {
			
 
				-          y = max_y;
			
 
				-          yi = y >> 16;
			
 
				-          src = src_ptr + yi * src_stride;
			
 
				-        }
			
 
				-        if (yi != lasty) {
			
 
				-          ScaleFilterCols(rowptr, src, dst_width, x, dx);
			
 
				-          rowptr += rowstride;
			
 
				-          rowstride = -rowstride;
			
 
				-          lasty = yi;
			
 
				-          src += src_stride;
			
 
				-        }
			
 
				-      }
			
 
				-      if (filtering == kFilterLinear) {
			
 
				-        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
			
 
				-      } else {
			
 
				-        int yf = (y >> 8) & 255;
			
 
				-        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
			
 
				-      }
			
 
				-      dst_ptr += dst_stride;
			
 
				-      y += dy;
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale Plane to/from any dimensions, without interpolation.
			
 
				-// Fixed point math is used for performance: The upper 16 bits
			
 
				-// of x and dx is the integer part of the source position and
			
 
				-// the lower 16 bits are the fixed decimal part.
			
 
				-
			
 
				-static void ScalePlaneSimple(int src_width, int src_height,
			
 
				-                             int dst_width, int dst_height,
			
 
				-                             int src_stride, int dst_stride,
			
 
				-                             const uint8* src_ptr, uint8* dst_ptr) {
			
 
				-  int i;
			
 
				-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-      int dst_width, int x, int dx) = ScaleCols_C;
			
 
				-  // Initial source x/y coordinate and step values as 16.16 fixed point.
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  int dx = 0;
			
 
				-  int dy = 0;
			
 
				-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
			
 
				-             &x, &y, &dx, &dy);
			
 
				-  src_width = Abs(src_width);
			
 
				-
			
 
				-  if (src_width * 2 == dst_width && x < 0x8000) {
			
 
				-    ScaleCols = ScaleColsUp2_C;
			
 
				-#if defined(HAS_SCALECOLS_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
			
 
				-        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleCols = ScaleColsUp2_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-  }
			
 
				-
			
 
				-  for (i = 0; i < dst_height; ++i) {
			
 
				-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
			
 
				-              dst_width, x, dx);
			
 
				-    dst_ptr += dst_stride;
			
 
				-    y += dy;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale a plane.
			
 
				-// This function dispatches to a specialized scaler based on scale factor.
			
 
				-
			
 
				-LIBYUV_API
			
 
				-void ScalePlane(const uint8* src, int src_stride,
			
 
				-                int src_width, int src_height,
			
 
				-                uint8* dst, int dst_stride,
			
 
				-                int dst_width, int dst_height,
			
 
				-                enum FilterMode filtering) {
			
 
				-  // Simplify filtering when possible.
			
 
				-  filtering = ScaleFilterReduce(src_width, src_height,
			
 
				-                                dst_width, dst_height,
			
 
				-                                filtering);
			
 
				-
			
 
				-  // Negative height means invert the image.
			
 
				-  if (src_height < 0) {
			
 
				-    src_height = -src_height;
			
 
				-    src = src + (src_height - 1) * src_stride;
			
 
				-    src_stride = -src_stride;
			
 
				-  }
			
 
				-
			
 
				-  // Use specialized scales to improve performance for common resolutions.
			
 
				-  // For example, all the 1/2 scalings will use ScalePlaneDown2()
			
 
				-  if (dst_width == src_width && dst_height == src_height) {
			
 
				-    // Straight copy.
			
 
				-    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (dst_width == src_width) {
			
 
				-    int dy = FixedDiv(src_height, dst_height);
			
 
				-    // Arbitrary scale vertically, but unscaled vertically.
			
 
				-    ScalePlaneVertical(src_height,
			
 
				-                       dst_width, dst_height,
			
 
				-                       src_stride, dst_stride, src, dst,
			
 
				-                       0, 0, dy, 1, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
			
 
				-    // Scale down.
			
 
				-    if (4 * dst_width == 3 * src_width &&
			
 
				-        4 * dst_height == 3 * src_height) {
			
 
				-      // optimized, 3/4
			
 
				-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
			
 
				-                       src_stride, dst_stride, src, dst, filtering);
			
 
				-      return;
			
 
				-    }
			
 
				-    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
			
 
				-      // optimized, 1/2
			
 
				-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
			
 
				-                      src_stride, dst_stride, src, dst, filtering);
			
 
				-      return;
			
 
				-    }
			
 
				-    // 3/8 rounded up for odd sized chroma height.
			
 
				-    if (8 * dst_width == 3 * src_width &&
			
 
				-        dst_height == ((src_height * 3 + 7) / 8)) {
			
 
				-      // optimized, 3/8
			
 
				-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
			
 
				-                       src_stride, dst_stride, src, dst, filtering);
			
 
				-      return;
			
 
				-    }
			
 
				-    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
			
 
				-               filtering != kFilterBilinear) {
			
 
				-      // optimized, 1/4
			
 
				-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
			
 
				-                      src_stride, dst_stride, src, dst, filtering);
			
 
				-      return;
			
 
				-    }
			
 
				-  }
			
 
				-  if (filtering == kFilterBox && dst_height * 2 < src_height) {
			
 
				-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
			
 
				-                  src_stride, dst_stride, src, dst);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (filtering && dst_height > src_height) {
			
 
				-    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
			
 
				-                         src_stride, dst_stride, src, dst, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (filtering) {
			
 
				-    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
			
 
				-                           src_stride, dst_stride, src, dst, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
			
 
				-                   src_stride, dst_stride, src, dst);
			
 
				-}
			
 
				-
			
 
				-// Scale an I420 image.
			
 
				-// This function in turn calls a scaling function for each plane.
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int I420Scale(const uint8* src_y, int src_stride_y,
			
 
				-              const uint8* src_u, int src_stride_u,
			
 
				-              const uint8* src_v, int src_stride_v,
			
 
				-              int src_width, int src_height,
			
 
				-              uint8* dst_y, int dst_stride_y,
			
 
				-              uint8* dst_u, int dst_stride_u,
			
 
				-              uint8* dst_v, int dst_stride_v,
			
 
				-              int dst_width, int dst_height,
			
 
				-              enum FilterMode filtering) {
			
 
				-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
			
 
				-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
			
 
				-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
			
 
				-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
			
 
				-  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
			
 
				-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-
			
 
				-  ScalePlane(src_y, src_stride_y, src_width, src_height,
			
 
				-             dst_y, dst_stride_y, dst_width, dst_height,
			
 
				-             filtering);
			
 
				-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
			
 
				-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
			
 
				-             filtering);
			
 
				-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
			
 
				-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
			
 
				-             filtering);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Deprecated api
			
 
				-LIBYUV_API
			
 
				-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
			
 
				-          int src_stride_y, int src_stride_u, int src_stride_v,
			
 
				-          int src_width, int src_height,
			
 
				-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
			
 
				-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
			
 
				-          int dst_width, int dst_height,
			
 
				-          LIBYUV_BOOL interpolate) {
			
 
				-  return I420Scale(src_y, src_stride_y,
			
 
				-                   src_u, src_stride_u,
			
 
				-                   src_v, src_stride_v,
			
 
				-                   src_width, src_height,
			
 
				-                   dst_y, dst_stride_y,
			
 
				-                   dst_u, dst_stride_u,
			
 
				-                   dst_v, dst_stride_v,
			
 
				-                   dst_width, dst_height,
			
 
				-                   interpolate ? kFilterBox : kFilterNone);
			
 
				-}
			
 
				-
			
 
				-// Deprecated api
			
 
				-LIBYUV_API
			
 
				-int ScaleOffset(const uint8* src, int src_width, int src_height,
			
 
				-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
			
 
				-                LIBYUV_BOOL interpolate) {
			
 
				-  // Chroma requires offset to multiple of 2.
			
 
				-  int dst_yoffset_even = dst_yoffset & ~1;
			
 
				-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
			
 
				-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
			
 
				-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
			
 
				-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
			
 
				-  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
			
 
				-  const uint8* src_y = src;
			
 
				-  const uint8* src_u = src + src_width * src_height;
			
 
				-  const uint8* src_v = src + src_width * src_height +
			
 
				-                             src_halfwidth * src_halfheight;
			
 
				-  uint8* dst_y = dst + dst_yoffset_even * dst_width;
			
 
				-  uint8* dst_u = dst + dst_width * dst_height +
			
 
				-                 (dst_yoffset_even >> 1) * dst_halfwidth;
			
 
				-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
			
 
				-                 (dst_yoffset_even >> 1) * dst_halfwidth;
			
 
				-  if (!src || src_width <= 0 || src_height <= 0 ||
			
 
				-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
			
 
				-      dst_yoffset_even >= dst_height) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  return I420Scale(src_y, src_width,
			
 
				-                   src_u, src_halfwidth,
			
 
				-                   src_v, src_halfwidth,
			
 
				-                   src_width, src_height,
			
 
				-                   dst_y, dst_width,
			
 
				-                   dst_u, dst_halfwidth,
			
 
				-                   dst_v, dst_halfwidth,
			
 
				-                   dst_width, aheight,
			
 
				-                   interpolate ? kFilterBox : kFilterNone);
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/scale_argb.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/scale_argb.cc
@@ -1,809 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/scale.h"
			
 
				-
			
 
				-#include <assert.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/planar_functions.h"  // For CopyARGB
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/scale_row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-static __inline int Abs(int v) {
			
 
				-  return v >= 0 ? v : -v;
			
 
				-}
			
 
				-
			
 
				-// ScaleARGB ARGB, 1/2
			
 
				-// This is an optimized version for scaling down a ARGB to 1/2 of
			
 
				-// its original size.
			
 
				-static void ScaleARGBDown2(int src_width, int src_height,
			
 
				-                           int dst_width, int dst_height,
			
 
				-                           int src_stride, int dst_stride,
			
 
				-                           const uint8* src_argb, uint8* dst_argb,
			
 
				-                           int x, int dx, int y, int dy,
			
 
				-                           enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  int row_stride = src_stride * (dy >> 16);
			
 
				-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                            uint8* dst_argb, int dst_width) =
			
 
				-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
			
 
				-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
			
 
				-        ScaleARGBRowDown2Box_C);
			
 
				-  assert(dx == 65536 * 2);  // Test scale factor of 2.
			
 
				-  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
			
 
				-  // Advance to odd row, even column.
			
 
				-  if (filtering == kFilterBilinear) {
			
 
				-    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
			
 
				-  } else {
			
 
				-    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
			
 
				-  }
			
 
				-
			
 
				-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
			
 
				-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
			
 
				-        ScaleARGBRowDown2Box_SSE2);
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
			
 
				-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
			
 
				-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
			
 
				-        ScaleARGBRowDown2_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (filtering == kFilterLinear) {
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
			
 
				-    src_argb += row_stride;
			
 
				-    dst_argb += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// ScaleARGB ARGB, 1/4
			
 
				-// This is an optimized version for scaling down a ARGB to 1/4 of
			
 
				-// its original size.
			
 
				-static void ScaleARGBDown4Box(int src_width, int src_height,
			
 
				-                              int dst_width, int dst_height,
			
 
				-                              int src_stride, int dst_stride,
			
 
				-                              const uint8* src_argb, uint8* dst_argb,
			
 
				-                              int x, int dx, int y, int dy) {
			
 
				-  int j;
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-  int row_stride = src_stride * (dy >> 16);
			
 
				-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
			
 
				-  // Advance to odd row, even column.
			
 
				-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
			
 
				-  assert(dx == 65536 * 4);  // Test scale factor of 4.
			
 
				-  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
			
 
				-#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
			
 
				-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
			
 
				-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
			
 
				-    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
			
 
				-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
			
 
				-                      row + kRowSize, dst_width * 2);
			
 
				-    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
			
 
				-    src_argb += row_stride;
			
 
				-    dst_argb += dst_stride;
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row);
			
 
				-}
			
 
				-
			
 
				-// ScaleARGB ARGB Even
			
 
				-// This is an optimized version for scaling down a ARGB to even
			
 
				-// multiple of its original size.
			
 
				-static void ScaleARGBDownEven(int src_width, int src_height,
			
 
				-                              int dst_width, int dst_height,
			
 
				-                              int src_stride, int dst_stride,
			
 
				-                              const uint8* src_argb, uint8* dst_argb,
			
 
				-                              int x, int dx, int y, int dy,
			
 
				-                              enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  int col_step = dx >> 16;
			
 
				-  int row_stride = (dy >> 16) * src_stride;
			
 
				-  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                               int src_step, uint8* dst_argb, int dst_width) =
			
 
				-      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
			
 
				-  assert(IS_ALIGNED(src_width, 2));
			
 
				-  assert(IS_ALIGNED(src_height, 2));
			
 
				-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
			
 
				-#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
			
 
				-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
			
 
				-        ScaleARGBRowDownEven_SSE2;
			
 
				-  }
			
 
				-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
			
 
				-      IS_ALIGNED(src_argb, 4)) {
			
 
				-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
			
 
				-        ScaleARGBRowDownEven_NEON;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  if (filtering == kFilterLinear) {
			
 
				-    src_stride = 0;
			
 
				-  }
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
			
 
				-    src_argb += row_stride;
			
 
				-    dst_argb += dst_stride;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale ARGB down with bilinear interpolation.
			
 
				-static void ScaleARGBBilinearDown(int src_width, int src_height,
			
 
				-                                  int dst_width, int dst_height,
			
 
				-                                  int src_stride, int dst_stride,
			
 
				-                                  const uint8* src_argb, uint8* dst_argb,
			
 
				-                                  int x, int dx, int y, int dy,
			
 
				-                                  enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      int dst_width, int x, int dx) =
			
 
				-      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
			
 
				-  int64 xlast = x + (int64)(dst_width - 1) * dx;
			
 
				-  int64 xl = (dx >= 0) ? x : xlast;
			
 
				-  int64 xr = (dx >= 0) ? xlast : x;
			
 
				-  int clip_src_width;
			
 
				-  xl = (xl >> 16) & ~3;  // Left edge aligned.
			
 
				-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
			
 
				-  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
			
 
				-  if (xr > src_width) {
			
 
				-    xr = src_width;
			
 
				-  }
			
 
				-  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
			
 
				-  src_argb += xl * 4;
			
 
				-  x -= (int)(xl << 16);
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(clip_src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(clip_src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(clip_src_width, 32)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(clip_src_width, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
			
 
				-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
			
 
				-    if (IS_ALIGNED(clip_src_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
			
 
				-  // Allocate a row of ARGB.
			
 
				-  {
			
 
				-    align_buffer_64(row, clip_src_width * 4);
			
 
				-
			
 
				-    const int max_y = (src_height - 1) << 16;
			
 
				-    if (y > max_y) {
			
 
				-      y = max_y;
			
 
				-    }
			
 
				-    for (j = 0; j < dst_height; ++j) {
			
 
				-      int yi = y >> 16;
			
 
				-      const uint8* src = src_argb + yi * src_stride;
			
 
				-      if (filtering == kFilterLinear) {
			
 
				-        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
			
 
				-      } else {
			
 
				-        int yf = (y >> 8) & 255;
			
 
				-        InterpolateRow(row, src, src_stride, clip_src_width, yf);
			
 
				-        ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
			
 
				-      }
			
 
				-      dst_argb += dst_stride;
			
 
				-      y += dy;
			
 
				-      if (y > max_y) {
			
 
				-        y = max_y;
			
 
				-      }
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scale ARGB up with bilinear interpolation.
			
 
				-static void ScaleARGBBilinearUp(int src_width, int src_height,
			
 
				-                                int dst_width, int dst_height,
			
 
				-                                int src_stride, int dst_stride,
			
 
				-                                const uint8* src_argb, uint8* dst_argb,
			
 
				-                                int x, int dx, int y, int dy,
			
 
				-                                enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      int dst_width, int x, int dx) =
			
 
				-      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
			
 
				-  const int max_y = (src_height - 1) << 16;
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(dst_width, 8)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (src_width >= 32768) {
			
 
				-    ScaleARGBFilterCols = filtering ?
			
 
				-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
			
 
				-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SCALEARGBCOLS_SSE2)
			
 
				-  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
			
 
				-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
			
 
				-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-  }
			
 
				-
			
 
				-  if (y > max_y) {
			
 
				-    y = max_y;
			
 
				-  }
			
 
				-
			
 
				-  {
			
 
				-    int yi = y >> 16;
			
 
				-    const uint8* src = src_argb + yi * src_stride;
			
 
				-
			
 
				-    // Allocate 2 rows of ARGB.
			
 
				-    const int kRowSize = (dst_width * 4 + 15) & ~15;
			
 
				-    align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-    uint8* rowptr = row;
			
 
				-    int rowstride = kRowSize;
			
 
				-    int lasty = yi;
			
 
				-
			
 
				-    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
			
 
				-    if (src_height > 1) {
			
 
				-      src += src_stride;
			
 
				-    }
			
 
				-    ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
			
 
				-    src += src_stride;
			
 
				-
			
 
				-    for (j = 0; j < dst_height; ++j) {
			
 
				-      yi = y >> 16;
			
 
				-      if (yi != lasty) {
			
 
				-        if (y > max_y) {
			
 
				-          y = max_y;
			
 
				-          yi = y >> 16;
			
 
				-          src = src_argb + yi * src_stride;
			
 
				-        }
			
 
				-        if (yi != lasty) {
			
 
				-          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
			
 
				-          rowptr += rowstride;
			
 
				-          rowstride = -rowstride;
			
 
				-          lasty = yi;
			
 
				-          src += src_stride;
			
 
				-        }
			
 
				-      }
			
 
				-      if (filtering == kFilterLinear) {
			
 
				-        InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
			
 
				-      } else {
			
 
				-        int yf = (y >> 8) & 255;
			
 
				-        InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
			
 
				-      }
			
 
				-      dst_argb += dst_stride;
			
 
				-      y += dy;
			
 
				-    }
			
 
				-    free_aligned_buffer_64(row);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-#ifdef YUVSCALEUP
			
 
				-// Scale YUV to ARGB up with bilinear interpolation.
			
 
				-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
			
 
				-                                     int dst_width, int dst_height,
			
 
				-                                     int src_stride_y,
			
 
				-                                     int src_stride_u,
			
 
				-                                     int src_stride_v,
			
 
				-                                     int dst_stride_argb,
			
 
				-                                     const uint8* src_y,
			
 
				-                                     const uint8* src_u,
			
 
				-                                     const uint8* src_v,
			
 
				-                                     uint8* dst_argb,
			
 
				-                                     int x, int dx, int y, int dy,
			
 
				-                                     enum FilterMode filtering) {
			
 
				-  int j;
			
 
				-  void (*I422ToARGBRow)(const uint8* y_buf,
			
 
				-                        const uint8* u_buf,
			
 
				-                        const uint8* v_buf,
			
 
				-                        uint8* rgb_buf,
			
 
				-                        int width) = I422ToARGBRow_C;
			
 
				-#if defined(HAS_I422TOARGBROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(src_width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        I422ToARGBRow = I422ToARGBRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(src_width, 16)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(src_width, 8)) {
			
 
				-      I422ToARGBRow = I422ToARGBRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
			
 
				-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
			
 
				-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
			
 
				-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
			
 
				-    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(dst_width, 8)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(dst_width, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
			
 
				-    InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-  }
			
 
				-#endif
			
 
				-
			
 
				-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      int dst_width, int x, int dx) =
			
 
				-      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
			
 
				-  if (src_width >= 32768) {
			
 
				-    ScaleARGBFilterCols = filtering ?
			
 
				-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
			
 
				-  }
			
 
				-#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
			
 
				-  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_SCALEARGBCOLS_SSE2)
			
 
				-  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
			
 
				-    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
			
 
				-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
			
 
				-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-  }
			
 
				-
			
 
				-  const int max_y = (src_height - 1) << 16;
			
 
				-  if (y > max_y) {
			
 
				-    y = max_y;
			
 
				-  }
			
 
				-  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
			
 
				-  int yi = y >> 16;
			
 
				-  int uv_yi = yi >> kYShift;
			
 
				-  const uint8* src_row_y = src_y + yi * src_stride_y;
			
 
				-  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
			
 
				-  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
			
 
				-
			
 
				-  // Allocate 2 rows of ARGB.
			
 
				-  const int kRowSize = (dst_width * 4 + 15) & ~15;
			
 
				-  align_buffer_64(row, kRowSize * 2);
			
 
				-
			
 
				-  // Allocate 1 row of ARGB for source conversion.
			
 
				-  align_buffer_64(argb_row, src_width * 4);
			
 
				-
			
 
				-  uint8* rowptr = row;
			
 
				-  int rowstride = kRowSize;
			
 
				-  int lasty = yi;
			
 
				-
			
 
				-  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
			
 
				-  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
			
 
				-  if (src_height > 1) {
			
 
				-    src_row_y += src_stride_y;
			
 
				-    if (yi & 1) {
			
 
				-      src_row_u += src_stride_u;
			
 
				-      src_row_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
			
 
				-  if (src_height > 2) {
			
 
				-    src_row_y += src_stride_y;
			
 
				-    if (!(yi & 1)) {
			
 
				-      src_row_u += src_stride_u;
			
 
				-      src_row_v += src_stride_v;
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    yi = y >> 16;
			
 
				-    if (yi != lasty) {
			
 
				-      if (y > max_y) {
			
 
				-        y = max_y;
			
 
				-        yi = y >> 16;
			
 
				-        uv_yi = yi >> kYShift;
			
 
				-        src_row_y = src_y + yi * src_stride_y;
			
 
				-        src_row_u = src_u + uv_yi * src_stride_u;
			
 
				-        src_row_v = src_v + uv_yi * src_stride_v;
			
 
				-      }
			
 
				-      if (yi != lasty) {
			
 
				-        // TODO(fbarchard): Convert the clipped region of row.
			
 
				-        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
			
 
				-        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
			
 
				-        rowptr += rowstride;
			
 
				-        rowstride = -rowstride;
			
 
				-        lasty = yi;
			
 
				-        src_row_y += src_stride_y;
			
 
				-        if (yi & 1) {
			
 
				-          src_row_u += src_stride_u;
			
 
				-          src_row_v += src_stride_v;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-    if (filtering == kFilterLinear) {
			
 
				-      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
			
 
				-    } else {
			
 
				-      int yf = (y >> 8) & 255;
			
 
				-      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
			
 
				-    }
			
 
				-    dst_argb += dst_stride_argb;
			
 
				-    y += dy;
			
 
				-  }
			
 
				-  free_aligned_buffer_64(row);
			
 
				-  free_aligned_buffer_64(row_argb);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-// Scale ARGB to/from any dimensions, without interpolation.
			
 
				-// Fixed point math is used for performance: The upper 16 bits
			
 
				-// of x and dx is the integer part of the source position and
			
 
				-// the lower 16 bits are the fixed decimal part.
			
 
				-
			
 
				-static void ScaleARGBSimple(int src_width, int src_height,
			
 
				-                            int dst_width, int dst_height,
			
 
				-                            int src_stride, int dst_stride,
			
 
				-                            const uint8* src_argb, uint8* dst_argb,
			
 
				-                            int x, int dx, int y, int dy) {
			
 
				-  int j;
			
 
				-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      int dst_width, int x, int dx) =
			
 
				-      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
			
 
				-#if defined(HAS_SCALEARGBCOLS_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
			
 
				-    ScaleARGBCols = ScaleARGBCols_SSE2;
			
 
				-  }
			
 
				-#endif
			
 
				-  if (src_width * 2 == dst_width && x < 0x8000) {
			
 
				-    ScaleARGBCols = ScaleARGBColsUp2_C;
			
 
				-#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
			
 
				-    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
			
 
				-        IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
			
 
				-    }
			
 
				-#endif
			
 
				-  }
			
 
				-
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
			
 
				-                  dst_width, x, dx);
			
 
				-    dst_argb += dst_stride;
			
 
				-    y += dy;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// ScaleARGB a ARGB.
			
 
				-// This function in turn calls a scaling function
			
 
				-// suitable for handling the desired resolutions.
			
 
				-static void ScaleARGB(const uint8* src, int src_stride,
			
 
				-                      int src_width, int src_height,
			
 
				-                      uint8* dst, int dst_stride,
			
 
				-                      int dst_width, int dst_height,
			
 
				-                      int clip_x, int clip_y, int clip_width, int clip_height,
			
 
				-                      enum FilterMode filtering) {
			
 
				-  // Initial source x/y coordinate and step values as 16.16 fixed point.
			
 
				-  int x = 0;
			
 
				-  int y = 0;
			
 
				-  int dx = 0;
			
 
				-  int dy = 0;
			
 
				-  // ARGB does not support box filter yet, but allow the user to pass it.
			
 
				-  // Simplify filtering when possible.
			
 
				-  filtering = ScaleFilterReduce(src_width, src_height,
			
 
				-                                dst_width, dst_height,
			
 
				-                                filtering);
			
 
				-
			
 
				-  // Negative src_height means invert the image.
			
 
				-  if (src_height < 0) {
			
 
				-    src_height = -src_height;
			
 
				-    src = src + (src_height - 1) * src_stride;
			
 
				-    src_stride = -src_stride;
			
 
				-  }
			
 
				-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
			
 
				-             &x, &y, &dx, &dy);
			
 
				-  src_width = Abs(src_width);
			
 
				-  if (clip_x) {
			
 
				-    int64 clipf = (int64)(clip_x) * dx;
			
 
				-    x += (clipf & 0xffff);
			
 
				-    src += (clipf >> 16) * 4;
			
 
				-    dst += clip_x * 4;
			
 
				-  }
			
 
				-  if (clip_y) {
			
 
				-    int64 clipf = (int64)(clip_y) * dy;
			
 
				-    y += (clipf & 0xffff);
			
 
				-    src += (clipf >> 16) * src_stride;
			
 
				-    dst += clip_y * dst_stride;
			
 
				-  }
			
 
				-
			
 
				-  // Special case for integer step values.
			
 
				-  if (((dx | dy) & 0xffff) == 0) {
			
 
				-    if (!dx || !dy) {  // 1 pixel wide and/or tall.
			
 
				-      filtering = kFilterNone;
			
 
				-    } else {
			
 
				-      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
			
 
				-      if (!(dx & 0x10000) && !(dy & 0x10000)) {
			
 
				-        if (dx == 0x20000) {
			
 
				-          // Optimized 1/2 downsample.
			
 
				-          ScaleARGBDown2(src_width, src_height,
			
 
				-                         clip_width, clip_height,
			
 
				-                         src_stride, dst_stride, src, dst,
			
 
				-                         x, dx, y, dy, filtering);
			
 
				-          return;
			
 
				-        }
			
 
				-        if (dx == 0x40000 && filtering == kFilterBox) {
			
 
				-          // Optimized 1/4 box downsample.
			
 
				-          ScaleARGBDown4Box(src_width, src_height,
			
 
				-                            clip_width, clip_height,
			
 
				-                            src_stride, dst_stride, src, dst,
			
 
				-                            x, dx, y, dy);
			
 
				-          return;
			
 
				-        }
			
 
				-        ScaleARGBDownEven(src_width, src_height,
			
 
				-                          clip_width, clip_height,
			
 
				-                          src_stride, dst_stride, src, dst,
			
 
				-                          x, dx, y, dy, filtering);
			
 
				-        return;
			
 
				-      }
			
 
				-      // Optimized odd scale down. ie 3, 5, 7, 9x.
			
 
				-      if ((dx & 0x10000) && (dy & 0x10000)) {
			
 
				-        filtering = kFilterNone;
			
 
				-        if (dx == 0x10000 && dy == 0x10000) {
			
 
				-          // Straight copy.
			
 
				-          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
			
 
				-                   dst, dst_stride, clip_width, clip_height);
			
 
				-          return;
			
 
				-        }
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-  if (dx == 0x10000 && (x & 0xffff) == 0) {
			
 
				-    // Arbitrary scale vertically, but unscaled vertically.
			
 
				-    ScalePlaneVertical(src_height,
			
 
				-                       clip_width, clip_height,
			
 
				-                       src_stride, dst_stride, src, dst,
			
 
				-                       x, y, dy, 4, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (filtering && dy < 65536) {
			
 
				-    ScaleARGBBilinearUp(src_width, src_height,
			
 
				-                        clip_width, clip_height,
			
 
				-                        src_stride, dst_stride, src, dst,
			
 
				-                        x, dx, y, dy, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  if (filtering) {
			
 
				-    ScaleARGBBilinearDown(src_width, src_height,
			
 
				-                          clip_width, clip_height,
			
 
				-                          src_stride, dst_stride, src, dst,
			
 
				-                          x, dx, y, dy, filtering);
			
 
				-    return;
			
 
				-  }
			
 
				-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
			
 
				-                  src_stride, dst_stride, src, dst,
			
 
				-                  x, dx, y, dy);
			
 
				-}
			
 
				-
			
 
				-LIBYUV_API
			
 
				-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
			
 
				-                  int src_width, int src_height,
			
 
				-                  uint8* dst_argb, int dst_stride_argb,
			
 
				-                  int dst_width, int dst_height,
			
 
				-                  int clip_x, int clip_y, int clip_width, int clip_height,
			
 
				-                  enum FilterMode filtering) {
			
 
				-  if (!src_argb || src_width == 0 || src_height == 0 ||
			
 
				-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
			
 
				-      clip_x < 0 || clip_y < 0 ||
			
 
				-      (clip_x + clip_width) > dst_width ||
			
 
				-      (clip_y + clip_height) > dst_height) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
			
 
				-            dst_argb, dst_stride_argb, dst_width, dst_height,
			
 
				-            clip_x, clip_y, clip_width, clip_height, filtering);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-// Scale an ARGB image.
			
 
				-LIBYUV_API
			
 
				-int ARGBScale(const uint8* src_argb, int src_stride_argb,
			
 
				-              int src_width, int src_height,
			
 
				-              uint8* dst_argb, int dst_stride_argb,
			
 
				-              int dst_width, int dst_height,
			
 
				-              enum FilterMode filtering) {
			
 
				-  if (!src_argb || src_width == 0 || src_height == 0 ||
			
 
				-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
			
 
				-    return -1;
			
 
				-  }
			
 
				-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
			
 
				-            dst_argb, dst_stride_argb, dst_width, dst_height,
			
 
				-            0, 0, dst_width, dst_height, filtering);
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/scale_argb_neon.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/scale_argb_neon.cc
@@ -1,145 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// This module is for GCC Neon
			
 
				-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
			
 
				-
			
 
				-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
			
 
				-                            uint8* dst, int dst_width) {
			
 
				-  asm volatile (
			
 
				-#ifdef _ANDROID

			
 
				-	".fpu neon\n"

			
 
				-#endif
			
 
				-  "1:                                          \n"
			
 
				-    // load even pixels into q0, odd into q1
			
 
				-    "vld2.32    {q0, q1}, [%0]!                \n"
			
 
				-    "vld2.32    {q2, q3}, [%0]!                \n"
			
 
				-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
			
 
				-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
			
 
				-    "vst1.8     {q3}, [%1]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_ptr),          // %0
			
 
				-    "+r"(dst),              // %1
			
 
				-    "+r"(dst_width)         // %2
			
 
				-  :
			
 
				-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width) {
			
 
				-  asm volatile (
			
 
				-    // change the stride to row 2 pointer
			
 
				-    "add        %1, %1, %0                     \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
			
 
				-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
			
 
				-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
			
 
				-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
			
 
				-    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
			
 
				-    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
			
 
				-    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
			
 
				-    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
			
 
				-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
			
 
				-    "vrshrn.u16 d1, q1, #2                     \n"
			
 
				-    "vrshrn.u16 d2, q2, #2                     \n"
			
 
				-    "vrshrn.u16 d3, q3, #2                     \n"
			
 
				-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_ptr),          // %0
			
 
				-    "+r"(src_stride),       // %1
			
 
				-    "+r"(dst),              // %2
			
 
				-    "+r"(dst_width)         // %3
			
 
				-  :
			
 
				-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Reads 4 pixels at a time.
			
 
				-// Alignment requirement: src_argb 4 byte aligned.
			
 
				-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
			
 
				-                               uint8* dst_argb, int dst_width) {
			
 
				-  asm volatile (
			
 
				-    "mov        r12, %3, lsl #2                \n"
			
 
				-    ".p2align  2                               \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.32    {d0[0]}, [%0], r12             \n"
			
 
				-    "vld1.32    {d0[1]}, [%0], r12             \n"
			
 
				-    "vld1.32    {d1[0]}, [%0], r12             \n"
			
 
				-    "vld1.32    {d1[1]}, [%0], r12             \n"
			
 
				-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
			
 
				-    "vst1.8     {q0}, [%1]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(dst_argb),    // %1
			
 
				-    "+r"(dst_width)    // %2
			
 
				-  : "r"(src_stepx)     // %3
			
 
				-  : "memory", "cc", "r12", "q0"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-// Reads 4 pixels at a time.
			
 
				-// Alignment requirement: src_argb 4 byte aligned.
			
 
				-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                                  int src_stepx,
			
 
				-                                  uint8* dst_argb, int dst_width) {
			
 
				-  asm volatile (
			
 
				-    "mov       r12, %4, lsl #2                 \n"
			
 
				-    "add       %1, %1, %0                      \n"
			
 
				-    ".p2align  2                               \n"
			
 
				-  "1:                                          \n"
			
 
				-    "vld1.8    {d0}, [%0], r12                 \n"  // Read 4 2x2 blocks -> 2x1
			
 
				-    "vld1.8    {d1}, [%1], r12                 \n"
			
 
				-    "vld1.8    {d2}, [%0], r12                 \n"
			
 
				-    "vld1.8    {d3}, [%1], r12                 \n"
			
 
				-    "vld1.8    {d4}, [%0], r12                 \n"
			
 
				-    "vld1.8    {d5}, [%1], r12                 \n"
			
 
				-    "vld1.8    {d6}, [%0], r12                 \n"
			
 
				-    "vld1.8    {d7}, [%1], r12                 \n"
			
 
				-    "vaddl.u8  q0, d0, d1                      \n"
			
 
				-    "vaddl.u8  q1, d2, d3                      \n"
			
 
				-    "vaddl.u8  q2, d4, d5                      \n"
			
 
				-    "vaddl.u8  q3, d6, d7                      \n"
			
 
				-    "vswp.8    d1, d2                          \n"  // ab_cd -> ac_bd
			
 
				-    "vswp.8    d5, d6                          \n"  // ef_gh -> eg_fh
			
 
				-    "vadd.u16  q0, q0, q1                      \n"  // (a+b)_(c+d)
			
 
				-    "vadd.u16  q2, q2, q3                      \n"  // (e+f)_(g+h)
			
 
				-    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
			
 
				-    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
			
 
				-    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
			
 
				-    "vst1.8     {q0}, [%2]!                    \n"
			
 
				-    "bgt        1b                             \n"
			
 
				-  : "+r"(src_argb),    // %0
			
 
				-    "+r"(src_stride),  // %1
			
 
				-    "+r"(dst_argb),    // %2
			
 
				-    "+r"(dst_width)    // %3
			
 
				-  : "r"(src_stepx)     // %4
			
 
				-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
			
 
				-  );
			
 
				-}
			
 
				-#endif  // __ARM_NEON__
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/scale_common.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/scale_common.cc
@@ -1,772 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/scale.h"
			
 
				-
			
 
				-#include <assert.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-#include "libyuv/cpu_id.h"
			
 
				-#include "libyuv/planar_functions.h"  // For CopyARGB
			
 
				-#include "libyuv/row.h"
			
 
				-#include "libyuv/scale_row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-static __inline int Abs(int v) {
			
 
				-  return v >= 0 ? v : -v;
			
 
				-}
			
 
				-
			
 
				-// CPU agnostic row functions
			
 
				-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                     uint8* dst, int dst_width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = src_ptr[1];
			
 
				-    dst[1] = src_ptr[3];
			
 
				-    dst += 2;
			
 
				-    src_ptr += 4;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src_ptr[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                           uint8* dst, int dst_width) {
			
 
				-  const uint8* s = src_ptr;
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = (s[0] + s[1] + 1) >> 1;
			
 
				-    dst[1] = (s[2] + s[3] + 1) >> 1;
			
 
				-    dst += 2;
			
 
				-    s += 4;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = (s[0] + s[1] + 1) >> 1;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst, int dst_width) {
			
 
				-  const uint8* s = src_ptr;
			
 
				-  const uint8* t = src_ptr + src_stride;
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
			
 
				-    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
			
 
				-    dst += 2;
			
 
				-    s += 4;
			
 
				-    t += 4;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                     uint8* dst, int dst_width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = src_ptr[2];
			
 
				-    dst[1] = src_ptr[6];
			
 
				-    dst += 2;
			
 
				-    src_ptr += 8;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src_ptr[2];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                        uint8* dst, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
			
 
				-             src_ptr[stride + 0] + src_ptr[stride + 1] +
			
 
				-             src_ptr[stride + 2] + src_ptr[stride + 3] +
			
 
				-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
			
 
				-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
			
 
				-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
			
 
				-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
			
 
				-             8) >> 4;
			
 
				-    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
			
 
				-             src_ptr[stride + 4] + src_ptr[stride + 5] +
			
 
				-             src_ptr[stride + 6] + src_ptr[stride + 7] +
			
 
				-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
			
 
				-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
			
 
				-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
			
 
				-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
			
 
				-             8) >> 4;
			
 
				-    dst += 2;
			
 
				-    src_ptr += 8;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
			
 
				-             src_ptr[stride + 0] + src_ptr[stride + 1] +
			
 
				-             src_ptr[stride + 2] + src_ptr[stride + 3] +
			
 
				-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
			
 
				-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
			
 
				-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
			
 
				-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
			
 
				-             8) >> 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                      uint8* dst, int dst_width) {
			
 
				-  int x;
			
 
				-  assert((dst_width % 3 == 0) && (dst_width > 0));
			
 
				-  for (x = 0; x < dst_width; x += 3) {
			
 
				-    dst[0] = src_ptr[0];
			
 
				-    dst[1] = src_ptr[1];
			
 
				-    dst[2] = src_ptr[3];
			
 
				-    dst += 3;
			
 
				-    src_ptr += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Filter rows 0 and 1 together, 3 : 1
			
 
				-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* d, int dst_width) {
			
 
				-  const uint8* s = src_ptr;
			
 
				-  const uint8* t = src_ptr + src_stride;
			
 
				-  int x;
			
 
				-  assert((dst_width % 3 == 0) && (dst_width > 0));
			
 
				-  for (x = 0; x < dst_width; x += 3) {
			
 
				-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
			
 
				-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
			
 
				-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
			
 
				-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
			
 
				-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
			
 
				-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
			
 
				-    d[0] = (a0 * 3 + b0 + 2) >> 2;
			
 
				-    d[1] = (a1 * 3 + b1 + 2) >> 2;
			
 
				-    d[2] = (a2 * 3 + b2 + 2) >> 2;
			
 
				-    d += 3;
			
 
				-    s += 4;
			
 
				-    t += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Filter rows 1 and 2 together, 1 : 1
			
 
				-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* d, int dst_width) {
			
 
				-  const uint8* s = src_ptr;
			
 
				-  const uint8* t = src_ptr + src_stride;
			
 
				-  int x;
			
 
				-  assert((dst_width % 3 == 0) && (dst_width > 0));
			
 
				-  for (x = 0; x < dst_width; x += 3) {
			
 
				-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
			
 
				-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
			
 
				-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
			
 
				-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
			
 
				-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
			
 
				-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
			
 
				-    d[0] = (a0 + b0 + 1) >> 1;
			
 
				-    d[1] = (a1 + b1 + 1) >> 1;
			
 
				-    d[2] = (a2 + b2 + 1) >> 1;
			
 
				-    d += 3;
			
 
				-    s += 4;
			
 
				-    t += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scales a single row of pixels using point sampling.
			
 
				-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                 int dst_width, int x, int dx) {
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    dst_ptr[0] = src_ptr[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst_ptr[1] = src_ptr[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst_ptr += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst_ptr[0] = src_ptr[x >> 16];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scales a single row of pixels up by 2x using point sampling.
			
 
				-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                    int dst_width, int x, int dx) {
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
			
 
				-    src_ptr += 1;
			
 
				-    dst_ptr += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst_ptr[0] = src_ptr[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// (1-f)a + fb can be replaced with a + f(b-a)
			
 
				-#define BLENDER(a, b, f) (uint8)((int)(a) + \
			
 
				-    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
			
 
				-
			
 
				-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                       int dst_width, int x, int dx) {
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    int xi = x >> 16;
			
 
				-    int a = src_ptr[xi];
			
 
				-    int b = src_ptr[xi + 1];
			
 
				-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
			
 
				-    x += dx;
			
 
				-    xi = x >> 16;
			
 
				-    a = src_ptr[xi];
			
 
				-    b = src_ptr[xi + 1];
			
 
				-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
			
 
				-    x += dx;
			
 
				-    dst_ptr += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    int xi = x >> 16;
			
 
				-    int a = src_ptr[xi];
			
 
				-    int b = src_ptr[xi + 1];
			
 
				-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
			
 
				-                         int dst_width, int x32, int dx) {
			
 
				-  int64 x = (int64)(x32);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    int64 xi = x >> 16;
			
 
				-    int a = src_ptr[xi];
			
 
				-    int b = src_ptr[xi + 1];
			
 
				-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
			
 
				-    x += dx;
			
 
				-    xi = x >> 16;
			
 
				-    a = src_ptr[xi];
			
 
				-    b = src_ptr[xi + 1];
			
 
				-    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
			
 
				-    x += dx;
			
 
				-    dst_ptr += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    int64 xi = x >> 16;
			
 
				-    int a = src_ptr[xi];
			
 
				-    int b = src_ptr[xi + 1];
			
 
				-    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
			
 
				-  }
			
 
				-}
			
 
				-#undef BLENDER
			
 
				-
			
 
				-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                      uint8* dst, int dst_width) {
			
 
				-  int x;
			
 
				-  assert(dst_width % 3 == 0);
			
 
				-  for (x = 0; x < dst_width; x += 3) {
			
 
				-    dst[0] = src_ptr[0];
			
 
				-    dst[1] = src_ptr[3];
			
 
				-    dst[2] = src_ptr[6];
			
 
				-    dst += 3;
			
 
				-    src_ptr += 8;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8x3 -> 3x1
			
 
				-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
			
 
				-                            ptrdiff_t src_stride,
			
 
				-                            uint8* dst_ptr, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  int i;
			
 
				-  assert((dst_width % 3 == 0) && (dst_width > 0));
			
 
				-  for (i = 0; i < dst_width; i += 3) {
			
 
				-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
			
 
				-        src_ptr[stride + 0] + src_ptr[stride + 1] +
			
 
				-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
			
 
				-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
			
 
				-        (65536 / 9) >> 16;
			
 
				-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
			
 
				-        src_ptr[stride + 3] + src_ptr[stride + 4] +
			
 
				-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
			
 
				-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
			
 
				-        (65536 / 9) >> 16;
			
 
				-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
			
 
				-        src_ptr[stride + 6] + src_ptr[stride + 7] +
			
 
				-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
			
 
				-        (65536 / 6) >> 16;
			
 
				-    src_ptr += 8;
			
 
				-    dst_ptr += 3;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// 8x2 -> 3x1
			
 
				-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                            uint8* dst_ptr, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  int i;
			
 
				-  assert((dst_width % 3 == 0) && (dst_width > 0));
			
 
				-  for (i = 0; i < dst_width; i += 3) {
			
 
				-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
			
 
				-        src_ptr[stride + 0] + src_ptr[stride + 1] +
			
 
				-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
			
 
				-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
			
 
				-        src_ptr[stride + 3] + src_ptr[stride + 4] +
			
 
				-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
			
 
				-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
			
 
				-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
			
 
				-        (65536 / 4) >> 16;
			
 
				-    src_ptr += 8;
			
 
				-    dst_ptr += 3;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                    uint16* dst_ptr, int src_width, int src_height) {
			
 
				-  int x;
			
 
				-  assert(src_width > 0);
			
 
				-  assert(src_height > 0);
			
 
				-  for (x = 0; x < src_width; ++x) {
			
 
				-    const uint8* s = src_ptr + x;
			
 
				-    unsigned int sum = 0u;
			
 
				-    int y;
			
 
				-    for (y = 0; y < src_height; ++y) {
			
 
				-      sum += s[0];
			
 
				-      s += src_stride;
			
 
				-    }
			
 
				-    // TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
			
 
				-    dst_ptr[x] = sum < 65535u ? sum : 65535u;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDown2_C(const uint8* src_argb,
			
 
				-                         ptrdiff_t src_stride,
			
 
				-                         uint8* dst_argb, int dst_width) {
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = src[1];
			
 
				-    dst[1] = src[3];
			
 
				-    src += 4;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src[1];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               uint8* dst_argb, int dst_width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width; ++x) {
			
 
				-    dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
			
 
				-    dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
			
 
				-    dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
			
 
				-    dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
			
 
				-    src_argb += 8;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                            uint8* dst_argb, int dst_width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width; ++x) {
			
 
				-    dst_argb[0] = (src_argb[0] + src_argb[4] +
			
 
				-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
			
 
				-    dst_argb[1] = (src_argb[1] + src_argb[5] +
			
 
				-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
			
 
				-    dst_argb[2] = (src_argb[2] + src_argb[6] +
			
 
				-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
			
 
				-    dst_argb[3] = (src_argb[3] + src_argb[7] +
			
 
				-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
			
 
				-    src_argb += 8;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
			
 
				-                            int src_stepx,
			
 
				-                            uint8* dst_argb, int dst_width) {
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width - 1; x += 2) {
			
 
				-    dst[0] = src[0];
			
 
				-    dst[1] = src[src_stepx];
			
 
				-    src += src_stepx * 2;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
			
 
				-                               ptrdiff_t src_stride,
			
 
				-                               int src_stepx,
			
 
				-                               uint8* dst_argb, int dst_width) {
			
 
				-  int x;
			
 
				-  for (x = 0; x < dst_width; ++x) {
			
 
				-    dst_argb[0] = (src_argb[0] + src_argb[4] +
			
 
				-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
			
 
				-    dst_argb[1] = (src_argb[1] + src_argb[5] +
			
 
				-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
			
 
				-    dst_argb[2] = (src_argb[2] + src_argb[6] +
			
 
				-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
			
 
				-    dst_argb[3] = (src_argb[3] + src_argb[7] +
			
 
				-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
			
 
				-    src_argb += src_stepx * 4;
			
 
				-    dst_argb += 4;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scales a single row of pixels using point sampling.
			
 
				-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                     int dst_width, int x, int dx) {
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    dst[0] = src[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst[1] = src[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src[x >> 16];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                       int dst_width, int x32, int dx) {
			
 
				-  int64 x = (int64)(x32);
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    dst[0] = src[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst[1] = src[x >> 16];
			
 
				-    x += dx;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src[x >> 16];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Scales a single row of pixels up by 2x using point sampling.
			
 
				-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                        int dst_width, int x, int dx) {
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    dst[1] = dst[0] = src[0];
			
 
				-    src += 1;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    dst[0] = src[0];
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Mimics SSSE3 blender
			
 
				-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
			
 
				-#define BLENDERC(a, b, f, s) (uint32)( \
			
 
				-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
			
 
				-#define BLENDER(a, b, f) \
			
 
				-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
			
 
				-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
			
 
				-
			
 
				-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                           int dst_width, int x, int dx) {
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    int xi = x >> 16;
			
 
				-    int xf = (x >> 9) & 0x7f;
			
 
				-    uint32 a = src[xi];
			
 
				-    uint32 b = src[xi + 1];
			
 
				-    dst[0] = BLENDER(a, b, xf);
			
 
				-    x += dx;
			
 
				-    xi = x >> 16;
			
 
				-    xf = (x >> 9) & 0x7f;
			
 
				-    a = src[xi];
			
 
				-    b = src[xi + 1];
			
 
				-    dst[1] = BLENDER(a, b, xf);
			
 
				-    x += dx;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    int xi = x >> 16;
			
 
				-    int xf = (x >> 9) & 0x7f;
			
 
				-    uint32 a = src[xi];
			
 
				-    uint32 b = src[xi + 1];
			
 
				-    dst[0] = BLENDER(a, b, xf);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
			
 
				-                             int dst_width, int x32, int dx) {
			
 
				-  int64 x = (int64)(x32);
			
 
				-  const uint32* src = (const uint32*)(src_argb);
			
 
				-  uint32* dst = (uint32*)(dst_argb);
			
 
				-  int j;
			
 
				-  for (j = 0; j < dst_width - 1; j += 2) {
			
 
				-    int64 xi = x >> 16;
			
 
				-    int xf = (x >> 9) & 0x7f;
			
 
				-    uint32 a = src[xi];
			
 
				-    uint32 b = src[xi + 1];
			
 
				-    dst[0] = BLENDER(a, b, xf);
			
 
				-    x += dx;
			
 
				-    xi = x >> 16;
			
 
				-    xf = (x >> 9) & 0x7f;
			
 
				-    a = src[xi];
			
 
				-    b = src[xi + 1];
			
 
				-    dst[1] = BLENDER(a, b, xf);
			
 
				-    x += dx;
			
 
				-    dst += 2;
			
 
				-  }
			
 
				-  if (dst_width & 1) {
			
 
				-    int64 xi = x >> 16;
			
 
				-    int xf = (x >> 9) & 0x7f;
			
 
				-    uint32 a = src[xi];
			
 
				-    uint32 b = src[xi + 1];
			
 
				-    dst[0] = BLENDER(a, b, xf);
			
 
				-  }
			
 
				-}
			
 
				-#undef BLENDER1
			
 
				-#undef BLENDERC
			
 
				-#undef BLENDER
			
 
				-
			
 
				-// Scale plane vertically with bilinear interpolation.
			
 
				-void ScalePlaneVertical(int src_height,
			
 
				-                        int dst_width, int dst_height,
			
 
				-                        int src_stride, int dst_stride,
			
 
				-                        const uint8* src_argb, uint8* dst_argb,
			
 
				-                        int x, int y, int dy,
			
 
				-                        int bpp, enum FilterMode filtering) {
			
 
				-  // TODO(fbarchard): Allow higher bpp.
			
 
				-  int dst_width_bytes = dst_width * bpp;
			
 
				-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
			
 
				-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
			
 
				-      InterpolateRow_C;
			
 
				-  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
			
 
				-  int j;
			
 
				-  assert(bpp >= 1 && bpp <= 4);
			
 
				-  assert(src_height != 0);
			
 
				-  assert(dst_width > 0);
			
 
				-  assert(dst_height > 0);
			
 
				-  src_argb += (x >> 16) * bpp;
			
 
				-#if defined(HAS_INTERPOLATEROW_SSE2)
			
 
				-  if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSE2;
			
 
				-    if (IS_ALIGNED(dst_width_bytes, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSE2;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSE2;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_SSSE3)
			
 
				-  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_SSSE3;
			
 
				-    if (IS_ALIGNED(dst_width_bytes, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
			
 
				-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
			
 
				-          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
			
 
				-        InterpolateRow = InterpolateRow_SSSE3;
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_AVX2)
			
 
				-  if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
			
 
				-    InterpolateRow = InterpolateRow_Any_AVX2;
			
 
				-    if (IS_ALIGNED(dst_width_bytes, 32)) {
			
 
				-      InterpolateRow = InterpolateRow_AVX2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROW_NEON)
			
 
				-  if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
			
 
				-    InterpolateRow = InterpolateRow_Any_NEON;
			
 
				-    if (IS_ALIGNED(dst_width_bytes, 16)) {
			
 
				-      InterpolateRow = InterpolateRow_NEON;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
			
 
				-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
			
 
				-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
			
 
				-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
			
 
				-    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
			
 
				-    if (IS_ALIGNED(dst_width_bytes, 4)) {
			
 
				-      InterpolateRow = InterpolateRow_MIPS_DSPR2;
			
 
				-    }
			
 
				-  }
			
 
				-#endif
			
 
				-  for (j = 0; j < dst_height; ++j) {
			
 
				-    int yi;
			
 
				-    int yf;
			
 
				-    if (y > max_y) {
			
 
				-      y = max_y;
			
 
				-    }
			
 
				-    yi = y >> 16;
			
 
				-    yf = filtering ? ((y >> 8) & 255) : 0;
			
 
				-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
			
 
				-                   src_stride, dst_width_bytes, yf);
			
 
				-    dst_argb += dst_stride;
			
 
				-    y += dy;
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-// Simplify the filtering based on scale factors.
			
 
				-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
			
 
				-                                  int dst_width, int dst_height,
			
 
				-                                  enum FilterMode filtering) {
			
 
				-  if (src_width < 0) {
			
 
				-    src_width = -src_width;
			
 
				-  }
			
 
				-  if (src_height < 0) {
			
 
				-    src_height = -src_height;
			
 
				-  }
			
 
				-  if (filtering == kFilterBox) {
			
 
				-    // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
			
 
				-    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
			
 
				-      filtering = kFilterBilinear;
			
 
				-    }
			
 
				-    // If scaling to larger, switch from Box to Bilinear.
			
 
				-    if (dst_width >= src_width || dst_height >= src_height) {
			
 
				-      filtering = kFilterBilinear;
			
 
				-    }
			
 
				-  }
			
 
				-  if (filtering == kFilterBilinear) {
			
 
				-    if (src_height == 1) {
			
 
				-      filtering = kFilterLinear;
			
 
				-    }
			
 
				-    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
			
 
				-    if (dst_height == src_height || dst_height * 3 == src_height) {
			
 
				-      filtering = kFilterLinear;
			
 
				-    }
			
 
				-    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
			
 
				-    // avoid reading 2 pixels horizontally that causes memory exception.
			
 
				-    if (src_width == 1) {
			
 
				-      filtering = kFilterNone;
			
 
				-    }
			
 
				-  }
			
 
				-  if (filtering == kFilterLinear) {
			
 
				-    if (src_width == 1) {
			
 
				-      filtering = kFilterNone;
			
 
				-    }
			
 
				-    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
			
 
				-    if (dst_width == src_width || dst_width * 3 == src_width) {
			
 
				-      filtering = kFilterNone;
			
 
				-    }
			
 
				-  }
			
 
				-  return filtering;
			
 
				-}
			
 
				-
			
 
				-// Divide num by div and return as 16.16 fixed point result.
			
 
				-int FixedDiv_C(int num, int div) {
			
 
				-  return (int)(((int64)(num) << 16) / div);
			
 
				-}
			
 
				-
			
 
				-// Divide num by div and return as 16.16 fixed point result.
			
 
				-int FixedDiv1_C(int num, int div) {
			
 
				-  return (int)((((int64)(num) << 16) - 0x00010001) /
			
 
				-                          (div - 1));
			
 
				-}
			
 
				-
			
 
				-#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
			
 
				-
			
 
				-// Compute slope values for stepping.
			
 
				-void ScaleSlope(int src_width, int src_height,
			
 
				-                int dst_width, int dst_height,
			
 
				-                enum FilterMode filtering,
			
 
				-                int* x, int* y, int* dx, int* dy) {
			
 
				-  assert(x != NULL);
			
 
				-  assert(y != NULL);
			
 
				-  assert(dx != NULL);
			
 
				-  assert(dy != NULL);
			
 
				-  assert(src_width != 0);
			
 
				-  assert(src_height != 0);
			
 
				-  assert(dst_width > 0);
			
 
				-  assert(dst_height > 0);
			
 
				-  // Check for 1 pixel and avoid FixedDiv overflow.
			
 
				-  if (dst_width == 1 && src_width >= 32768) {
			
 
				-    dst_width = src_width;
			
 
				-  }
			
 
				-  if (dst_height == 1 && src_height >= 32768) {
			
 
				-    dst_height = src_height;
			
 
				-  }
			
 
				-  if (filtering == kFilterBox) {
			
 
				-    // Scale step for point sampling duplicates all pixels equally.
			
 
				-    *dx = FixedDiv(Abs(src_width), dst_width);
			
 
				-    *dy = FixedDiv(src_height, dst_height);
			
 
				-    *x = 0;
			
 
				-    *y = 0;
			
 
				-  } else if (filtering == kFilterBilinear) {
			
 
				-    // Scale step for bilinear sampling renders last pixel once for upsample.
			
 
				-    if (dst_width <= Abs(src_width)) {
			
 
				-      *dx = FixedDiv(Abs(src_width), dst_width);
			
 
				-      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
			
 
				-    } else if (dst_width > 1) {
			
 
				-      *dx = FixedDiv1(Abs(src_width), dst_width);
			
 
				-      *x = 0;
			
 
				-    }
			
 
				-    if (dst_height <= src_height) {
			
 
				-      *dy = FixedDiv(src_height,  dst_height);
			
 
				-      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
			
 
				-    } else if (dst_height > 1) {
			
 
				-      *dy = FixedDiv1(src_height, dst_height);
			
 
				-      *y = 0;
			
 
				-    }
			
 
				-  } else if (filtering == kFilterLinear) {
			
 
				-    // Scale step for bilinear sampling renders last pixel once for upsample.
			
 
				-    if (dst_width <= Abs(src_width)) {
			
 
				-      *dx = FixedDiv(Abs(src_width), dst_width);
			
 
				-      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
			
 
				-    } else if (dst_width > 1) {
			
 
				-      *dx = FixedDiv1(Abs(src_width), dst_width);
			
 
				-      *x = 0;
			
 
				-    }
			
 
				-    *dy = FixedDiv(src_height, dst_height);
			
 
				-    *y = *dy >> 1;
			
 
				-  } else {
			
 
				-    // Scale step for point sampling duplicates all pixels equally.
			
 
				-    *dx = FixedDiv(Abs(src_width), dst_width);
			
 
				-    *dy = FixedDiv(src_height, dst_height);
			
 
				-    *x = CENTERSTART(*dx, 0);
			
 
				-    *y = CENTERSTART(*dy, 0);
			
 
				-  }
			
 
				-  // Negative src_width means horizontally mirror.
			
 
				-  if (src_width < 0) {
			
 
				-    *x += (dst_width - 1) * *dx;
			
 
				-    *dx = -*dx;
			
 
				-    // src_width = -src_width;   // Caller must do this.
			
 
				-  }
			
 
				-}
			
 
				-#undef CENTERSTART
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
--- a/drivers/theoraplayer/src/YUV/libyuv/src/scale_mips.cc
+++ b/drivers/theoraplayer/src/YUV/libyuv/src/scale_mips.cc
@@ -1,653 +0,0 @@
 
				-/*
			
 
				- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
			
 
				- *
			
 
				- *  Use of this source code is governed by a BSD-style license
			
 
				- *  that can be found in the LICENSE file in the root of the source
			
 
				- *  tree. An additional intellectual property rights grant can be found
			
 
				- *  in the file PATENTS. All contributing project authors may
			
 
				- *  be found in the AUTHORS file in the root of the source tree.
			
 
				- */
			
 
				-
			
 
				-#include "libyuv/basic_types.h"
			
 
				-#include "libyuv/row.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-namespace libyuv {
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// This module is for GCC MIPS DSPR2
			
 
				-#if !defined(LIBYUV_DISABLE_MIPS) && \
			
 
				-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-
			
 
				-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                              uint8* dst, int dst_width) {
			
 
				-  __asm__ __volatile__(
			
 
				-    ".set push                                     \n"
			
 
				-    ".set noreorder                                \n"
			
 
				-
			
 
				-    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
			
 
				-    "beqz           $t9, 2f                        \n"
			
 
				-    " nop                                          \n"
			
 
				-
			
 
				-    ".p2align       2                              \n"
			
 
				-  "1:                                              \n"
			
 
				-    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
			
 
				-    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
			
 
				-    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
			
 
				-    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
			
 
				-    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
			
 
				-    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
			
 
				-    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
			
 
				-    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
			
 
				-    // TODO(fbarchard): Use odd pixels instead of even.
			
 
				-    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
			
 
				-    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
			
 
				-    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
			
 
				-    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
			
 
				-    "addiu          %[src_ptr], %[src_ptr], 32     \n"
			
 
				-    "addiu          $t9, $t9, -1                   \n"
			
 
				-    "sw             $t8, 0(%[dst])                 \n"
			
 
				-    "sw             $t0, 4(%[dst])                 \n"
			
 
				-    "sw             $t1, 8(%[dst])                 \n"
			
 
				-    "sw             $t2, 12(%[dst])                \n"
			
 
				-    "bgtz           $t9, 1b                        \n"
			
 
				-    " addiu         %[dst], %[dst], 16             \n"
			
 
				-
			
 
				-  "2:                                              \n"
			
 
				-    "andi           $t9, %[dst_width], 0xf         \n"  // residue
			
 
				-    "beqz           $t9, 3f                        \n"
			
 
				-    " nop                                          \n"
			
 
				-
			
 
				-  "21:                                             \n"
			
 
				-    "lbu            $t0, 0(%[src_ptr])             \n"
			
 
				-    "addiu          %[src_ptr], %[src_ptr], 2      \n"
			
 
				-    "addiu          $t9, $t9, -1                   \n"
			
 
				-    "sb             $t0, 0(%[dst])                 \n"
			
 
				-    "bgtz           $t9, 21b                       \n"
			
 
				-    " addiu         %[dst], %[dst], 1              \n"
			
 
				-
			
 
				-  "3:                                              \n"
			
 
				-    ".set pop                                      \n"
			
 
				-  : [src_ptr] "+r" (src_ptr),
			
 
				-    [dst] "+r" (dst)
			
 
				-  : [dst_width] "r" (dst_width)
			
 
				-  : "t0", "t1", "t2", "t3", "t4", "t5",
			
 
				-    "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                 uint8* dst, int dst_width) {
			
 
				-  const uint8* t = src_ptr + src_stride;
			
 
				-
			
 
				-  __asm__ __volatile__ (
			
 
				-    ".set push                                    \n"
			
 
				-    ".set noreorder                               \n"
			
 
				-
			
 
				-    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
			
 
				-    "bltz           $t9, 2f                       \n"
			
 
				-    " nop                                         \n"
			
 
				-
			
 
				-    ".p2align       2                             \n"
			
 
				-  "1:                                             \n"
			
 
				-    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
			
 
				-    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
			
 
				-    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
			
 
				-    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
			
 
				-    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
			
 
				-    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
			
 
				-    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
			
 
				-    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
			
 
				-    "addiu          $t9, $t9, -1                  \n"
			
 
				-    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
			
 
				-    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
			
 
				-    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
			
 
				-    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
			
 
				-    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
			
 
				-    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
			
 
				-    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
			
 
				-    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
			
 
				-    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
			
 
				-    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
			
 
				-    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
			
 
				-    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
			
 
				-    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
			
 
				-    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
			
 
				-    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
			
 
				-    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
			
 
				-    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
			
 
				-    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
			
 
				-    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
			
 
				-    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
			
 
				-    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
			
 
				-    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
			
 
				-    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
			
 
				-    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
			
 
				-    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
			
 
				-    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
			
 
				-    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
			
 
				-    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
			
 
				-    "addiu          %[src_ptr], %[src_ptr], 16    \n"
			
 
				-    "addiu          %[t], %[t], 16                \n"
			
 
				-    "sb             $t0, 0(%[dst])                \n"
			
 
				-    "sb             $t4, 1(%[dst])                \n"
			
 
				-    "sb             $t1, 2(%[dst])                \n"
			
 
				-    "sb             $t5, 3(%[dst])                \n"
			
 
				-    "sb             $t2, 4(%[dst])                \n"
			
 
				-    "sb             $t6, 5(%[dst])                \n"
			
 
				-    "sb             $t3, 6(%[dst])                \n"
			
 
				-    "sb             $t7, 7(%[dst])                \n"
			
 
				-    "bgtz           $t9, 1b                       \n"
			
 
				-    " addiu         %[dst], %[dst], 8             \n"
			
 
				-
			
 
				-  "2:                                             \n"
			
 
				-    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
			
 
				-    "beqz           $t9, 3f                       \n"
			
 
				-    " nop                                         \n"
			
 
				-
			
 
				-    "21:                                          \n"
			
 
				-    "lwr            $t1, 0(%[src_ptr])            \n"
			
 
				-    "lwl            $t1, 3(%[src_ptr])            \n"
			
 
				-    "lwr            $t2, 0(%[t])                  \n"
			
 
				-    "lwl            $t2, 3(%[t])                  \n"
			
 
				-    "srl            $t8, $t1, 16                  \n"
			
 
				-    "ins            $t1, $t2, 16, 16              \n"
			
 
				-    "ins            $t2, $t8, 0, 16               \n"
			
 
				-    "raddu.w.qb     $t1, $t1                      \n"
			
 
				-    "raddu.w.qb     $t2, $t2                      \n"
			
 
				-    "shra_r.w       $t1, $t1, 2                   \n"
			
 
				-    "shra_r.w       $t2, $t2, 2                   \n"
			
 
				-    "sb             $t1, 0(%[dst])                \n"
			
 
				-    "sb             $t2, 1(%[dst])                \n"
			
 
				-    "addiu          %[src_ptr], %[src_ptr], 4     \n"
			
 
				-    "addiu          $t9, $t9, -2                  \n"
			
 
				-    "addiu          %[t], %[t], 4                 \n"
			
 
				-    "bgtz           $t9, 21b                      \n"
			
 
				-    " addiu         %[dst], %[dst], 2             \n"
			
 
				-
			
 
				-  "3:                                             \n"
			
 
				-    ".set pop                                     \n"
			
 
				-
			
 
				-  : [src_ptr] "+r" (src_ptr),
			
 
				-    [dst] "+r" (dst), [t] "+r" (t)
			
 
				-  : [dst_width] "r" (dst_width)
			
 
				-  : "t0", "t1", "t2", "t3", "t4", "t5",
			
 
				-    "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                              uint8* dst, int dst_width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                    \n"
			
 
				-      ".set noreorder                               \n"
			
 
				-
			
 
				-      "srl            $t9, %[dst_width], 3          \n"
			
 
				-      "beqz           $t9, 2f                       \n"
			
 
				-      " nop                                         \n"
			
 
				-
			
 
				-      ".p2align       2                             \n"
			
 
				-     "1:                                            \n"
			
 
				-      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
			
 
				-      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
			
 
				-      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
			
 
				-      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
			
 
				-      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
			
 
				-      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
			
 
				-      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
			
 
				-      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
			
 
				-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
			
 
				-      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
			
 
				-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
			
 
				-      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
			
 
				-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
			
 
				-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
			
 
				-      "addiu          %[src_ptr], %[src_ptr], 32    \n"
			
 
				-      "addiu          $t9, $t9, -1                  \n"
			
 
				-      "sw             $t1, 0(%[dst])                \n"
			
 
				-      "sw             $t5, 4(%[dst])                \n"
			
 
				-      "bgtz           $t9, 1b                       \n"
			
 
				-      " addiu         %[dst], %[dst], 8             \n"
			
 
				-
			
 
				-    "2:                                             \n"
			
 
				-      "andi           $t9, %[dst_width], 7          \n"  // residue
			
 
				-      "beqz           $t9, 3f                       \n"
			
 
				-      " nop                                         \n"
			
 
				-
			
 
				-    "21:                                            \n"
			
 
				-      "lbu            $t1, 0(%[src_ptr])            \n"
			
 
				-      "addiu          %[src_ptr], %[src_ptr], 4     \n"
			
 
				-      "addiu          $t9, $t9, -1                  \n"
			
 
				-      "sb             $t1, 0(%[dst])                \n"
			
 
				-      "bgtz           $t9, 21b                      \n"
			
 
				-      " addiu         %[dst], %[dst], 1             \n"
			
 
				-
			
 
				-    "3:                                             \n"
			
 
				-      ".set pop                                     \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst] "+r" (dst)
			
 
				-      : [dst_width] "r" (dst_width)
			
 
				-      : "t1", "t2", "t3", "t4", "t5",
			
 
				-        "t6", "t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                 uint8* dst, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  const uint8* s1 = src_ptr + stride;
			
 
				-  const uint8* s2 = s1 + stride;
			
 
				-  const uint8* s3 = s2 + stride;
			
 
				-
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                  \n"
			
 
				-      ".set noreorder                             \n"
			
 
				-
			
 
				-      "srl           $t9, %[dst_width], 1         \n"
			
 
				-      "andi          $t8, %[dst_width], 1         \n"
			
 
				-
			
 
				-      ".p2align      2                            \n"
			
 
				-     "1:                                          \n"
			
 
				-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
			
 
				-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
			
 
				-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
			
 
				-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
			
 
				-      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
			
 
				-      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
			
 
				-      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
			
 
				-      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
			
 
				-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
			
 
				-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
			
 
				-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
			
 
				-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
			
 
				-      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
			
 
				-      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
			
 
				-      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
			
 
				-      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
			
 
				-      "add           $t0, $t0, $t1                \n"
			
 
				-      "add           $t1, $t2, $t3                \n"
			
 
				-      "add           $t0, $t0, $t1                \n"
			
 
				-      "add           $t4, $t4, $t5                \n"
			
 
				-      "add           $t6, $t6, $t7                \n"
			
 
				-      "add           $t4, $t4, $t6                \n"
			
 
				-      "shra_r.w      $t0, $t0, 4                  \n"
			
 
				-      "shra_r.w      $t4, $t4, 4                  \n"
			
 
				-      "sb            $t0, 0(%[dst])               \n"
			
 
				-      "sb            $t4, 1(%[dst])               \n"
			
 
				-      "addiu         %[src_ptr], %[src_ptr], 8    \n"
			
 
				-      "addiu         %[s1], %[s1], 8              \n"
			
 
				-      "addiu         %[s2], %[s2], 8              \n"
			
 
				-      "addiu         %[s3], %[s3], 8              \n"
			
 
				-      "addiu         $t9, $t9, -1                 \n"
			
 
				-      "bgtz          $t9, 1b                      \n"
			
 
				-      " addiu        %[dst], %[dst], 2            \n"
			
 
				-      "beqz          $t8, 2f                      \n"
			
 
				-      " nop                                       \n"
			
 
				-
			
 
				-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
			
 
				-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
			
 
				-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
			
 
				-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
			
 
				-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
			
 
				-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
			
 
				-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
			
 
				-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
			
 
				-      "add           $t0, $t0, $t1                \n"
			
 
				-      "add           $t1, $t2, $t3                \n"
			
 
				-      "add           $t0, $t0, $t1                \n"
			
 
				-      "shra_r.w      $t0, $t0, 4                  \n"
			
 
				-      "sb            $t0, 0(%[dst])               \n"
			
 
				-
			
 
				-      "2:                                         \n"
			
 
				-      ".set pop                                   \n"
			
 
				-
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst] "+r" (dst),
			
 
				-        [s1] "+r" (s1),
			
 
				-        [s2] "+r" (s2),
			
 
				-        [s3] "+r" (s3)
			
 
				-      : [dst_width] "r" (dst_width)
			
 
				-      : "t0", "t1", "t2", "t3", "t4", "t5",
			
 
				-        "t6","t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                          \n"
			
 
				-      ".set noreorder                                     \n"
			
 
				-      ".p2align        2                                  \n"
			
 
				-    "1:                                                   \n"
			
 
				-      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
			
 
				-      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
			
 
				-      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
			
 
				-      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
			
 
				-      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
			
 
				-      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
			
 
				-      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
			
 
				-      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
			
 
				-      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
			
 
				-      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
			
 
				-      "addiu           %[dst_width], %[dst_width], -24    \n"
			
 
				-      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
			
 
				-      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
			
 
				-      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
			
 
				-      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
			
 
				-      "addiu           %[src_ptr], %[src_ptr], 32         \n"
			
 
				-      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
			
 
				-      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
			
 
				-      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
			
 
				-      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
			
 
				-      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
			
 
				-      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
			
 
				-      "sw              $t1, 0(%[dst])                     \n"
			
 
				-      "sw              $t0, 4(%[dst])                     \n"
			
 
				-      "sw              $t3, 8(%[dst])                     \n"
			
 
				-      "sw              $t5, 12(%[dst])                    \n"
			
 
				-      "sw              $t9, 16(%[dst])                    \n"
			
 
				-      "sw              $t7, 20(%[dst])                    \n"
			
 
				-      "bnez            %[dst_width], 1b                   \n"
			
 
				-      " addiu          %[dst], %[dst], 24                 \n"
			
 
				-      ".set pop                                           \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst] "+r" (dst),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      :
			
 
				-      : "t0", "t1", "t2", "t3", "t4", "t5",
			
 
				-        "t6","t7", "t8", "t9"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* d, int dst_width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-      "repl.ph           $t3, 3                          \n"  // 0x00030003
			
 
				-
			
 
				-     ".p2align           2                               \n"
			
 
				-    "1:                                                  \n"
			
 
				-      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
			
 
				-      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
			
 
				-      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
			
 
				-      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
			
 
				-      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
			
 
				-      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
			
 
				-      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
			
 
				-      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
			
 
				-      "raddu.w.qb        $t0, $t0                        \n"
			
 
				-      "raddu.w.qb        $t1, $t1                        \n"
			
 
				-      "shra_r.w          $t0, $t0, 1                     \n"
			
 
				-      "shra_r.w          $t1, $t1, 1                     \n"
			
 
				-      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
			
 
				-      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
			
 
				-      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
			
 
				-      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
			
 
				-      "addu.ph           $t2, $t2, $t4                   \n"
			
 
				-      "addu.ph           $t6, $t6, $t5                   \n"
			
 
				-      "sll               $t5, $t0, 1                     \n"
			
 
				-      "add               $t0, $t5, $t0                   \n"
			
 
				-      "shra_r.ph         $t2, $t2, 2                     \n"
			
 
				-      "shra_r.ph         $t6, $t6, 2                     \n"
			
 
				-      "shll.ph           $t4, $t2, 1                     \n"
			
 
				-      "addq.ph           $t4, $t4, $t2                   \n"
			
 
				-      "addu              $t0, $t0, $t1                   \n"
			
 
				-      "addiu             %[src_ptr], %[src_ptr], 4       \n"
			
 
				-      "shra_r.w          $t0, $t0, 2                     \n"
			
 
				-      "addu.ph           $t6, $t6, $t4                   \n"
			
 
				-      "shra_r.ph         $t6, $t6, 2                     \n"
			
 
				-      "srl               $t1, $t6, 16                    \n"
			
 
				-      "addiu             %[dst_width], %[dst_width], -3  \n"
			
 
				-      "sb                $t1, 0(%[d])                    \n"
			
 
				-      "sb                $t0, 1(%[d])                    \n"
			
 
				-      "sb                $t6, 2(%[d])                    \n"
			
 
				-      "bgtz              %[dst_width], 1b                \n"
			
 
				-      " addiu            %[d], %[d], 3                   \n"
			
 
				-    "3:                                                  \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [src_stride] "+r" (src_stride),
			
 
				-        [d] "+r" (d),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      :
			
 
				-      : "t0", "t1", "t2", "t3",
			
 
				-        "t4", "t5", "t6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* d, int dst_width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                           \n"
			
 
				-      ".set noreorder                                      \n"
			
 
				-      "repl.ph           $t2, 3                            \n"  // 0x00030003
			
 
				-
			
 
				-      ".p2align          2                                 \n"
			
 
				-    "1:                                                    \n"
			
 
				-      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
			
 
				-      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
			
 
				-      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
			
 
				-      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
			
 
				-      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
			
 
				-      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
			
 
				-      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
			
 
				-      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
			
 
				-      "raddu.w.qb        $t0, $t0                          \n"
			
 
				-      "raddu.w.qb        $t1, $t1                          \n"
			
 
				-      "shra_r.w          $t0, $t0, 1                       \n"
			
 
				-      "shra_r.w          $t1, $t1, 1                       \n"
			
 
				-      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
			
 
				-      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
			
 
				-      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
			
 
				-      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
			
 
				-      "addu.ph           $t4, $t4, $t3                     \n"
			
 
				-      "addu.ph           $t6, $t6, $t5                     \n"
			
 
				-      "shra_r.ph         $t6, $t6, 2                       \n"
			
 
				-      "shra_r.ph         $t4, $t4, 2                       \n"
			
 
				-      "addu.ph           $t6, $t6, $t4                     \n"
			
 
				-      "addiu             %[src_ptr], %[src_ptr], 4         \n"
			
 
				-      "shra_r.ph         $t6, $t6, 1                       \n"
			
 
				-      "addu              $t0, $t0, $t1                     \n"
			
 
				-      "addiu             %[dst_width], %[dst_width], -3    \n"
			
 
				-      "shra_r.w          $t0, $t0, 1                       \n"
			
 
				-      "srl               $t1, $t6, 16                      \n"
			
 
				-      "sb                $t1, 0(%[d])                      \n"
			
 
				-      "sb                $t0, 1(%[d])                      \n"
			
 
				-      "sb                $t6, 2(%[d])                      \n"
			
 
				-      "bgtz              %[dst_width], 1b                  \n"
			
 
				-      " addiu            %[d], %[d], 3                     \n"
			
 
				-    "3:                                                    \n"
			
 
				-      ".set pop                                            \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [src_stride] "+r" (src_stride),
			
 
				-        [d] "+r" (d),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      :
			
 
				-      : "t0", "t1", "t2", "t3",
			
 
				-        "t4", "t5", "t6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                               uint8* dst, int dst_width) {
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                     \n"
			
 
				-      ".set noreorder                                \n"
			
 
				-
			
 
				-      ".p2align   2                                  \n"
			
 
				-    "1:                                              \n"
			
 
				-      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
			
 
				-      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
			
 
				-      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
			
 
				-      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
			
 
				-      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
			
 
				-      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
			
 
				-      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
			
 
				-      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
			
 
				-      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
			
 
				-      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
			
 
				-      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
			
 
				-      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
			
 
				-      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
			
 
				-      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
			
 
				-      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
			
 
				-      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
			
 
				-      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
			
 
				-      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
			
 
				-      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
			
 
				-      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
			
 
				-      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
			
 
				-      "addiu      %[src_ptr], %[src_ptr], 32         \n"
			
 
				-      "addiu      %[dst_width], %[dst_width], -12    \n"
			
 
				-      "addiu      $t8,%[dst_width], -12              \n"
			
 
				-      "sw         $t1, 0(%[dst])                     \n"
			
 
				-      "sw         $t4, 4(%[dst])                     \n"
			
 
				-      "sw         $t6, 8(%[dst])                     \n"
			
 
				-      "bgez       $t8, 1b                            \n"
			
 
				-      " addiu     %[dst], %[dst], 12                 \n"
			
 
				-      ".set pop                                      \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst] "+r" (dst),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      :
			
 
				-      : "t0", "t1", "t2", "t3", "t4",
			
 
				-        "t5", "t6", "t7", "t8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
			
 
				-                                     uint8* dst_ptr, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  const uint8* t = src_ptr + stride;
			
 
				-  const int c = 0x2AAA;
			
 
				-
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-
			
 
				-      ".p2align        2                                 \n"
			
 
				-    "1:                                                  \n"
			
 
				-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
			
 
				-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
			
 
				-      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
			
 
				-      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
			
 
				-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
			
 
				-      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
			
 
				-      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
			
 
				-      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
			
 
				-      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
			
 
				-      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
			
 
				-      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
			
 
				-      "srl             $t4, $t4, 2                       \n"  // t4 / 4
			
 
				-      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
			
 
				-      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
			
 
				-      "addu            $t6, $t5, $t6                     \n"
			
 
				-      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
			
 
				-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
			
 
				-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
			
 
				-      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
			
 
				-      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
			
 
				-      "addu            $t0, $t0, $t2                     \n"
			
 
				-      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
			
 
				-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
			
 
				-      "addiu           %[t], %[t], 8                     \n"
			
 
				-      "addiu           %[dst_width], %[dst_width], -3    \n"
			
 
				-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
			
 
				-      "srl             $t6, $t6, 16                      \n"
			
 
				-      "srl             $t0, $t0, 16                      \n"
			
 
				-      "sb              $t4, -1(%[dst_ptr])               \n"
			
 
				-      "sb              $t6, -2(%[dst_ptr])               \n"
			
 
				-      "bgtz            %[dst_width], 1b                  \n"
			
 
				-      " sb             $t0, -3(%[dst_ptr])               \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst_ptr] "+r" (dst_ptr),
			
 
				-        [t] "+r" (t),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      : [c] "r" (c)
			
 
				-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
			
 
				-                                     ptrdiff_t src_stride,
			
 
				-                                     uint8* dst_ptr, int dst_width) {
			
 
				-  intptr_t stride = src_stride;
			
 
				-  const uint8* s1 = src_ptr + stride;
			
 
				-  stride += stride;
			
 
				-  const uint8* s2 = src_ptr + stride;
			
 
				-  const int c1 = 0x1C71;
			
 
				-  const int c2 = 0x2AAA;
			
 
				-
			
 
				-  __asm__ __volatile__ (
			
 
				-      ".set push                                         \n"
			
 
				-      ".set noreorder                                    \n"
			
 
				-
			
 
				-      ".p2align        2                                 \n"
			
 
				-    "1:                                                  \n"
			
 
				-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
			
 
				-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
			
 
				-      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
			
 
				-      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
			
 
				-      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
			
 
				-      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
			
 
				-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
			
 
				-      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
			
 
				-      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
			
 
				-      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
			
 
				-      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
			
 
				-      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
			
 
				-      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
			
 
				-      "addu            $t7, $t7, $t8                     \n"
			
 
				-      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
			
 
				-      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
			
 
				-      "addu            $t6, $t6, $t8                     \n"
			
 
				-      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
			
 
				-      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
			
 
				-      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
			
 
				-      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
			
 
				-      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
			
 
				-      "addu            $t7, $t7, $t8                     \n"
			
 
				-      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
			
 
				-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
			
 
				-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
			
 
				-      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
			
 
				-      "raddu.w.qb      $t0, $t0                          \n"
			
 
				-      "raddu.w.qb      $t2, $t2                          \n"
			
 
				-      "raddu.w.qb      $t4, $t4                          \n"
			
 
				-      "addu            $t0, $t0, $t2                     \n"
			
 
				-      "addu            $t0, $t0, $t4                     \n"
			
 
				-      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
			
 
				-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
			
 
				-      "addiu           %[s1], %[s1], 8                   \n"
			
 
				-      "addiu           %[s2], %[s2], 8                   \n"
			
 
				-      "addiu           %[dst_width], %[dst_width], -3    \n"
			
 
				-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
			
 
				-      "srl             $t6, $t6, 16                      \n"
			
 
				-      "srl             $t7, $t7, 16                      \n"
			
 
				-      "srl             $t0, $t0, 16                      \n"
			
 
				-      "sb              $t6, -1(%[dst_ptr])               \n"
			
 
				-      "sb              $t7, -2(%[dst_ptr])               \n"
			
 
				-      "bgtz            %[dst_width], 1b                  \n"
			
 
				-      " sb             $t0, -3(%[dst_ptr])               \n"
			
 
				-      ".set pop                                          \n"
			
 
				-      : [src_ptr] "+r" (src_ptr),
			
 
				-        [dst_ptr] "+r" (dst_ptr),
			
 
				-        [s1] "+r" (s1),
			
 
				-        [s2] "+r" (s2),
			
 
				-        [dst_width] "+r" (dst_width)
			
 
				-      : [c1] "r" (c1), [c2] "r" (c2)
			
 
				-      : "t0", "t1", "t2", "t3", "t4",
			
 
				-        "t5", "t6", "t7", "t8"
			
 
				-  );
			
 
				-}
			
 
				-
			
 
				-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}  // extern "C"
			
 
				-}  // namespace libyuv
			
 
				-#endif
			
 
				-