
Add libvpx thirdparty library

Only necessary files
Błażej Szczygieł 9 years ago
parent
commit
5268443fdf
100 files changed with 25298 additions and 0 deletions
  1. thirdparty/README.md (+7 -0)
  2. thirdparty/libvpx/AUTHORS (+142 -0)
  3. thirdparty/libvpx/CHANGELOG (+654 -0)
  4. thirdparty/libvpx/LICENSE (+31 -0)
  5. thirdparty/libvpx/PATENTS (+23 -0)
  6. thirdparty/libvpx/third_party/x86inc/LICENSE (+18 -0)
  7. thirdparty/libvpx/third_party/x86inc/README.libvpx (+20 -0)
  8. thirdparty/libvpx/third_party/x86inc/x86inc.asm (+1649 -0)
  9. thirdparty/libvpx/vp8/common/alloccommon.c (+190 -0)
  10. thirdparty/libvpx/vp8/common/alloccommon.h (+31 -0)
  11. thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c (+181 -0)
  12. thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c (+591 -0)
  13. thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c (+59 -0)
  14. thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c (+42 -0)
  15. thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c (+142 -0)
  16. thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c (+25 -0)
  17. thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c (+96 -0)
  18. thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c (+63 -0)
  19. thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c (+185 -0)
  20. thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c (+102 -0)
  21. thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c (+111 -0)
  22. thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c (+283 -0)
  23. thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c (+625 -0)
  24. thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c (+123 -0)
  25. thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c (+1377 -0)
  26. thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c (+550 -0)
  27. thirdparty/libvpx/vp8/common/blockd.c (+22 -0)
  28. thirdparty/libvpx/vp8/common/blockd.h (+312 -0)
  29. thirdparty/libvpx/vp8/common/coefupdateprobs.h (+197 -0)
  30. thirdparty/libvpx/vp8/common/common.h (+48 -0)
  31. thirdparty/libvpx/vp8/common/copy_c.c (+32 -0)
  32. thirdparty/libvpx/vp8/common/debugmodes.c (+155 -0)
  33. thirdparty/libvpx/vp8/common/default_coef_probs.h (+200 -0)
  34. thirdparty/libvpx/vp8/common/dequantize.c (+43 -0)
  35. thirdparty/libvpx/vp8/common/entropy.c (+188 -0)
  36. thirdparty/libvpx/vp8/common/entropy.h (+109 -0)
  37. thirdparty/libvpx/vp8/common/entropymode.c (+171 -0)
  38. thirdparty/libvpx/vp8/common/entropymode.h (+88 -0)
  39. thirdparty/libvpx/vp8/common/entropymv.c (+49 -0)
  40. thirdparty/libvpx/vp8/common/entropymv.h (+52 -0)
  41. thirdparty/libvpx/vp8/common/extend.c (+188 -0)
  42. thirdparty/libvpx/vp8/common/extend.h (+33 -0)
  43. thirdparty/libvpx/vp8/common/filter.c (+493 -0)
  44. thirdparty/libvpx/vp8/common/filter.h (+32 -0)
  45. thirdparty/libvpx/vp8/common/findnearmv.c (+193 -0)
  46. thirdparty/libvpx/vp8/common/findnearmv.h (+195 -0)
  47. thirdparty/libvpx/vp8/common/generic/systemdependent.c (+106 -0)
  48. thirdparty/libvpx/vp8/common/header.h (+51 -0)
  49. thirdparty/libvpx/vp8/common/idct_blk.c (+90 -0)
  50. thirdparty/libvpx/vp8/common/idctllm.c (+205 -0)
  51. thirdparty/libvpx/vp8/common/invtrans.h (+70 -0)
  52. thirdparty/libvpx/vp8/common/loopfilter.h (+113 -0)
  53. thirdparty/libvpx/vp8/common/loopfilter_filters.c (+430 -0)
  54. thirdparty/libvpx/vp8/common/mbpitch.c (+68 -0)
  55. thirdparty/libvpx/vp8/common/modecont.c (+40 -0)
  56. thirdparty/libvpx/vp8/common/modecont.h (+25 -0)
  57. thirdparty/libvpx/vp8/common/mv.h (+36 -0)
  58. thirdparty/libvpx/vp8/common/onyxc_int.h (+185 -0)
  59. thirdparty/libvpx/vp8/common/onyxd.h (+63 -0)
  60. thirdparty/libvpx/vp8/common/ppflags.h (+49 -0)
  61. thirdparty/libvpx/vp8/common/quant_common.c (+135 -0)
  62. thirdparty/libvpx/vp8/common/quant_common.h (+34 -0)
  63. thirdparty/libvpx/vp8/common/reconinter.c (+544 -0)
  64. thirdparty/libvpx/vp8/common/reconinter.h (+43 -0)
  65. thirdparty/libvpx/vp8/common/reconintra.c (+117 -0)
  66. thirdparty/libvpx/vp8/common/reconintra.h (+44 -0)
  67. thirdparty/libvpx/vp8/common/reconintra4x4.c (+54 -0)
  68. thirdparty/libvpx/vp8/common/reconintra4x4.h (+48 -0)
  69. thirdparty/libvpx/vp8/common/rtcd.c (+19 -0)
  70. thirdparty/libvpx/vp8/common/setupintrarecon.c (+39 -0)
  71. thirdparty/libvpx/vp8/common/setupintrarecon.h (+45 -0)
  72. thirdparty/libvpx/vp8/common/swapyv12buffer.c (+34 -0)
  73. thirdparty/libvpx/vp8/common/swapyv12buffer.h (+27 -0)
  74. thirdparty/libvpx/vp8/common/systemdependent.h (+27 -0)
  75. thirdparty/libvpx/vp8/common/threading.h (+232 -0)
  76. thirdparty/libvpx/vp8/common/treecoder.c (+143 -0)
  77. thirdparty/libvpx/vp8/common/treecoder.h (+98 -0)
  78. thirdparty/libvpx/vp8/common/vp8_entropymodedata.h (+254 -0)
  79. thirdparty/libvpx/vp8/common/vp8_loopfilter.c (+661 -0)
  80. thirdparty/libvpx/vp8/common/x86/copy_sse2.asm (+93 -0)
  81. thirdparty/libvpx/vp8/common/x86/copy_sse3.asm (+146 -0)
  82. thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm (+258 -0)
  83. thirdparty/libvpx/vp8/common/x86/filter_x86.c (+35 -0)
  84. thirdparty/libvpx/vp8/common/x86/filter_x86.h (+33 -0)
  85. thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c (+128 -0)
  86. thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c (+89 -0)
  87. thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm (+295 -0)
  88. thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm (+708 -0)
  89. thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm (+140 -0)
  90. thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm (+121 -0)
  91. thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm (+815 -0)
  92. thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm (+1640 -0)
  93. thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c (+198 -0)
  94. thirdparty/libvpx/vp8/common/x86/recon_mmx.asm (+274 -0)
  95. thirdparty/libvpx/vp8/common/x86/recon_sse2.asm (+116 -0)
  96. thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm (+702 -0)
  97. thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm (+1372 -0)
  98. thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm (+1508 -0)
  99. thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c (+625 -0)
  100. thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm (+1753 -0)

+ 7 - 0
thirdparty/README.md

@@ -114,6 +114,13 @@ Files extracted from upstream source:
 - COPYING
 
 
+## libvpx
+
+- Upstream: http://www.webmproject.org/code/
+- Version: 1.6.0
+- License: BSD-3-Clause
+
+
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/

+ 142 - 0
thirdparty/libvpx/AUTHORS

@@ -0,0 +1,142 @@
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+Aaron Watry <[email protected]>
+Abo Talib Mahfoodh <[email protected]>
+Adam Xu <[email protected]>
+Adrian Grange <[email protected]>
+Aℓex Converse <[email protected]>
+Ahmad Sharif <[email protected]>
+Alexander Voronov <[email protected]>
+Alexis Ballier <[email protected]>
+Alok Ahuja <[email protected]>
+Alpha Lam <[email protected]>
+A.Mahfoodh <[email protected]>
+Ami Fischman <[email protected]>
+Andoni Morales Alastruey <[email protected]>
+Andres Mejia <[email protected]>
+Andrew Russell <[email protected]>
+Angie Chiang <[email protected]>
+Aron Rosenberg <[email protected]>
+Attila Nagy <[email protected]>
+Brion Vibber <[email protected]>
+changjun.yang <[email protected]>
+Charles 'Buck' Krasic <[email protected]>
+chm <[email protected]>
+Christian Duvivier <[email protected]>
+Daniele Castagna <[email protected]>
+Daniel Kang <[email protected]>
+Deb Mukherjee <[email protected]>
+Dim Temp <[email protected]>
+Dmitry Kovalev <[email protected]>
+Dragan Mrdjan <[email protected]>
+Ed Baker <[email protected]>
+Ehsan Akhgari <[email protected]>
+Erik Niemeyer <[email protected]>
+Fabio Pedretti <[email protected]>
+Frank Galligan <[email protected]>
+Fredrik Söderquist <[email protected]>
+Fritz Koenig <[email protected]>
+Gaute Strokkenes <[email protected]>
+Geza Lore <[email protected]>
+Ghislain MARY <[email protected]>
+Giuseppe Scrivano <[email protected]>
+Gordana Cmiljanovic <[email protected]>
+Guillaume Martres <[email protected]>
+Guillermo Ballester Valor <[email protected]>
+Hangyu Kuang <[email protected]>
+Hanno Böck <[email protected]>
+Henrik Lundin <[email protected]>
+Hui Su <[email protected]>
+Ivan Maltz <[email protected]>
+Jacek Caban <[email protected]>
+Jacky Chen <[email protected]>
+James Berry <[email protected]>
+James Yu <[email protected]>
+James Zern <[email protected]>
+Jan Gerber <[email protected]>
+Jan Kratochvil <[email protected]>
+Janne Salonen <[email protected]>
+Jean-Yves Avenard <[email protected]>
+Jeff Faust <[email protected]>
+Jeff Muizelaar <[email protected]>
+Jeff Petkau <[email protected]>
+Jia Jia <[email protected]>
+Jian Zhou <[email protected]>
+Jim Bankoski <[email protected]>
+Jingning Han <[email protected]>
+Joey Parrish <[email protected]>
+Johann Koenig <[email protected]>
+John Koleszar <[email protected]>
+Johnny Klonaris <[email protected]>
+John Stark <[email protected]>
+Joshua Bleecher Snyder <[email protected]>
+Joshua Litt <[email protected]>
+Julia Robson <[email protected]>
+Justin Clift <[email protected]>
+Justin Lebar <[email protected]>
+KO Myung-Hun <[email protected]>
+Lawrence Velázquez <[email protected]>
+Linfeng Zhang <[email protected]>
+Lou Quillio <[email protected]>
+Luca Barbato <[email protected]>
+Makoto Kato <[email protected]>
+Mans Rullgard <[email protected]>
+Marco Paniconi <[email protected]>
+Mark Mentovai <[email protected]>
+Martin Ettl <[email protected]>
+Martin Storsjo <[email protected]>
+Matthew Heaney <[email protected]>
+Michael Kohler <[email protected]>
+Mike Frysinger <[email protected]>
+Mike Hommey <[email protected]>
+Mikhal Shemer <[email protected]>
+Minghai Shang <[email protected]>
+Morton Jonuschat <[email protected]>
+Nico Weber <[email protected]>
+Parag Salasakar <[email protected]>
+Pascal Massimino <[email protected]>
+Patrik Westin <[email protected]>
+Paul Wilkins <[email protected]>
+Pavol Rusnak <[email protected]>
+Paweł Hajdan <[email protected]>
+Pengchong Jin <[email protected]>
+Peter de Rivaz <[email protected]>
+Philip Jägenstedt <[email protected]>
+Priit Laes <[email protected]>
+Rafael Ávila de Espíndola <[email protected]>
+Rafaël Carré <[email protected]>
+Ralph Giles <[email protected]>
+Rob Bradford <[email protected]>
+Ronald S. Bultje <[email protected]>
+Rui Ueyama <[email protected]>
+Sami Pietilä <[email protected]>
+Sasi Inguva <[email protected]>
+Scott Graham <[email protected]>
+Scott LaVarnway <[email protected]>
+Sean McGovern <[email protected]>
+Sergey Kolomenkin <[email protected]>
+Sergey Ulanov <[email protected]>
+Shimon Doodkin <[email protected]>
+Shunyao Li <[email protected]>
+Stefan Holmer <[email protected]>
+Suman Sunkara <[email protected]>
+Taekhyun Kim <[email protected]>
+Takanori MATSUURA <[email protected]>
+Tamar Levy <[email protected]>
+Tao Bai <[email protected]>
+Tero Rintaluoma <[email protected]>
+Thijs Vermeir <[email protected]>
+Tim Kopp <[email protected]>
+Timothy B. Terriberry <[email protected]>
+Tom Finegan <[email protected]>
+Vignesh Venkatasubramanian <[email protected]>
+Yaowu Xu <[email protected]>
+Yi Luo <[email protected]>
+Yongzhe Wang <[email protected]>
+Yunqing Wang <[email protected]>
+Yury Gitman <[email protected]>
+Zoe Liu <[email protected]>
+Google Inc.
+The Mozilla Foundation
+The Xiph.Org Foundation

+ 654 - 0
thirdparty/libvpx/CHANGELOG

@@ -0,0 +1,654 @@
+2016-07-20 v1.6.0 "Khaki Campbell Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
+    in vpx_image and some minor changes to the VP8_COMP structure.
+
+    The default key frame interval for VP9 has changed from 128 to 9999.
+
+  - Enhancement:
+    A core focus has been performance for low end Intel processors. SSSE3
+    instructions such as 'pshufb' have been avoided and instructions have been
+    reordered to better accommodate the more constrained pipelines.
+
+    As a result, devices based on Celeron processors have seen substantial
+    decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
+    decoding speed improved between 10 and 30%. Between Javan Whistling Duck
+    and Khaki Campbell Duck, it improved another 10 to 15%.
+
+    While Celeron benefited most, Core-i5 also improved 5% and 10% between the
+    respective releases.
+
+    Realtime performance for WebRTC for both speed and quality has received a
+    lot of attention.
+
+  - Bug Fixes:
+    A number of fuzzing issues, found variously by Mozilla, Chromium and others,
+    have been fixed and we strongly recommend updating.
+
+2015-11-09 v1.5.0 "Javan Whistling Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.4.0. It drops deprecated VP8
+    controls and adds a variety of VP9 controls for testing.
+
+    The vpxenc utility now prefers VP9 by default.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Smaller library size by combining functions used by VP8 and VP9
+
+  - Bug Fixes:
+    A variety of fuzzing issues
+
+2015-04-03 v1.4.0 "Indian Runner Duck"
+  This release includes significant improvements to the VP9 codec.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.3.0. It drops the compatibility
+    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+    controls for VP9.
+
+  - Enhancements:
+    Faster VP9 encoding and decoding
+    Multithreaded VP9 decoding (tile and frame-based)
+    Multithreaded VP9 encoding - on by default
+    YUV 4:2:2 and 4:4:4 support in VP9
+    10 and 12bit support in VP9
+    64bit ARM support by replacing ARM assembly with intrinsics
+
+  - Bug Fixes:
+    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+    files.
+
+  - Known Issues:
+    Frame Parallel decoding fails for segmented and non-420 files.
+
+2013-11-15 v1.3.0 "Forest"
+  This release introduces the VP9 codec in a backward-compatible way.
+  All existing users of VP8 can continue to use the library without
+  modification. However, some VP8 options do not map to VP9 in the same manner.
+
+  The VP9 encoder in this release is not feature complete. Users interested in
+  the encoder are advised to use the git master branch and discuss issues on
+  libvpx mailing lists.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this document
+    for that release.
+
+  - Enhancements:
+      Get rid of bashisms in the main build scripts
+      Added usage info on command line options
+      Add lossless compression mode
+      Dll build of libvpx
+      Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
+      Add option to disable documentation
+      configure: add --enable-external-build support
+      make: support V=1 as short form of verbose=yes
+      configure: support mingw-w64
+      configure: support hardfloat armv7 CHOSTS
+      configure: add support for android x86
+      Add estimated completion time to vpxenc
+      Don't exit on decode errors in vpxenc
+      vpxenc: support scaling prior to encoding
+      vpxdec: support scaling output
+      vpxenc: improve progress indicators with --skip
+      msvs: Don't link to winmm.lib
+      Add a new script for producing vcxproj files
+      Produce Visual Studio 10 and 11 project files
+      Produce Windows Phone project files
+      msvs-build: use msbuild for vs >= 2005
+      configure: default configure log to config.log
+      Add encoding option --static-thresh
+
+  - Speed:
+      Miscellaneous speed optimizations for VP8 and VP9.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+  - Bug Fixes:
+      This release represents approximately a year of engineering effort,
+      and contains multiple bug fixes. Please refer to git history for details.
+
+
+2012-12-21 v1.2.0
+  This release acts as a checkpoint for a large amount of internal refactoring
+  and testing. It also contains a number of small bugfixes, so all users are
+  encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      VP8 optimizations for MIPS dspr2
+      vpxenc: add -quiet option
+
+  - Speed:
+      Encoder and decoder speed is consistent with the Eider release.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+      Minor tweaks to ARNR filtering
+      Minor improvements to real time encoding with multiple temporal layers
+
+  - Bug Fixes:
+      Fixes multithreaded encoder race condition in loopfilter
+      Fixes multi-resolution threaded encoding
+      Fix potential encoder dead-lock after picture resize
+
+
+2012-05-09 v1.1.0 "Eider"
+  This introduces a number of enhancements, mostly focused on real-time
+  encoding. In addition, it fixes a decoder bug (first introduced in
+  Duclair) so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+    This release introduces a new temporal denoiser, controlled by the
+    VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not
+    currently take a strength parameter, so the control is effectively
+    a boolean - zero (off) or non-zero (on). For compatibility with
+    existing applications, the values accepted are the same as those
+    for the spatial denoiser (0-6). The temporal denoiser is enabled
+    by default, and the older spatial denoiser may be restored by
+    configuring with --disable-temporal-denoising. The temporal denoiser
+    is more computationally intensive than the spatial one.
+
+    This release removes support for a legacy, decode only API that was
+    supported, but deprecated, at the initial release of libvpx
+    (v0.9.0). This is not expected to have any impact. If you are
+    impacted, you can apply a reversion to commit 2bf8fb58 locally.
+    Please update to the latest libvpx API if you are affected.
+
+  - Enhancements:
+      Adds a motion compensated temporal denoiser to the encoder, which
+      gives higher quality than the older spatial denoiser. (See above
+      for notes on upgrading).
+
+      In addition, support for new compilers and platforms were added,
+      including:
+        improved support for XCode
+        Android x86 NDK build
+        OS/2 support
+        SunCC support
+
+      Changing resolution with vpx_codec_enc_config_set() is now
+      supported. Previously, reinitializing the codec was required to
+      change the input resolution.
+
+      The vpxenc application has initial support for producing multiple
+      encodes from the same input in one call. Resizing is not yet
+      supported, but varying other codec parameters is. Use -- to
+      delineate output streams. Options persist from one stream to the
+      next.
+
+      Also, the vpxenc application will now use a keyframe interval of
+      5 seconds by default. Use the --kf-max-dist option to override.
+
+  - Speed:
+      Decoder performance improved 2.5% versus Duclair. Encoder speed is
+      consistent with Duclair for most material. Two pass encoding of
+      slideshow-like material will see significant improvements.
+
+      Large realtime encoding speed gains at a small quality expense are
+      possible by configuring the on-the-fly bitpacking experiment with
+      --enable-onthefly-bitpacking. Realtime encoder can be up to 13%
+      faster (ARM) depending on the number of threads and bitrate
+      settings. This technique sees constant gain over the 5-16 speed
+      range. For VC style input the loss seen is up to 0.2dB. See commit
+      52cf4dca for further details.
+
+  - Quality:
+      On the whole, quality is consistent with the Duclair release. Some
+      tweaks:
+
+        Reduced blockiness in easy sections by applying a penalty to
+        intra modes.
+
+        Improved quality of static sections (like slideshows) with
+        two pass encoding.
+
+        Improved keyframe sizing with multiple temporal layers
+
+  - Bug Fixes:
+      Corrected alt-ref contribution to frame rate for visible updates
+      to the alt-ref buffer. This affected applications making manual
+      usage of the frame reference flags, or temporal layers.
+
+      Additional constraints were added to disable multi-frame quality
+      enhancement (MFQE) in sections of the frame where there is motion.
+      (#392)
+
+      Fixed corruption issues when vpx_codec_enc_config_set() was called
+      with spatial resampling enabled.
+
+      Fixed a decoder error introduced in Duclair where the segmentation
+      map was not being reinitialized on keyframes (#378)
+
+
+2012-01-27 v1.0.0 "Duclair"
+  Our fourth named release, focused on performance and features related to
+  real-time encoding. It also fixes a decoder crash bug introduced in
+  v0.9.7, so all users of that release are encouraged to upgrade.
+
+  - Upgrading:
+      This release is ABI incompatible with prior releases of libvpx, so the
+      "major" version number has been bumped to 1. You must recompile your
+      applications against the latest version of the libvpx headers. The
+      API remains compatible, and this should not require code changes in most
+      applications.
+
+  - Enhancements:
+      This release introduces several substantial new features to the encoder,
+      of particular interest to real time streaming applications.
+
+      Temporal scalability allows the encoder to produce a stream that can
+      be decimated to different frame rates, with independent rate targeting
+      for each substream.
+
+      Multiframe quality enhancement postprocessing can make visual quality
+      more consistent in the presence of frames that are substantially
+      different quality than the surrounding frames, as in the temporal
+      scalability case and in some forced keyframe scenarios.
+
+      Multiple-resolution encoding support allows the encoding of the
+      same content at different resolutions faster than encoding them
+      separately.
+
+  - Speed:
+      Optimization targets for this release included the decoder and the real-
+      time modes of the encoder. Decoder speed on x86 has improved 10.5% with
+      this release. Encoder improvements followed a curve where speeds 1-3
+      improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved
+      1.5% to 10.5%, respectively. "Best" mode speed is consistent with the
+      Cayuga release.
+
+  - Quality:
+      Encoder quality in the single stream case is consistent with the Cayuga
+      release.
+
+  - Bug Fixes:
+      This release fixes an OOB read decoder crash bug present in v0.9.7
+      related to the clamping of motion vectors in SPLITMV blocks. This
+      behavior could be triggered by corrupt input or by starting
+      decoding from a P-frame.
+
+
+2011-08-15 v0.9.7-p1 "Cayuga" patch 1
+  This is an incremental bugfix release against Cayuga. All users of that
+  release are strongly encouraged to upgrade.
+
+    - Fix potential OOB reads (cdae03a)
+
+          An unbounded out of bounds read was discovered when the
+          decoder was requested to perform error concealment (new in
+          Cayuga) given a frame with corrupt partition sizes.
+
+          A bounded out of bounds read was discovered affecting all
+          versions of libvpx. Given a multipartition input frame that
+          is truncated between the mode/mv partition and the first
+          residual partition (in the block of partition offsets), up
+          to 3 extra bytes could have been read from the source buffer.
+          The code will not take any action regardless of the contents
+          of these undefined bytes, as the truncated buffer is detected
+          immediately following the read based on the calculated
+          starting position of the coefficient partition.
+
+    - Fix potential error concealment crash when the very first frame
+      is missing or corrupt (a609be5)
+
+    - Fix significant artifacts in error concealment (a4c2211, 99d870a)
+
+    - Revert 1-pass CBR rate control changes (e961317)
+      Further testing showed this change produced undesirable visual
+      artifacts, rolling back for now.
+
+
+2011-08-02 v0.9.7 "Cayuga"
+  Our third named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5) and
+    Bali (v0.9.6). Users of older releases should refer to the Upgrading
+    notes in this document for that release.
+
+  - Enhancements:
+          Stereo 3D format support for vpxenc
+          Runtime detection of available processor cores.
+          Allow specifying --end-usage by enum name
+          vpxdec: test for frame corruption
+          vpxenc: add quantizer histogram display
+          vpxenc: add rate histogram display
+          Set VPX_FRAME_IS_DROPPABLE
+          update configure for ios sdk 4.3
+          Avoid text relocations in ARM vp8 decoder
+          Generate a vpx.pc file for pkg-config.
+          New ways of passing encoded data between encoder and decoder.
+
+  - Speed:
+      This release includes across-the-board speed improvements to the
+      encoder. On x86, these measure at approximately 11.5% in Best mode,
+      21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
+      On ARM Cortex A9 with Neon extensions, real-time encoding of video
+      telephony content is 35% faster than Bali on single core and 48%
+      faster on multi-core. On the NVidia Tegra2 platform, real time
+      encoding is 40% faster than Bali.
+
+      Decoder speed was not a priority for this release, but improved
+      approximately 8.4% on x86.
+
+          Reduce motion vector search on alt-ref frame.
+          Encoder loopfilter running in its own thread
+          Reworked loopfilter to precalculate more parameters
+          SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
+          Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
+          Removed redundant checks
+          Reduced structure sizes
+          utilize preload in ARMv6 MC/LPF/Copy routines
+          ARM optimized quantization, dfct, variance, subtract
+          Increase chroma row alignment to 16 bytes.
+          disable trellis optimization for first pass
+          Write SSSE3 sub-pixel filter function
+          Improve SSE2 half-pixel filter functions
+          Add vp8_sub_pixel_variance16x8_ssse3 function
+          Reduce unnecessary distortion computation
+          Use diamond search to replace full search
+          Preload reference area in sub-pixel motion search (real-time mode)
+
+  - Quality:
+      This release focused primarily on one-pass use cases, including
+      video conferencing. Low latency data rate control was significantly
+      improved, improving streamability over bandwidth constrained links.
+      Added support for error concealment, allowing frames to maintain
+      visual quality in the presence of substantial packet loss.
+
+          Add rc_max_intra_bitrate_pct control
+          Limit size of initial keyframe in one-pass.
+          Improve framerate adaptation
+          Improved 1-pass CBR rate control
+          Improved KF insertion after fades to still.
+          Improved key frame detection.
+          Improved activity masking (lower PSNR impact for same SSIM boost)
+          Improved interaction between GF and ARFs
+          Adding error-concealment to the decoder.
+          Adding support for independent partitions
+          Adjusted rate-distortion constants
+
+
+  - Bug Fixes:
+          Removed firstpass motion map
+          Fix parallel make install
+          Fix multithreaded encoding for 1 MB wide frame
+          Fixed iwalsh_neon build problems with RVDS4.1
+          Fix semaphore emulation, spin-wait intrinsics on Windows
+          Fix build with xcode4 and simplify GLOBAL.
+          Mark ARM asm objects as allowing a non-executable stack.
+          Fix vpxenc encoding incorrect webm file header on big endian
+
+
+2011-03-07 v0.9.6 "Bali"
+  Our second named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      vpxenc --psnr shows a summary when encode completes
+      --tune=ssim option to enable activity masking
+      improved postproc visualizations for development
+      updated support for Apple iOS to SDK 4.2
+      query decoder to determine which reference frames were updated
+      implemented error tracking in the decoder
+      fix pipe support on windows
+
+  - Speed:
+      Primary focus was on good quality mode, speed 0. Average improvement
+      on x86 about 40%, up to 100% on user-generated content at that speed.
+      Best quality mode speed improved 35%, and realtime speed 10-20%. This
+      release also saw significant improvement in realtime encoding speed
+      on ARM platforms.
+
+        Improved encoder threading
+        Don't pick encoder filter level when loopfilter is disabled.
+        Avoid double copying of key frames into alt and golden buffer
+        FDCT optimizations.
+        x86 sse2 temporal filter
+        SSSE3 version of fast quantizer
+        vp8_rd_pick_best_mbsegmentation code restructure
+        Adjusted breakout RD for SPLITMV
+        Changed segmentation check order
+        Improved rd_pick_intra4x4block
+        Adds armv6 optimized variance calculation
+        ARMv6 optimized sad16x16
+        ARMv6 optimized half pixel variance calculations
+        Full search SAD function optimization in SSE4.1
+        Improve MV prediction accuracy to achieve performance gain
+        Improve MV prediction in vp8_pick_inter_mode() for speed>3
+
+  - Quality:
+      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
+      also includes support for "activity masking," which greatly improves
+      SSIM at the expense of PSNR. For now, this feature is available with
+      the --tune=ssim option. Further experimentation in this area
+      is ongoing. This release also introduces a new rate control mode
+      called "CQ," which changes the allocation of bits within a clip to
+      the sections where they will have the most visual impact.
+
+        Tuning for the more exact quantizer.
+        Relax rate control for last few frames
+        CQ Mode
+        Limit key frame quantizer for forced key frames.
+        KF/GF Pulsing
+        Add simple version of activity masking.
+        make rdmult adaptive for intra in quantizer RDO
+        cap the best quantizer for 2nd order DC
+        change the threshold of DC check for encode breakout
+
+  - Bug Fixes:
+      Fix crash on Sparc Solaris.
+      Fix counter of fixed keyframe distance
+      ARNR filter pointer update bug fix
+      Fixed use of motion percentage in KF/GF group calc
+      Changed condition for using RD in Intra Mode
+      Fix encoder real-time only configuration.
+      Fix ARM encoder crash with multiple token partitions
+      Fixed bug first cluster timecode of webm file is wrong.
+      Fixed various encoder bugs with odd-sized images
+      vp8e_get_preview fixed when spatial resampling enabled
+      quantizer: fix assertion in fast quantizer path
+      Allocate source buffers to be multiples of 16
+      Fix for manual Golden frame frequency
+      Fix drastic undershoot in long form content
+
+
+2010-10-28 v0.9.5 "Aylesbury"
+  Our first named release, focused on a faster decoder, and a better encoder.
+
+  - Upgrading:
+    This release incorporates backwards-incompatible changes to the
+    ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec.
+
+    vpxdec
+      * the -q (quiet) option has been removed, and replaced with
+        -v (verbose). the output is quiet by default. Use -v to see
+        the version number of the binary.
+
+      * The default behavior is now to write output to a single file
+        instead of individual frames. The -y option has been removed.
+        Y4M output is the default.
+
+      * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12
+        options must be specified.
+
+          $ ivfdec -o OUTPUT INPUT
+          $ vpxdec --i420 -o OUTPUT INPUT
+
+      * If an output file is not specified, the default is to write
+        Y4M to stdout. This makes piping more natural.
+
+          $ ivfdec -y -o - INPUT | ...
+          $ vpxdec INPUT | ...
+
+      * The output file has additional flexibility for formatting the
+        filename. It supports escape characters for constructing a
+        filename from the width, height, and sequence number. This
+        replaces the -p option. To get the equivalent:
+
+          $ ivfdec -p frame INPUT
+          $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT
+
+    vpxenc
+      * The output file must be specified with -o, rather than as the
+        last argument.
+
+          $ ivfenc <options> INPUT OUTPUT
+          $ vpxenc <options> -o OUTPUT INPUT
+
+      * The output defaults to webm. To get IVF output, use the --ivf
+        option.
+
+          $ ivfenc <options> INPUT OUTPUT.ivf
+          $ vpxenc <options> -o OUTPUT.ivf --ivf INPUT
+
+
+  - Enhancements:
+      ivfenc and ivfdec have been renamed to vpxenc, vpxdec.
+      vpxdec supports .webm input
+      vpxdec writes .y4m by default
+      vpxenc writes .webm output by default
+      vpxenc --psnr now shows the average/overall PSNR at the end
+      ARM platforms now support runtime cpu detection
+      vpxdec visualizations added for motion vectors, block modes, references
+      vpxdec now silent by default
+      vpxdec --progress shows frame-by-frame timing information
+      vpxenc supports the distinction between --fps and --timebase
+      NASM is now a supported assembler
+      configure: enable PIC for shared libs by default
+      configure: add --enable-small
+      configure: support for ppc32-linux-gcc
+      configure: support for sparc-solaris-gcc
+
+  - Bugs:
+      Improve handling of invalid frames
+      Fix valgrind errors in the NEON loop filters.
+      Fix loopfilter delta zero transitions
+      Fix valgrind errors in vp8_sixtap_predict8x4_armv6().
+      Build fixes for darwin-icc
+
+  - Speed:
+      20-40% (average 28%) improvement in libvpx decoder speed,
+      including:
+        Rewrite vp8_short_walsh4x4_sse2()
+        Optimizations on the loopfilters.
+        Miscellaneous improvements for Atom
+        Add 4-tap version of 2nd-pass ARMv6 MC filter.
+        Improved multithread utilization
+        Better instruction choices on x86
+        reorder data to use wider instructions
+        Update NEON wide idcts
+        Make block access to frame buffer sequential
+        Improved subset block search
+        Bilinear subpixel optimizations for ssse3.
+        Decrease memory footprint
+
+      Encoder speed improvements (percentage gain not measured):
+        Skip unnecessary search of identical frames
+        Add SSE2 subtract functions
+        Improve bounds checking in vp8_diamond_search_sadx4()
+        Added vp8_fast_quantize_b_sse2
+
+  - Quality:
+      Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality
+      encoding mode, and up to 60% improvement on very noisy, still
+      or slow moving source video
+
+        Motion compensated temporal filter for Alt-Ref Noise Reduction
+        Improved use of trellis quantization on 2nd order Y blocks
+        Tune effect of motion on KF/GF boost in two pass
+        Allow coefficient optimization for good quality speed 0.
+        Improved control of active min quantizer for two pass.
+        Enable ARFs for non-lagged compress
+
+2010-09-02 v0.9.2
+  - Enhancements:
+      Disable frame dropping by default
+      Improved multithreaded performance
+      Improved Force Key Frame Behaviour
+      Increased rate control buffer level precision
+      Fix bug in 1st pass motion compensation
+      ivfenc: correct fixed kf interval, --disable-kf
+  - Speed:
+      Changed above and left context data layout
+      Rework idct calling structure.
+      Removed unnecessary MB_MODE_INFO copies
+      x86: SSSE3 sixtap prediction
+      Reworked IDCT to include reconstruction (add) step
+      Swap alt/gold/new/last frame buffer ptrs instead of copying.
+      Improve SSE2 loopfilter functions
+      Change bitreader to use a larger window.
+      Avoid loopfilter reinitialization when possible
+  - Quality:
+      Normalize quantizer's zero bin and rounding factors
+      Add trellis quantization.
+      Make the quantizer exact.
+      Updates to ARNR filtering algorithm
+      Fix breakout thresh computation for golden & AltRef frames
+      Redo the forward 4x4 dct
+      Improve the accuracy of forward walsh-hadamard transform
+      Further adjustment of RD behaviour with Q and Zbin.
+  - Build System:
+      Allow linking of libs built with MinGW to MSVC
+      Fix target auto-detection on mingw32
+      Allow --cpu= to work for x86.
+      configure: pass original arguments through to make dist
+      Fix builds without runtime CPU detection
+      msvs: fix install of codec sources
+      msvs: Change devenv.com command line for better msys support
+      msvs: Add vs9 targets.
+      Add x86_64-linux-icc target
+  - Bugs:
+      Potential crashes on older MinGW builds
+      Fix two-pass framerate for Y4M input.
+      Fixed simple loop filter, other crashes on ARM v6
+      arm: fix missing dependency with --enable-shared
+      configure: support directories containing .o
+      Replace pinsrw (SSE) with MMX instructions
+      apple: include proper mach primitives
+      Fixed rate control bug with long key frame interval.
+      Fix DSO link errors on x86-64 when not using a version script
+      Fixed buffer selection for UV in AltRef filtering
+
+
+2010-06-17 v0.9.1
+  - Enhancements:
+      * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O
+      * Speed optimizations
+  - Bugfixes:
+      * Rate control
+      * Prevent out-of-bounds accesses on invalid data
+  - Build system updates:
+      * Detect toolchain to be used automatically for native builds
+      * Support building shared libraries
+      * Better autotools emulation (--prefix, --libdir, DESTDIR)
+  - Updated LICENSE
+      * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html
+
+
+2010-05-18 v0.9.0
+  - Initial open source release. Welcome to WebM and VP8!
+

+ 31 - 0
thirdparty/libvpx/LICENSE

@@ -0,0 +1,31 @@
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google, nor the WebM Project, nor the names
+    of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+

+ 23 - 0
thirdparty/libvpx/PATENTS

@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.

+ 18 - 0
thirdparty/libvpx/third_party/x86inc/LICENSE

@@ -0,0 +1,18 @@
+Copyright (C) 2005-2012 x264 project
+
+Authors: Loren Merritt <[email protected]>
+         Anton Mitrofanov <[email protected]>
+         Jason Garrett-Glaser <[email protected]>
+         Henrik Gramner <[email protected]>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

+ 20 - 0
thirdparty/libvpx/third_party/x86inc/README.libvpx

@@ -0,0 +1,20 @@
+URL: https://git.videolan.org/git/x264.git
+Version: d23d18655249944c1ca894b451e2c82c7a584c62
+License: ISC
+License File: LICENSE
+
+Description:
+x264/libav's framework for x86 assembly. Contains a variety of macros and
+defines that help automatically allow assembly to work cross-platform.
+
+Local Modifications:
+Get configuration from vpx_config.asm.
+Prefix functions with vpx by default.
+Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
+  exist in libvpx.
+Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
+Set 'private_extern' visibility for macho targets.
+Copy PIC 'GLOBAL' macros from x86_abi_support.asm
+Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
+Use .text with no alignment for aout
+Only use 'hidden' visibility with Chromium
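
To make the framework concrete, here is a minimal editor's sketch (not part of this commit) of a leaf function written against x86inc.asm. The function name, loop body, and the assumption that n is a positive multiple of 16 are hypothetical. With private_prefix defaulting to vpx (see x86inc.asm below) and INIT_XMM sse2, cglobal add_u8 would emit the symbol vpx_add_u8_sse2:

    %include "x86inc.asm"

    SECTION .text

    INIT_XMM sse2
    ; Hypothetical: void vpx_add_u8_sse2(uint8_t *dst, const uint8_t *src, int n);
    ; cglobal/PROLOGUE loads the three arguments into registers named
    ; dstq, srcq and n; nd is the dword view of the n register.
    cglobal add_u8, 3, 3, 2, dst, src, n
    .loop:
        movu    m0, [dstq]          ; 16 bytes of dst
        movu    m1, [srcq]          ; 16 bytes of src
        paddb   m0, m1              ; bytewise add
        movu    [dstq], m0
        add     dstq, mmsize        ; mmsize == 16 under INIT_XMM
        add     srcq, mmsize
        sub     nd, mmsize
        jg      .loop
        RET                         ; pops whatever PROLOGUE pushed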

+ 1649 - 0
thirdparty/libvpx/third_party/x86inc/x86inc.asm

@@ -0,0 +1,1649 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2016 x264 project
+;*
+;* Authors: Loren Merritt <[email protected]>
+;*          Anton Mitrofanov <[email protected]>
+;*          Fiona Glaser <[email protected]>
+;*          Henrik Gramner <[email protected]>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to [email protected] .
+
+%include "vpx_config.asm"
+
+%ifndef private_prefix
+    %define private_prefix vpx
+%endif
+
+%ifndef public_prefix
+    %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,x64
+        %define WIN64  1
+    %else
+        %define UNIX64 1
+    %endif
+%endif
+
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+    %define FORMAT_ELF 1
+%endif
+
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,macho32
+     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+     %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libvpx builds.
+%if FORMAT_ELF
+    %undef PREFIX
+%elif WIN64
+    %undef PREFIX
+%else
+    %define PREFIX
+%endif
+
+%ifdef PREFIX
+    %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
+%endif
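
; Editor's note (not part of the upstream file): with PREFIX defined,
; mangle(vpx_foo) expands to _vpx_foo, matching the leading-underscore
; symbol convention of win32 and Mach-O targets; on ELF and win64 the
; PREFIX define is removed above, so symbols are emitted unchanged.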
+
+; In some instances macho32 tables get misaligned when using .rodata.
+; When looking at the disassembly it appears that the offset is either
+; correct or consistently off by 90. Placing them in the .text section
+; works around the issue. It appears to be specific to the way libvpx
+; handles the tables.
+%macro SECTION_RODATA 0-1 16
+    %ifidn __OUTPUT_FORMAT__,macho32
+        SECTION .text align=%1
+        fakegot:
+    %elifidn __OUTPUT_FORMAT__,aout
+        SECTION .text
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
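
; Editor's note (not part of the upstream file): callers may request an
; alignment, e.g. 'SECTION_RODATA 64'; the default is 16. On macho32 and
; aout the data lands in .text as described in the comment above.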
+
+; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
+; from original code is added in for 64bit.
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+    %if CONFIG_PIC=1
+        %ifidn __OUTPUT_FORMAT__,elf32
+            %define GET_GOT_DEFINED 1
+            %define WRT_PLT wrt ..plt
+            %macro GET_GOT 1
+                extern _GLOBAL_OFFSET_TABLE_
+                push %1
+                call %%get_got
+                %%sub_offset:
+                jmp %%exitGG
+                %%get_got:
+                mov %1, [esp]
+                add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+                ret
+                %%exitGG:
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 wrt ..gotoff
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1
+            %endmacro
+        %elifidn __OUTPUT_FORMAT__,macho32
+            %define GET_GOT_DEFINED 1
+            %macro GET_GOT 1
+                push %1
+                call %%get_got
+                %%get_got:
+                pop  %1
+                %undef GLOBAL
+                %define GLOBAL(x) x + %1 - %%get_got
+                %undef RESTORE_GOT
+                %define RESTORE_GOT pop %1
+            %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
+        %endif
+    %endif
+
+    %if ARCH_X86_64 == 0
+        %undef PIC
+    %endif
+
+%else
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) rel x
+    %define WRT_PLT wrt ..plt
+
+    %if WIN64
+        %define PIC
+    %elifidn __OUTPUT_FORMAT__,macho64
+        %define PIC
+    %elif CONFIG_PIC
+        %define PIC
+    %endif
+%endif
+
+%ifnmacro GET_GOT
+    %macro GET_GOT 1
+    %endmacro
+    %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+    %define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+    %define WRT_PLT
+%endif
+
+%ifdef PIC
+    default rel
+%endif
+
+%ifndef GET_GOT_DEFINED
+    %define GET_GOT_DEFINED 0
+%endif
+; Done with PIC macros
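
; Editor's sketch (not part of the upstream file): typical 32-bit PIC use
; of the macros above to address a constant table. The table name and
; register choice are hypothetical.
;
;     SECTION_RODATA
;     low7_mask: times 8 db 0x7f
;
;     ; ... inside a function body:
;     GET_GOT     rbx                       ; push rbx, compute the GOT/pc base
;     movq        mm0, [GLOBAL(low7_mask)]
;     RESTORE_GOT                           ; pop rbx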
+
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment the stack will be manually aligned
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
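
; (Editor's note, not in the upstream file: with that declaration the
; arguments become addressable as dstq/srcq/tmpq at native size, with
; dstd/srcd etc. for dwords and dstm for an argument's original
; location, while m0-m6 name the seven vector registers.)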
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %define %2q %2
+    %if %0 == 2
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp qword r %+ %1 %+ m
+    %else
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp dword r %+ %1 %+ m
+    %endif
+    %define r%1  %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+    %if ARCH_X86_64 == 0
+        %define r%1 e%1
+    %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro PUSH 1
+    push %1
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
+%endmacro
+
+%macro POP 1
+    pop %1
+    %ifidn rstk, rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2
+    sub %1, %2
+    %ifidn %1, rstk
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2
+    add %1, %2
+    %ifidn %1, rstk
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%macro movsxdifnidn 2
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1
+    %if (%1) == 0
+        %error assertion ``%1'' failed
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+    %ifdef n_arg_names
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15)
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1
+        %if %1 != 0
+            %assign %%pad 0
+            %assign stack_size %1
+            %if stack_size < 0
+                %assign stack_size -stack_size
+            %endif
+            %if WIN64
+                %assign %%pad %%pad + 32 ; shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+                    %endif
+                %endif
+            %endif
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1)
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                %if %1 < 0 ; need to store rsp on stack
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
+                %else ; can keep rsp in rstk during whole function
+                    %xdefine rstkm rstk
+                %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk
+            %endif
+            WIN64_PUSH_XMM
+        %endif
+    %endif
+%endmacro
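+
+; Usage sketch: within a function body, "ALLOC_STACK 32" reserves 32 bytes of
+; scratch space below rsp, padded so the allocation preserves (or creates)
+; required_stack_alignment; a negative size requests manual realignment with
+; the original rsp saved on the stack (rstkm).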
+
+%macro SETUP_STACK_POINTER 1
+    %ifnum %1
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+            %if %1 > 0
+                %assign regs_used (regs_used + 1)
+            %endif
+            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments
+                %assign regs_used 5 + UNIX64 * 3
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+
+    %ifnum %2
+        DEFINE_ARGS %3
+    %elif %1 == 4
+        DEFINE_ARGS %2
+    %elif %1 > 4
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+    ; Use the shadow space to store XMM6 and XMM7; the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 8
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+        SUB rsp, stack_size_padded
+    %endif
+    WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
+        %assign %%i xmm_regs_used
+        %rep xmm_regs_used-8
+            %assign %%i %%i-1
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+        %endrep
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
+        %endif
+    %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+    WIN64_RESTORE_XMM_INTERNAL %1
+    %assign stack_offset (stack_offset-stack_size_padded)
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+    WIN64_RESTORE_XMM_INTERNAL rsp
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    ALLOC_STACK %4
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+    %rep %0
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7
+        %assign num_args 7
+    %endif
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    SETUP_STACK_POINTER %4
+    ASSERT regs_used <= 7
+    PUSH_IF_USED 3, 4, 5, 6
+    ALLOC_STACK %4
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 6, 5, 4, 3
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+    %macro WIN64_SPILL_XMM 1
+    %endmacro
+    %macro WIN64_RESTORE_XMM 1
+    %endmacro
+    %macro WIN64_PUSH_XMM 0
+    %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+    %if has_epilogue
+        RET
+    %else
+        rep ret
+    %endif
+    annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+    %if notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %endif
+    ret
+    annotate_function_size
+%endmacro
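+
+; Sketch of the mechanism: the BRANCH_INSTR wrappers below record the address
+; after each branch in last_branch_adr, so the "times" expression emits a
+; single "rep" prefix (forming the 2-byte "rep ret") only when this ret
+; immediately follows a branch.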
+
+%macro BRANCH_INSTR 0-*
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %if notcpuflag(ssse3)
+                %%branch_instr equ $
+                %xdefine last_branch_adr %%branch_instr
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+    annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+    annotate_function_size
+    %if %1
+        %xdefine %%FUNCTION_PREFIX private_prefix
+        ; libvpx explicitly sets visibility in shared object builds. Avoid
+        ; setting visibility to hidden as it may break builds that split
+        ; sources on, e.g., directory boundaries.
+        %ifdef CHROMIUM
+            %xdefine %%VISIBILITY hidden
+        %else
+            %xdefine %%VISIBILITY
+        %endif
+    %else
+        %xdefine %%FUNCTION_PREFIX public_prefix
+        %xdefine %%VISIBILITY
+    %endif
+    %ifndef cglobaled_%2
+        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %xdefine %2.skip_prologue %2 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %2, 1
+    %endif
+    %xdefine current_function %2
+    %xdefine current_function_section __SECT__
+    %if FORMAT_ELF
+        global %2:function %%VISIBILITY
+    %elif FORMAT_MACHO
+        %ifdef __NASM_VER__
+            global %2
+        %else
+            global %2:private_extern
+        %endif
+    %else
+        global %2
+    %endif
+    align function_align
+    %2:
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %ifnidn %3, ""
+        PROLOGUE %3
+    %endif
+%endmacro
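+
+; Hypothetical usage (names invented for illustration):
+;   cglobal my_add, 3, 3, 0, dst, src, n
+; declares a mangled global label and runs PROLOGUE with 3 arguments in 3
+; gprs and no xmm spills, defining dstq/srcq/nq as aliases for r0/r1/r2.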
+
+%macro cextern 1
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+    %ifdef PREFIX
+        %xdefine %1 mangle(%1)
+    %endif
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+%macro const 1-2+
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    %if FORMAT_ELF
+        global %1:data hidden
+    %else
+        global %1
+    %endif
+    %1: %2
+%endmacro
+
+; This is needed for ELF; otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+    %ifdef __YASM_VER__
+        %ifdef current_function
+            %if FORMAT_ELF
+                current_function_section
+                %%ecf equ $
+                size current_function %%ecf - current_function
+                __SECT__
+            %endif
+        %endif
+    %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx      (1<<0)
+%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2     (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
+%assign cpuflags_avx      (1<<11)| cpuflags_sse42
+%assign cpuflags_xop      (1<<12)| cpuflags_avx
+%assign cpuflags_fma4     (1<<13)| cpuflags_avx
+%assign cpuflags_fma3     (1<<14)| cpuflags_avx
+%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<21)
+%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
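+
+; e.g. after "INIT_XMM sse2", cpuflag(sse) is 1 (the sse2 flag set includes
+; sse) while cpuflag(avx) is 0, so feature checks compose without listing
+; every implied flag.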
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-*
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
+    %if %0 >= 1
+        %rep %0
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
+        %xdefine SUFFIX _ %+ cpuname
+
+        %if cpuflag(avx)
+            %assign avx_enabled 1
+        %endif
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned)
+            %define movu mova
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
+            %define movu lddqu
+        %endif
+    %endif
+
+    %if ARCH_X86_64 || cpuflag(sse2)
+        %ifdef __NASM_VER__
+            ALIGNMODE k8
+        %else
+            CPU amdnop
+        %endif
+    %else
+        %ifdef __NASM_VER__
+            ALIGNMODE nop
+        %else
+            CPU basicnop
+        %endif
+    %endif
+%endmacro
+
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+    %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+        CAT_XDEFINE m, %%i, mm %+ %%i
+        CAT_XDEFINE nnmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %rep 8
+        CAT_UNDEF m, %%i
+        CAT_UNDEF nnmm, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64
+        %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, xmm %+ %%i
+        CAT_XDEFINE nnxmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64
+        %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, ymm %+ %%i
+        CAT_XDEFINE nnymm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1
+    %define  mmmm%1   mm%1
+    %define  mmxmm%1  mm%1
+    %define  mmymm%1  mm%1
+    %define xmmmm%1   mm%1
+    %define xmmxmm%1 xmm%1
+    %define xmmymm%1 xmm%1
+    %define ymmmm%1   mm%1
+    %define ymmxmm%1 xmm%1
+    %define ymmymm%1 ymm%1
+    %define xm%1 xmm %+ m%1
+    %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16
+    DECLARE_MMCAST i
+    %assign i i+1
+%endrep
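+
+; e.g. under INIT_YMM, m0 is ymm0 and xm0 resolves to xmm0 (its low half);
+; under INIT_XMM, xm0 and m0 both resolve to xmm0, so width-mixing code
+; remains valid across INIT modes.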
+
+; I often want to use macros that permute their arguments, e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+    %rep %0/2
+        %xdefine %%tmp%2 m%2
+        %rotate 2
+    %endrep
+    %rep %0/2
+        %xdefine m%1 %%tmp%2
+        CAT_XDEFINE nn, m%1, %1
+        %rotate 2
+    %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+    %ifnum %1 ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %else ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %endif
+%endmacro
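+
+; e.g. "SWAP 0, 1" exchanges the names: subsequent writes to m0 target the
+; register previously known as m1, at zero instruction cost.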
+
+%macro SWAP_INTERNAL_NUM 2-*
+    %rep %0-1
+        %xdefine %%tmp m%1
+        %xdefine m%1 m%2
+        %xdefine m%2 %%tmp
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+    %xdefine %%args nn %+ %1
+    %rep %0-1
+        %xdefine %%args %%args, nn %+ %2
+        %rotate 1
+    %endrep
+    SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE %%f, %%i, m %+ %%i
+        %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0
+        %assign %%i 0
+        %rep num_mmregs
+            CAT_XDEFINE m, %%i, %1_m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+    call_internal %1 %+ SUFFIX, %1
+%endmacro
+%macro call_internal 2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
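+
+; e.g. "call foo" inside an _sse2 function resolves to foo_sse2 when only
+; the suffixed variant was declared via cglobal, then reloads foo's saved
+; register permutation (if any) so mmreg names stay consistent.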
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
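+
+; Rationale: x86 sign-extends 8-bit immediates, so +128 cannot use the imm8
+; encoding while -128 can; e.g. "add eax, 128" (5 bytes) becomes
+; "sub eax, -128" (3 bytes).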
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+    %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-avx emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+    %ifnum sizeof%7
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6
+        %assign __sizeofreg sizeof%6
+    %else
+        %assign __sizeofreg mmsize
+    %endif
+    %assign __emulate_avx 0
+    %if avx_enabled && __sizeofreg >= 16
+        %xdefine __instr v%1
+    %else
+        %xdefine __instr %1
+        %if %0 >= 8+%4
+            %assign __emulate_avx 1
+        %endif
+    %endif
+    %ifnidn %2, fnord
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+                %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %endif
+        %endif
+    %endif
+
+    %if __emulate_avx
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %ifnidn %6, %7
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
+            %endif
+            %if %5 && %4 == 0
+                %ifnid %8
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %endif
+            %endif
+            %if __sizeofreg == 8
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
+            %else
+                MOVDQA %6, __src1
+            %endif
+        %endif
+        %if %0 >= 9
+            %1 %6, __src2, %9
+        %else
+            %1 %6, __src2
+        %endif
+    %elif %0 >= 9
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
+    %elif %0 == 7
+        __instr %6, %7
+    %else
+        __instr %6
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+    %assign i i+1
+%endrep
+%undef i
+%undef j
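+
+; e.g. q3120 expands to 0xD8 ((3<<6)|(1<<4)|(2<<2)|0), so
+; "pshufd m0, m1, q3120" documents the selected dword order 3,1,2,0 directly
+; in the operand.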
+
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %elifnidn %1, %4
+            %6 %1, %2, %3
+            %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
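+
+; e.g. on cpus without XOP, "pmacsww m0, m1, m2, m3" (with m0 distinct from
+; m3) expands to "pmullw m0, m1, m2" followed by "paddw m0, m3".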
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+    %push fma4_instr
+    %xdefine %$prefix %1
+    %rep %0 - 1
+        %macro %$prefix%2 4-6 %$prefix, %2
+            %if notcpuflag(fma3) && notcpuflag(fma4)
+                %error use of ``%5%6'' fma instruction in cpuname function: current_function
+            %elif cpuflag(fma4)
+                v%5%6 %1, %2, %3, %4
+            %elifidn %1, %2
+                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+                %ifid %3
+                    v%{5}213%6 %2, %3, %4
+                %else
+                    v%{5}132%6 %2, %4, %3
+                %endif
+            %elifidn %1, %3
+                v%{5}213%6 %3, %2, %4
+            %elifidn %1, %4
+                v%{5}231%6 %4, %2, %3
+            %else
+                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+    %pop
+%endmacro
+
+FMA4_INSTR fmadd,    pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub,    pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd,   pd, ps, sd, ss
+FMA4_INSTR fnmsub,   pd, ps, sd, ss
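+
+; e.g. "fmaddps m0, m1, m2, m0" assembles to vfmaddps on FMA4 cpus, and to
+; "vfmadd231ps m0, m1, m2" on FMA3 cpus since dst aliases src3.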
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__
+    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+        %macro vpbroadcastq 2
+            %if sizeof%1 == 16
+                movddup %1, %2
+            %else
+                vbroadcastsd %1, %2
+            %endif
+        %endmacro
+    %endif
+%endif

+ 190 - 0
thirdparty/libvpx/vp8/common/alloccommon.c

@@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "alloccommon.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+    int i;
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+        vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+    vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+    vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+    if (oci->post_proc_buffer_int_used)
+        vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+
+    vpx_free(oci->pp_limits_buffer);
+    oci->pp_limits_buffer = NULL;
+#endif
+
+    vpx_free(oci->above_context);
+    vpx_free(oci->mip);
+#if CONFIG_ERROR_CONCEALMENT
+    vpx_free(oci->prev_mip);
+    oci->prev_mip = NULL;
+#endif
+
+    oci->above_context = NULL;
+    oci->mip = NULL;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+    int i;
+
+    vp8_de_alloc_frame_buffers(oci);
+
+    /* our internal buffers are always multiples of 16 */
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+
+    if ((height & 0xf) != 0)
+        height += 16 - (height & 0xf);
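+
+    /* e.g. a 100x80 request is padded to 112x80 before allocation */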
+
+
+    for (i = 0; i < NUM_YV12_BUFFERS; i++)
+    {
+        oci->fb_idx_ref_cnt[i] = 0;
+        oci->yv12_fb[i].flags = 0;
+        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+            goto allocation_fail;
+    }
+
+    oci->new_fb_idx = 0;
+    oci->lst_fb_idx = 1;
+    oci->gld_fb_idx = 2;
+    oci->alt_fb_idx = 3;
+
+    oci->fb_idx_ref_cnt[0] = 1;
+    oci->fb_idx_ref_cnt[1] = 1;
+    oci->fb_idx_ref_cnt[2] = 1;
+    oci->fb_idx_ref_cnt[3] = 1;
+
+    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+
+    oci->mb_rows = height >> 4;
+    oci->mb_cols = width >> 4;
+    oci->MBs = oci->mb_rows * oci->mb_cols;
+    oci->mode_info_stride = oci->mb_cols + 1;
+    oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->mip)
+        goto allocation_fail;
+
+    oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+    /* Allocation of previous mode info will be done in vp8_decode_frame(),
+     * as it is decoder-only data */
+
+    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+    if (!oci->above_context)
+        goto allocation_fail;
+
+#if CONFIG_POSTPROC
+    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+        goto allocation_fail;
+
+    oci->post_proc_buffer_int_used = 0;
+    memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+    memset(oci->post_proc_buffer.buffer_alloc, 128,
+           oci->post_proc_buffer.frame_size);
+
+    /* Allocate buffer to store post-processing filter coefficients.
+     *
+     * Note: Round up mb_cols to support SIMD reads
+     */
+    oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+    if (!oci->pp_limits_buffer)
+        goto allocation_fail;
+#endif
+
+    return 0;
+
+allocation_fail:
+    vp8_de_alloc_frame_buffers(oci);
+    return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+    switch (cm->version)
+    {
+    case 0:
+        cm->no_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 0;
+        cm->full_pixel = 0;
+        break;
+    case 1:
+        cm->no_lpf = 0;
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 0;
+        break;
+    case 2:
+        cm->no_lpf = 1;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 0;
+        break;
+    case 3:
+        cm->no_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 1;
+        cm->full_pixel = 1;
+        break;
+    default:
+        /*4,5,6,7 are reserved for future use*/
+        cm->no_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
+        cm->use_bilinear_mc_filter = 0;
+        cm->full_pixel = 0;
+        break;
+    }
+}
+
+void vp8_create_common(VP8_COMMON *oci)
+{
+    vp8_machine_specific_config(oci);
+
+    vp8_init_mbmode_probs(oci);
+    vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+    oci->mb_no_coeff_skip = 1;
+    oci->no_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
+    oci->use_bilinear_mc_filter = 0;
+    oci->full_pixel = 0;
+    oci->multi_token_partition = ONE_PARTITION;
+    oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+    /* Initialize reference frame sign bias structure to defaults */
+    memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+    /* Disable buffer-to-buffer copying by default */
+    oci->copy_buffer_to_gf = 0;
+    oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+    vp8_de_alloc_frame_buffers(oci);
+}

+ 31 - 0
thirdparty/libvpx/vp8/common/alloccommon.h

@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ALLOCCOMMON_H_
+#define VP8_COMMON_ALLOCCOMMON_H_
+
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ALLOCCOMMON_H_

+ 181 - 0
thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c

@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif

+ 591 - 0
thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c

@@ -0,0 +1,591 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const uint8_t bifilter4_coeff[8][2] = {
+    {128,   0},
+    {112,  16},
+    { 96,  32},
+    { 80,  48},
+    { 64,  64},
+    { 48,  80},
+    { 32,  96},
+    { 16, 112}
+};
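+
+// Each row sums to 128: entry i holds the 7-bit fixed-point weights
+// (128 - 16*i, 16*i) for the i/8-pel position; results are narrowed below
+// with a rounding shift by 7 (vqrshrn_n_u16(..., 7)).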
+
+void vp8_bilinear_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+    uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16;
+    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+    if (xoffset == 0) {  // skip_1stpass_filter
+        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d26u8 = vld1_u8(src_ptr);
+    } else {
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q5u8 = vld1q_u8(src_ptr);
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+        d22u8 = vqrshrn_n_u16(q6u16, 7);
+        d23u8 = vqrshrn_n_u16(q7u16, 7);
+        d24u8 = vqrshrn_n_u16(q8u16, 7);
+        d25u8 = vqrshrn_n_u16(q9u16, 7);
+        d26u8 = vqrshrn_n_u16(q10u16, 7);
+    }
+
+    // secondpass_filter
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d25u8);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d22u8, d0u8);
+        q2u16 = vmull_u8(d23u8, d0u8);
+        q3u16 = vmull_u8(d24u8, d0u8);
+        q4u16 = vmull_u8(d25u8, d0u8);
+
+        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d5u8);
+    }
+    return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
+    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+    if (xoffset == 0) {  // skip_1stpass_filter
+        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d30u8 = vld1_u8(src_ptr);
+    } else {
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+
+        d22u8 = vqrshrn_n_u16(q6u16, 7);
+        d23u8 = vqrshrn_n_u16(q7u16, 7);
+        d24u8 = vqrshrn_n_u16(q8u16, 7);
+        d25u8 = vqrshrn_n_u16(q9u16, 7);
+
+        // first-pass filtering on the remaining 5 rows
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q5u8 = vld1q_u8(src_ptr);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+        d26u8 = vqrshrn_n_u16(q6u16, 7);
+        d27u8 = vqrshrn_n_u16(q7u16, 7);
+        d28u8 = vqrshrn_n_u16(q8u16, 7);
+        d29u8 = vqrshrn_n_u16(q9u16, 7);
+        d30u8 = vqrshrn_n_u16(q10u16, 7);
+    }
+
+    // secondpass_filter
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d29u8);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d22u8, d0u8);
+        q2u16 = vmull_u8(d23u8, d0u8);
+        q3u16 = vmull_u8(d24u8, d0u8);
+        q4u16 = vmull_u8(d25u8, d0u8);
+        q5u16 = vmull_u8(d26u8, d0u8);
+        q6u16 = vmull_u8(d27u8, d0u8);
+        q7u16 = vmull_u8(d28u8, d0u8);
+        q8u16 = vmull_u8(d29u8, d0u8);
+
+        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+        q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
+        q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+        d6u8 = vqrshrn_n_u16(q5u16, 7);
+        d7u8 = vqrshrn_n_u16(q6u16, 7);
+        d8u8 = vqrshrn_n_u16(q7u16, 7);
+        d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d9u8);
+    }
+    return;
+}
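+
+/* Scalar sketch of one pass, for reference (illustrative only, not part of
+ * the library):
+ *
+ *     for (r = 0; r < h; r++)
+ *         for (c = 0; c < w; c++)
+ *             out[r * out_pitch + c] = (in[r * in_pitch + c] * f[0] +
+ *                 in[r * in_pitch + c + step] * f[1] + 64) >> 7;
+ *
+ * where f = bifilter4_coeff[offset], and step is 1 for the horizontal pass
+ * and in_pitch for the vertical pass. */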
+
+void vp8_bilinear_predict16x16_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    int i;
+    unsigned char tmp[272];
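+    // 16 * 17 = 272: the first pass emits 16 columns by 17 rows, the extra
+    // row feeding the vertical second pass.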
+    unsigned char *tmpp;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+    uint8x8_t d19u8, d20u8, d21u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+    uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+
+    if (xoffset == 0) {  // secondpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q11u8 = vld1q_u8(src_ptr);
+        src_ptr += src_pixels_per_line;
+        for (i = 4; i > 0; i--) {
+            q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+            q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+
+            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+            d2u8 = vqrshrn_n_u16(q1u16, 7);
+            d3u8 = vqrshrn_n_u16(q2u16, 7);
+            d4u8 = vqrshrn_n_u16(q3u16, 7);
+            d5u8 = vqrshrn_n_u16(q4u16, 7);
+            d6u8 = vqrshrn_n_u16(q5u16, 7);
+            d7u8 = vqrshrn_n_u16(q6u16, 7);
+            d8u8 = vqrshrn_n_u16(q7u16, 7);
+            d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+            q1u8 = vcombine_u8(d2u8, d3u8);
+            q2u8 = vcombine_u8(d4u8, d5u8);
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+
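+            // the last row loaded seeds the next iteration's top row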
+            q11u8 = q15u8;
+
+            vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    if (yoffset == 0) {  // firstpass_bfilter16x16_only
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        for (i = 4; i > 0; i--) {
+            d2u8 = vld1_u8(src_ptr);
+            d3u8 = vld1_u8(src_ptr + 8);
+            d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d5u8 = vld1_u8(src_ptr);
+            d6u8 = vld1_u8(src_ptr + 8);
+            d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d8u8 = vld1_u8(src_ptr);
+            d9u8 = vld1_u8(src_ptr + 8);
+            d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+            d11u8 = vld1_u8(src_ptr);
+            d12u8 = vld1_u8(src_ptr + 8);
+            d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+            q7u16  = vmull_u8(d2u8, d0u8);
+            q8u16  = vmull_u8(d3u8, d0u8);
+            q9u16  = vmull_u8(d5u8, d0u8);
+            q10u16 = vmull_u8(d6u8, d0u8);
+            q11u16 = vmull_u8(d8u8, d0u8);
+            q12u16 = vmull_u8(d9u8, d0u8);
+            q13u16 = vmull_u8(d11u8, d0u8);
+            q14u16 = vmull_u8(d12u8, d0u8);
+
+            d2u8  = vext_u8(d2u8, d3u8, 1);
+            d5u8  = vext_u8(d5u8, d6u8, 1);
+            d8u8  = vext_u8(d8u8, d9u8, 1);
+            d11u8 = vext_u8(d11u8, d12u8, 1);
+
+            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+            d3u8  = vext_u8(d3u8, d4u8, 1);
+            d6u8  = vext_u8(d6u8, d7u8, 1);
+            d9u8  = vext_u8(d9u8, d10u8, 1);
+            d12u8 = vext_u8(d12u8, d13u8, 1);
+
+            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+            d14u8 = vqrshrn_n_u16(q7u16, 7);
+            d15u8 = vqrshrn_n_u16(q8u16, 7);
+            d16u8 = vqrshrn_n_u16(q9u16, 7);
+            d17u8 = vqrshrn_n_u16(q10u16, 7);
+            d18u8 = vqrshrn_n_u16(q11u16, 7);
+            d19u8 = vqrshrn_n_u16(q12u16, 7);
+            d20u8 = vqrshrn_n_u16(q13u16, 7);
+            d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+            q7u8 = vcombine_u8(d14u8, d15u8);
+            q8u8 = vcombine_u8(d16u8, d17u8);
+            q9u8 = vcombine_u8(d18u8, d19u8);
+            q10u8 = vcombine_u8(d20u8, d21u8);
+
+            vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
+            vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+    d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+    d2u8 = vld1_u8(src_ptr);
+    d3u8 = vld1_u8(src_ptr + 8);
+    d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d5u8 = vld1_u8(src_ptr);
+    d6u8 = vld1_u8(src_ptr + 8);
+    d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d8u8 = vld1_u8(src_ptr);
+    d9u8 = vld1_u8(src_ptr + 8);
+    d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+    d11u8 = vld1_u8(src_ptr);
+    d12u8 = vld1_u8(src_ptr + 8);
+    d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+    // First Pass: output_height lines x output_width columns (17x16)
+    tmpp = tmp;
+    for (i = 3; i > 0; i--) {
+        q7u16  = vmull_u8(d2u8, d0u8);
+        q8u16  = vmull_u8(d3u8, d0u8);
+        q9u16  = vmull_u8(d5u8, d0u8);
+        q10u16 = vmull_u8(d6u8, d0u8);
+        q11u16 = vmull_u8(d8u8, d0u8);
+        q12u16 = vmull_u8(d9u8, d0u8);
+        q13u16 = vmull_u8(d11u8, d0u8);
+        q14u16 = vmull_u8(d12u8, d0u8);
+
+        d2u8  = vext_u8(d2u8, d3u8, 1);
+        d5u8  = vext_u8(d5u8, d6u8, 1);
+        d8u8  = vext_u8(d8u8, d9u8, 1);
+        d11u8 = vext_u8(d11u8, d12u8, 1);
+
+        q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
+        q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
+        q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+        q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+        d3u8  = vext_u8(d3u8, d4u8, 1);
+        d6u8  = vext_u8(d6u8, d7u8, 1);
+        d9u8  = vext_u8(d9u8, d10u8, 1);
+        d12u8 = vext_u8(d12u8, d13u8, 1);
+
+        q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+        q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+        q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+        d14u8 = vqrshrn_n_u16(q7u16, 7);
+        d15u8 = vqrshrn_n_u16(q8u16, 7);
+        d16u8 = vqrshrn_n_u16(q9u16, 7);
+        d17u8 = vqrshrn_n_u16(q10u16, 7);
+        d18u8 = vqrshrn_n_u16(q11u16, 7);
+        d19u8 = vqrshrn_n_u16(q12u16, 7);
+        d20u8 = vqrshrn_n_u16(q13u16, 7);
+        d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+        d2u8 = vld1_u8(src_ptr);
+        d3u8 = vld1_u8(src_ptr + 8);
+        d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d5u8 = vld1_u8(src_ptr);
+        d6u8 = vld1_u8(src_ptr + 8);
+        d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d8u8 = vld1_u8(src_ptr);
+        d9u8 = vld1_u8(src_ptr + 8);
+        d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+        d11u8 = vld1_u8(src_ptr);
+        d12u8 = vld1_u8(src_ptr + 8);
+        d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+        q7u8 = vcombine_u8(d14u8, d15u8);
+        q8u8 = vcombine_u8(d16u8, d17u8);
+        q9u8 = vcombine_u8(d18u8, d19u8);
+        q10u8 = vcombine_u8(d20u8, d21u8);
+
+        vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
+        vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
+    }
+
+    // First-pass filtering for the remaining 5 rows
+    d14u8 = vld1_u8(src_ptr);
+    d15u8 = vld1_u8(src_ptr + 8);
+    d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
+
+    q9u16  = vmull_u8(d2u8, d0u8);
+    q10u16 = vmull_u8(d3u8, d0u8);
+    q11u16 = vmull_u8(d5u8, d0u8);
+    q12u16 = vmull_u8(d6u8, d0u8);
+    q13u16 = vmull_u8(d8u8, d0u8);
+    q14u16 = vmull_u8(d9u8, d0u8);
+
+    d2u8  = vext_u8(d2u8, d3u8, 1);
+    d5u8  = vext_u8(d5u8, d6u8, 1);
+    d8u8  = vext_u8(d8u8, d9u8, 1);
+
+    q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
+    q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+    q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+    d3u8  = vext_u8(d3u8, d4u8, 1);
+    d6u8  = vext_u8(d6u8, d7u8, 1);
+    d9u8  = vext_u8(d9u8, d10u8, 1);
+
+    q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+    q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+    q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+    q1u16 = vmull_u8(d11u8, d0u8);
+    q2u16 = vmull_u8(d12u8, d0u8);
+    q3u16 = vmull_u8(d14u8, d0u8);
+    q4u16 = vmull_u8(d15u8, d0u8);
+
+    d11u8 = vext_u8(d11u8, d12u8, 1);
+    d14u8 = vext_u8(d14u8, d15u8, 1);
+
+    q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+    q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+    d12u8 = vext_u8(d12u8, d13u8, 1);
+    d15u8 = vext_u8(d15u8, d16u8, 1);
+
+    q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+    q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+    d10u8 = vqrshrn_n_u16(q9u16, 7);
+    d11u8 = vqrshrn_n_u16(q10u16, 7);
+    d12u8 = vqrshrn_n_u16(q11u16, 7);
+    d13u8 = vqrshrn_n_u16(q12u16, 7);
+    d14u8 = vqrshrn_n_u16(q13u16, 7);
+    d15u8 = vqrshrn_n_u16(q14u16, 7);
+    d16u8 = vqrshrn_n_u16(q1u16, 7);
+    d17u8 = vqrshrn_n_u16(q2u16, 7);
+    d18u8 = vqrshrn_n_u16(q3u16, 7);
+    d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+    q5u8 = vcombine_u8(d10u8, d11u8);
+    q6u8 = vcombine_u8(d12u8, d13u8);
+    q7u8 = vcombine_u8(d14u8, d15u8);
+    q8u8 = vcombine_u8(d16u8, d17u8);
+    q9u8 = vcombine_u8(d18u8, d19u8);
+
+    vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
+    vst1q_u8((uint8_t *)tmpp, q9u8);
+
+    // secondpass_filter
+    d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+    d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+    tmpp = tmp;
+    q11u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    for (i = 4; i > 0; i--) {
+        q12u8 = vld1q_u8(tmpp); tmpp += 16;
+        q13u8 = vld1q_u8(tmpp); tmpp += 16;
+        q14u8 = vld1q_u8(tmpp); tmpp += 16;
+        q15u8 = vld1q_u8(tmpp); tmpp += 16;
+
+        q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+        q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+        q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+        q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+        q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+        q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+        q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+        q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+        q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+        q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+        q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+        q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+        q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+        q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+        q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+        d6u8 = vqrshrn_n_u16(q5u16, 7);
+        d7u8 = vqrshrn_n_u16(q6u16, 7);
+        d8u8 = vqrshrn_n_u16(q7u16, 7);
+        d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+        q1u8 = vcombine_u8(d2u8, d3u8);
+        q2u8 = vcombine_u8(d4u8, d5u8);
+        q3u8 = vcombine_u8(d6u8, d7u8);
+        q4u8 = vcombine_u8(d8u8, d9u8);
+
+        q11u8 = q15u8;
+
+        vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
+        vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
+    }
+    return;
+}

+ 59 - 0
thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c

@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 4; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem8x8_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 8; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem16x16_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int r;
+    uint8x16_t qtmp;
+
+    for (r = 0; r < 16; r++) {
+        qtmp = vld1q_u8(src);
+        vst1q_u8(dst, qtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}

+ 42 - 0
thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c

@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
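+/* When only the DC coefficient is present, the 4x4 inverse DCT collapses to
+ * a single value: each of the 16 output pixels is its predictor pixel plus
+ * (input_dc + 4) >> 3.  The loop below applies that to two rows per
+ * iteration with a widening add and a saturating narrow. */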
+void vp8_dc_only_idct_add_neon(
+        int16_t input_dc,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint16_t a1 = ((input_dc + 4) >> 3);
+    uint32x2_t d2u32 = vdup_n_u32(0);
+    uint8x8_t d2u8;
+    uint16x8_t q1u16;
+    uint16x8_t qAdd;
+
+    qAdd = vdupq_n_u16(a1);
+
+    for (i = 0; i < 2; i++) {
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+        pred_ptr += pred_stride;
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+        dst_ptr += dst_stride;
+    }
+}

+ 142 - 0
thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c

@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;
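+// 20091 = (sqrt(2) * cos(pi / 8) - 1) * 2^16 and 35468 = sqrt(2) *
+// sin(pi / 8) * 2^16.  35468 does not fit in int16_t and wraps to -30068,
+// so in Q16 both stored constants equal (multiplier - 1); after the
+// vqdmulh/vshr pair each product is x * (multiplier - 1), and the
+// vqaddq_s16(q3, q2) / vqaddq_s16(q4, q2) steps below add the missing x
+// back.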
+
+void vp8_dequant_idct_add_neon(
+        int16_t *input,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int32x2_t d14, d15;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    int16x8_t q1, q2, q3, q4, q5, q6;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x2x2_t d2tmp0, d2tmp1;
+    int16x4x2_t d2tmp2, d2tmp3;
+
+    d14 = d15 = vdup_n_s32(0);
+
+    // load input
+    q3 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+    input += 8;
+    q4 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+
+    // load dq
+    q5 = vld1q_s16(dq);
+    dq += 8;
+    q6 = vld1q_s16(dq);
+
+    // load src from dst
+    dst0 = dst;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+                                         vreinterpretq_u16_s16(q5)));
+    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+                                         vreinterpretq_u16_s16(q6)));
+
+    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    // loop 2
+    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+                                        vreinterpret_u8_s32(d14)));
+    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+                                        vreinterpret_u8_s32(d15)));
+
+    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}

+ 25 - 0
thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c

@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
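+/* Dequantization is an elementwise multiply of the 16 quantized
+ * coefficients by their dequantization factors; vld2q_s16/vst2q_s16 move
+ * all 16 int16 values as two de-interleaved vectors. */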
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+    int16x8x2_t qQ, qDQC, qDQ;
+
+    qQ   = vld2q_s16(d->qcoeff);
+    qDQC = vld2q_s16(DQC);
+
+    qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+    qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+    vst2q_s16(d->dqcoeff, qDQ);
+}

+ 96 - 0
thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c

@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+                               unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+                            unsigned char *dst, int stride);
+
+
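+/* eobs holds one end-of-block count per 4x4 block.  Reading two counts at a
+ * time as a short: a nonzero pair means at least one of the two blocks has
+ * coefficients, and masking with 0xfefe (dropping bit 0 of each count) is
+ * nonzero only when some count exceeds 1, i.e. a block has more than a DC
+ * coefficient and needs the full IDCT instead of the DC-only shortcut. */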
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+                                       unsigned char *dst,
+                                       int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)(eobs))[0])
+        {
+            if (((short *)eobs)[0] & 0xfefe)
+                idct_dequant_full_2x_neon (q, dq, dst, stride);
+            else
+                idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+        }
+
+        if (((short *)(eobs))[1])
+        {
+            if (((short *)eobs)[1] & 0xfefe)
+                idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+            else
+                idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+        }
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+                                        unsigned char *dstu,
+                                        unsigned char *dstv,
+                                        int stride, char *eobs)
+{
+    if (((short *)(eobs))[0])
+    {
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstu, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+    }
+
+    q    += 32;
+    dstu += 4*stride;
+
+    if (((short *)(eobs))[1])
+    {
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstu, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+    }
+
+    q += 32;
+
+    if (((short *)(eobs))[2])
+    {
+        if (((short *)eobs)[2] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstv, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+    }
+
+    q    += 32;
+    dstv += 4*stride;
+
+    if (((short *)(eobs))[3])
+    {
+        if (((short *)eobs)[3] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, dstv, stride);
+        else
+            idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+    }
+}

+ 63 - 0
thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c

@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
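+/* DC-only inverse transform for two horizontally adjacent 4x4 blocks.  The
+ * blocks are stored 16 shorts apart, so q[0] and q[16] are their DC
+ * coefficients; each block reduces to adding the rounded value
+ * (dc * dq + 4) >> 3 to its 4x4 patch of predictor pixels. */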
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int i, a0, a1;
+    int16x8x2_t q2Add;
+    int32x2_t d2s32 = vdup_n_s32(0),
+              d4s32 = vdup_n_s32(0);
+    uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    a0 = ((q[0] * dq) + 4) >> 3;
+    a1 = ((q[16] * dq) + 4) >> 3;
+    q[0] = q[16] = 0;
+    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+    q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+    for (i = 0; i < 2; i++, dst += 4) {
+        dst0 = dst;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d2s32));
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d4s32));
+
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+        d2s32 = vreinterpret_s32_u8(d2u8);
+        d4s32 = vreinterpret_s32_u8(d4u8);
+
+        dst0 = dst;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+    }
+    return;
+}

+ 185 - 0
thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c

@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+// 0x8a8c (35468) has its lowest bit clear, so it can be stored pre-shifted
+// by 1 to fit in int16_t; vqdmulh then yields (x * 35468) >> 16 directly.
+static const int16_t sinpi8sqrt2       = 17734;
+
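+/* Full dequant + IDCT for two horizontally adjacent 4x4 blocks.  After the
+ * vswp step each q register holds the same row of both blocks side by side,
+ * so a single pass of the transform butterflies serves both blocks. */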
+void idct_dequant_full_2x_neon(
+        int16_t *q,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 = vrshrq_n_s16(q6, 3);
+    q7 = vrshrq_n_s16(q7, 3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+                                          vreinterpret_u8_s32(d28)));
+    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+                                          vreinterpret_u8_s32(d29)));
+    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+                                          vreinterpret_u8_s32(d30)));
+    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+                                          vreinterpret_u8_s32(d31)));
+
+    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+    dst0 = dst;
+    dst1 = dst + 4;
+    vst1_lane_s32((int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    vst1_lane_s32((int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    vst1_lane_s32((int32_t *)dst0, d31, 0);
+    vst1_lane_s32((int32_t *)dst1, d31, 1);
+    return;
+}

+ 102 - 0
thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c

@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
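+/* The inverse 4x4 Walsh-Hadamard transform recovers the sixteen DC values
+ * of a macroblock from the Y2 block; each result is scattered to slot 0 of
+ * the corresponding 4x4 block's dqcoeff array, hence the stores stepping by
+ * 16 below. */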
+void vp8_short_inv_walsh4x4_neon(
+        int16_t *input,
+        int16_t *mb_dqcoeff) {
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    int16x8_t qAdd3;
+
+    q0s16 = vld1q_s16(input);
+    q1s16 = vld1q_s16(input + 8);
+
+    // 1st for loop
+    d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+    d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_low_s16(q1s16)));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_high_s16(q1s16)));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+                      vreinterpret_s16_s32(v2tmp3.val[0]));
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+                      vreinterpret_s16_s32(v2tmp3.val[1]));
+
+    // 2nd for loop
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    qAdd3 = vdupq_n_s16(3);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    q0s16 = vaddq_s16(q0s16, qAdd3);
+    q1s16 = vaddq_s16(q1s16, qAdd3);
+
+    q0s16 = vshrq_n_s16(q0s16, 3);
+    q1s16 = vshrq_n_s16(q1s16, 3);
+
+    // store
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16),  3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16),  3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+    mb_dqcoeff += 16;
+    return;
+}

+ 111 - 0
thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c

@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
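+/* Simple (non-MB) loop filter.  The edge is modified only where
+ *     2 * |p0 - q0| + |p1 - q1| / 2 <= blimit.
+ * With pixels re-centred to signed via XOR 0x80, the correction
+ *     a = clamp(3 * (q0 - p0) + clamp(p1 - q1))
+ * is applied as p0 += (a + 3) >> 3 and q0 -= (a + 4) >> 3. */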
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    uint8_t *sp;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+    int16x8_t q2s16, q3s16, q13s16;
+    int8x8_t d8s8, d9s8;
+    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    sp = s - (p << 1);
+    q5u8 = vld1q_u8(sp);
+    sp += p;
+    q6u8 = vld1q_u8(sp);
+    sp += p;
+    q7u8 = vld1q_u8(sp);
+    sp += p;
+    q8u8 = vld1q_u8(sp);
+
+    q15u8 = vabdq_u8(q6u8, q7u8);
+    q14u8 = vabdq_u8(q5u8, q8u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q13s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+    q7u8 = veorq_u8(q7u8, q0u8);
+    q8u8 = veorq_u8(q8u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+                     vreinterpretq_s8_u8(q8u8));
+
+    q2s16 = vmulq_s16(q2s16, q13s16);
+    q3s16 = vmulq_s16(q3s16, q13s16);
+
+    q10u8 = vdupq_n_u8(3);
+    q9u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+    d8s8 = vqmovn_s16(q2s16);
+    d9s8 = vqmovn_s16(q3s16);
+    q4s8 = vcombine_s8(d8s8, d9s8);
+
+    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q3s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    vst1q_u8(s, q7u8);
+    s -= p;
+    vst1q_u8(s, q6u8);
+    return;
+}
+
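+/* The b variants filter the three inner block edges of a macroblock (rows
+ * 4, 8 and 12); the mb variants filter the macroblock edge itself. */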
+void vp8_loop_filter_bhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}

+ 283 - 0
thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c

@@ -0,0 +1,283 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result) {
+    /*
+     * uint8x8x2_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    ---
+    * after vtrn_u8
+    00 10 02 12 | 04 14 06 16
+    01 11 03 13 | 05 15 07 17
+    */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  write_2x4(dst, pitch, result);
+  dst += pitch * 8;
+  write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+  vst2_lane_u8(dst, result, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result, 7);
+  dst += pitch;
+
+  vst2_lane_u8(dst, result2, 0);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 1);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 2);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 3);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 4);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 5);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 6);
+  dst += pitch;
+  vst2_lane_u8(dst, result2, 7);
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50 51 52 53
+    20 21 22 23 | 60 61 62 63
+    30 31 32 33 | 70 71 72 73
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 40 41 60 61
+    02 03 22 23 | 42 43 62 63
+    10 11 30 31 | 50 51 70 71
+    12 13 32 33 | 52 53 72 73
+
+    00 01 20 21 | 40 41 60 61
+    10 11 30 31 | 50 51 70 71
+    02 03 22 23 | 42 43 62 63
+    12 13 32 33 | 52 53 72 73
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 40 50 60 70
+    01 11 21 31 | 41 51 61 71
+    02 12 22 32 | 42 52 62 72
+    03 13 23 33 | 43 53 63 73
+    */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
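+
+/* read_4x8 loads a 4-pixel-wide strip from 8 consecutive rows and
+ * transposes it so each uint8x8_t holds one column; the vertical edge can
+ * then be filtered with the same arithmetic as the horizontal case, and
+ * write_2x8 transposes the two modified columns back. */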
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d14
+    uint8x8x2_t d3u8x2;  // d13, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p);
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p);
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
+
+    q15u8 = vabdq_u8(q5u8, q4u8);
+    q14u8 = vabdq_u8(q3u8, q6u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q11s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q3u8 = veorq_u8(q3u8, q0u8);
+    q4u8 = veorq_u8(q4u8, q0u8);
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+                      vreinterpretq_s8_u8(q6u8));
+
+    q2s16 = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q11u8 = vdupq_n_u8(3);
+    q12u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+    d28s8 = vqmovn_s16(q2s16);
+    d29s8 = vqmovn_s16(q13s16);
+    q14s8 = vcombine_s8(d28s8, d29s8);
+
+    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q14s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
+    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
+    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
+    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
+
+    src1 = s - 1;
+    write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}

+ 625 - 0
thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c

@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+        uint8x16_t qblimit,  // mblimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q4r,     // p2
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r,     // q1
+        uint8x16_t *q9r) {   // q2
+    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+    int8x16_t q0s8, q12s8, q14s8, q15s8;
+    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q1u8  = vabdq_u8(q9, q8);
+    q0u8  = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q1u8  = vmaxq_u8(q1u8, q0u8);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q12u8 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q1u8);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    q1u8 = vabdq_u8(q5, q8);
+    q12u8 = vqaddq_u8(q12u8, q12u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q0u8 = vdupq_n_u8(0x80);
+    q9 = veorq_u8(q9, q0u8);
+    q8 = veorq_u8(q8, q0u8);
+    q7 = veorq_u8(q7, q0u8);
+    q6 = veorq_u8(q6, q0u8);
+    q5 = veorq_u8(q5, q0u8);
+    q4 = veorq_u8(q4, q0u8);
+
+    q1u8 = vshrq_n_u8(q1u8, 1);
+    q12u8 = vqaddq_u8(q12u8, q1u8);
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+    q12u8 = vcgeq_u8(qblimit, q12u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                     vreinterpretq_s8_u8(q8));
+
+    q11s16 = vdupq_n_s16(3);
+    q2s16  = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q15u8 = vandq_u8(q15u8, q12u8);
+
+    q2s16  = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
+
+    q12u8 = vdupq_n_u8(3);
+    q11u8 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2 = vqmovn_s16(q2s16);
+    d3 = vqmovn_s16(q13s16);
+    q1s8 = vcombine_s8(d2, d3);
+    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
+    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
+    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q13s8 = vshrq_n_s8(q13s8, 3);
+
+    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
+    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
+
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
+    d5 = vdup_n_s8(9);
+    d4 = vdup_n_s8(18);
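+    // Outer-tap updates, applied where hev is clear:
+    // u = clamp((63 + filter * k) >> 7), with k = 27 for p0/q0,
+    // 18 for p1/q1 and 9 for p2/q2, as in the scalar mbfilter.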
+
+    q0s16  = vmlal_s8(vreinterpretq_s16_u16(q0u16),  vget_low_s8(q1s8),  d5);
+    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
+    d5 = vdup_n_s8(27);
+    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8),  d4);
+    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
+    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8),  d5);
+    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
+
+    d0  = vqshrn_n_s16(q0s16, 7);
+    d1  = vqshrn_n_s16(q11s16, 7);
+    d24 = vqshrn_n_s16(q12s16, 7);
+    d25 = vqshrn_n_s16(q13s16, 7);
+    d28 = vqshrn_n_s16(q14s16, 7);
+    d29 = vqshrn_n_s16(q15s16, 7);
+
+    q0s8  = vcombine_s8(d0, d1);
+    q12s8 = vcombine_s8(d24, d25);
+    q14s8 = vcombine_s8(d28, d29);
+
+    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
+    q0s8  = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
+    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
+    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
+    q15s8 = vqsubq_s8(q7s8, q14s8);
+    q14s8 = vqaddq_s8(q6s8, q14s8);
+
+    q1u8 = vdupq_n_u8(0x80);
+    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
+    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    src -= (pitch << 2);
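+    // Back up four rows so q3..q10 cover p3..q3 around the edge.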
+
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    src -= (pitch * 6);
+    vst1q_u8(src, q4);
+    src += pitch;
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    src += pitch;
+    vst1q_u8(src, q9);
+    return;
+}
+
+void vp8_mbloop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
+    d6 = vld1_u8(u);
+    u += pitch;
+    d7 = vld1_u8(v);
+    v += pitch;
+    d8 = vld1_u8(u);
+    u += pitch;
+    d9 = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    u -= (pitch * 6);
+    v -= (pitch * 6);
+    vst1_u8(u, vget_low_u8(q4));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q4));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+    u += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    v += pitch;
+    vst1_u8(u, vget_low_u8(q9));
+    vst1_u8(v, vget_high_u8(q9));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s1, *s2;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s1 = src - 4;
+    s2 = s1 + 8 * pitch;
+    d6  = vld1_u8(s1);
+    s1 += pitch;
+    d7  = vld1_u8(s2);
+    s2 += pitch;
+    d8  = vld1_u8(s1);
+    s1 += pitch;
+    d9  = vld1_u8(s2);
+    s2 += pitch;
+    d10 = vld1_u8(s1);
+    s1 += pitch;
+    d11 = vld1_u8(s2);
+    s2 += pitch;
+    d12 = vld1_u8(s1);
+    s1 += pitch;
+    d13 = vld1_u8(s2);
+    s2 += pitch;
+    d14 = vld1_u8(s1);
+    s1 += pitch;
+    d15 = vld1_u8(s2);
+    s2 += pitch;
+    d16 = vld1_u8(s1);
+    s1 += pitch;
+    d17 = vld1_u8(s2);
+    s2 += pitch;
+    d18 = vld1_u8(s1);
+    s1 += pitch;
+    d19 = vld1_u8(s2);
+    s2 += pitch;
+    d20 = vld1_u8(s1);
+    d21 = vld1_u8(s2);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
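+    // Three-stage transpose (32-, 16-, then 8-bit trn): the sixteen
+    // 8-pixel rows become eight 16-lane vectors holding columns p3..q3.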
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    s1 -= 7 * pitch;
+    s2 -= 7 * pitch;
+
+    vst1_u8(s1, vget_low_u8(q3));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q3));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q4));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q4));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q5));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q5));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q6));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q6));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q7));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q7));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q8));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q8));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q9));
+    s1 += pitch;
+    vst1_u8(s2, vget_high_u8(q9));
+    s2 += pitch;
+    vst1_u8(s1, vget_low_u8(q10));
+    vst1_u8(s2, vget_high_u8(q10));
+    return;
+}
+
+void vp8_mbloop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    vs = v - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d20 = vld1_u8(us);
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q4, &q5, &q6, &q7, &q8, &q9);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    ud = u - 4;
+    vst1_u8(ud, vget_low_u8(q3));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q4));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q5));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q6));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q7));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q8));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q9));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q10));
+
+    vd = v - 4;
+    vst1_u8(vd, vget_high_u8(q3));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q4));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q5));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q6));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q7));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q8));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q9));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q10));
+    return;
+}

+ 123 - 0
thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c

@@ -0,0 +1,123 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;
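+// Q16 fixed-point IDCT constants:
+//   20091 = round((sqrt(2) * cos(pi/8) - 1) * 65536)
+//   35468 = round( sqrt(2) * sin(pi/8)      * 65536)
+// Note that 35468 does not fit in int16_t: on two's-complement targets
+// it wraps to -30068, and the vqadd of the untouched input after the
+// doubling multiply-high and shift below restores the intended product.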
+
+void vp8_short_idct4x4llm_neon(
+        int16_t *input,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint32x2_t d6u32 = vdup_n_u32(0);
+    uint8x8_t d1u8;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    uint16x8_t q1u16;
+    int16x8_t q1s16, q2s16, q3s16, q4s16;
+    int32x2x2_t v2tmp0, v2tmp1;
+    int16x4x2_t v2tmp2, v2tmp3;
+
+    d2 = vld1_s16(input);
+    d3 = vld1_s16(input + 4);
+    d4 = vld1_s16(input + 8);
+    d5 = vld1_s16(input + 12);
+
+    // first pass (the first loop of the C reference, unrolled)
+    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
+    q2s16 = vcombine_s16(d3, d5);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    // second pass (the second loop of the C reference, unrolled)
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
+    // reconstruction: add the IDCT output to the prediction, two rows per pass
+    for (i = 0; i < 2; i++, q1s16 = q2s16) {
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+        pred_ptr += pred_stride;
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+                         vreinterpret_u8_u32(d6u32));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+        dst_ptr += dst_stride;
+    }
+    return;
+}

+ 1377 - 0
thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c

@@ -0,0 +1,1377 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+    {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positions are */
+    {0, -6,  123,  12,  -1, 0, 0, 0},  /*    just as per alpha -0.5 bicubic */
+    {2, -11, 108,  36,  -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -9,   93,  50,  -6, 0, 0, 0},
+    {3, -16,  77,  77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
+    {0, -6,   50,  93,  -9, 0, 0, 0},
+    {1, -8,   36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
+    {0, -1,   12, 123,  -6, 0, 0, 0},
+};
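+// Each filter row sums to 128, so the accumulators are normalized with
+// a rounding narrowing shift by 7 (vqrshrun_n_s16(..., 7)).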
+
+void vp8_sixtap_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x4_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
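+        // Taps 1 and 4 are non-positive in the table, so their absolute
+        // values are accumulated with vmlsl_u8; positive taps use vmlal_u8.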
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        q3u16 = vmull_u8(d22u8, d0u8);
+        q4u16 = vmull_u8(d23u8, d0u8);
+        q5u16 = vmull_u8(d24u8, d0u8);
+        q6u16 = vmull_u8(d25u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+        q7u16 = vmull_u8(d25u8, d3u8);
+        q8u16 = vmull_u8(d26u8, d3u8);
+        q9u16 = vmull_u8(d27u8, d3u8);
+        q10u16 = vmull_u8(d28u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (9x8)
+    if (yoffset == 0)  // firstpass_filter8x4_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
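+    // The 6-tap kernel needs two pixels of context on each side: back up
+    // two columns, and also two rows when a vertical pass follows.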
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+
+    q7u16  = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q8u16  = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q9u16  = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d1u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+    q7u16  = vmlsl_u8(q7u16, d28u8, d4u8);
+    q8u16  = vmlsl_u8(q8u16, d29u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d30u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+    q7u16  = vmlal_u8(q7u16, d28u8, d2u8);
+    q8u16  = vmlal_u8(q8u16, d29u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d30u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+    q3u16 = vmull_u8(d28u8, d3u8);
+    q4u16 = vmull_u8(d29u8, d3u8);
+    q5u16 = vmull_u8(d30u8, d3u8);
+    q6u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d22u8 = vqrshrun_n_s16(q7s16, 7);
+    d23u8 = vqrshrun_n_s16(q8s16, 7);
+    d24u8 = vqrshrun_n_s16(q9s16, 7);
+    d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+    if (yoffset == 0) {  // firstpass_filter8x4_only
+        vst1_u8(dst_ptr, d22u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d23u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d24u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d25u8);
+        return;
+    }
+
+    // First pass on the remaining 5 lines of data
+    src += src_pixels_per_line;
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d1u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16  = vmlsl_u8(q8u16, d27u8, d4u8);
+    q9u16  = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d2u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16  = vmlal_u8(q8u16, d27u8, d5u8);
+    q9u16  = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x4
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    q3u16 = vmull_u8(d22u8, d0u8);
+    q4u16 = vmull_u8(d23u8, d0u8);
+    q5u16 = vmull_u8(d24u8, d0u8);
+    q6u16 = vmull_u8(d25u8, d0u8);
+
+    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+    q7u16 = vmull_u8(d25u8, d3u8);
+    q8u16 = vmull_u8(d26u8, d3u8);
+    q9u16 = vmull_u8(d27u8, d3u8);
+    q10u16 = vmull_u8(d28u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+
+    q7s16 = vqaddq_s16(q7s16, q3s16);
+    q8s16 = vqaddq_s16(q8s16, q4s16);
+    q9s16 = vqaddq_s16(q9s16, q5s16);
+    q10s16 = vqaddq_s16(q10s16, q6s16);
+
+    d6u8 = vqrshrun_n_s16(q7s16, 7);
+    d7u8 = vqrshrun_n_s16(q8s16, 7);
+    d8u8 = vqrshrun_n_s16(q9s16, 7);
+    d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+    vst1_u8(dst_ptr, d6u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d7u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d8u8);
+    dst_ptr += dst_pitch;
+    vst1_u8(dst_ptr, d9u8);
+    return;
+}
+
+void vp8_sixtap_predict8x8_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src, *tmpp;
+    unsigned char tmp[64];
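+    // Buffers the first 8 of the 13 intermediate rows from the horizontal
+    // pass; the remaining 5 rows are kept in registers (d26u8..d30u8).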
+    int i;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
+    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x8_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d18u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d19u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d20u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d21u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        for (i = 2; i > 0; i--) {
+            q3u16 = vmull_u8(d18u8, d0u8);
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+            d23u8 = d27u8;
+            d24u8 = d28u8;
+            d25u8 = d29u8;
+            d26u8 = d30u8;
+
+            vst1_u8(dst_ptr, d6u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d7u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d8u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d9u8);
+            dst_ptr += dst_pitch;
+        }
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (13x8)
+    if (yoffset == 0)  // firstpass_filter8x8_only
+        src = src_ptr - 2;
+    else
+        src = src_ptr - 2 - (src_pixels_per_line * 2);
+
+    tmpp = tmp;
+    for (i = 2; i > 0; i--) {
+        q3u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q4u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q5u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+        q6u8 = vld1q_u8(src);
+        src += src_pixels_per_line;
+
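+        // Hint the upcoming source rows into cache before the long
+        // multiply-accumulate sequence below.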
+        __builtin_prefetch(src);
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
+        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+
+        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
+        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
+        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
+        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+
+        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
+        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
+
+        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+
+        q3u16 = vmull_u8(d28u8, d3u8);
+        q4u16 = vmull_u8(d29u8, d3u8);
+        q5u16 = vmull_u8(d30u8, d3u8);
+        q6u16 = vmull_u8(d31u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d22u8 = vqrshrun_n_s16(q7s16, 7);
+        d23u8 = vqrshrun_n_s16(q8s16, 7);
+        d24u8 = vqrshrun_n_s16(q9s16, 7);
+        d25u8 = vqrshrun_n_s16(q10s16, 7);
+
+        if (yoffset == 0) {  // firstpass_filter8x8_only
+            vst1_u8(dst_ptr, d22u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d23u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d24u8);
+            dst_ptr += dst_pitch;
+            vst1_u8(dst_ptr, d25u8);
+            dst_ptr += dst_pitch;
+        } else {
+            vst1_u8(tmpp, d22u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d23u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d24u8);
+            tmpp += 8;
+            vst1_u8(tmpp, d25u8);
+            tmpp += 8;
+        }
+    }
+    if (yoffset == 0)
+        return;
+
+    // First pass on the remaining 5 lines of data
+    q3u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q4u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q5u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q6u8 = vld1q_u8(src);
+    src += src_pixels_per_line;
+    q7u8 = vld1q_u8(src);
+
+    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
+    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
+    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
+
+    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
+    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
+    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
+    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
+
+    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
+    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
+    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
+
+    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
+    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
+    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
+    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
+    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
+
+    q3u16 = vmull_u8(d27u8, d3u8);
+    q4u16 = vmull_u8(d28u8, d3u8);
+    q5u16 = vmull_u8(d29u8, d3u8);
+    q6u16 = vmull_u8(d30u8, d3u8);
+    q7u16 = vmull_u8(d31u8, d3u8);
+
+    q3s16 = vreinterpretq_s16_u16(q3u16);
+    q4s16 = vreinterpretq_s16_u16(q4u16);
+    q5s16 = vreinterpretq_s16_u16(q5u16);
+    q6s16 = vreinterpretq_s16_u16(q6u16);
+    q7s16 = vreinterpretq_s16_u16(q7u16);
+    q8s16 = vreinterpretq_s16_u16(q8u16);
+    q9s16 = vreinterpretq_s16_u16(q9u16);
+    q10s16 = vreinterpretq_s16_u16(q10u16);
+    q11s16 = vreinterpretq_s16_u16(q11u16);
+    q12s16 = vreinterpretq_s16_u16(q12u16);
+
+    q8s16 = vqaddq_s16(q8s16, q3s16);
+    q9s16 = vqaddq_s16(q9s16, q4s16);
+    q10s16 = vqaddq_s16(q10s16, q5s16);
+    q11s16 = vqaddq_s16(q11s16, q6s16);
+    q12s16 = vqaddq_s16(q12s16, q7s16);
+
+    d26u8 = vqrshrun_n_s16(q8s16, 7);
+    d27u8 = vqrshrun_n_s16(q9s16, 7);
+    d28u8 = vqrshrun_n_s16(q10s16, 7);
+    d29u8 = vqrshrun_n_s16(q11s16, 7);
+    d30u8 = vqrshrun_n_s16(q12s16, 7);
+
+    // Second pass: 8x8
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    tmpp = tmp;
+    q9u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q10u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q11u8 = vld1q_u8(tmpp);
+    tmpp += 16;
+    q12u8 = vld1q_u8(tmpp);
+
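+    // d18u8..d25u8 reload the eight buffered rows; d26u8..d30u8 still
+    // hold the last five intermediate rows from the first pass.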
+    d18u8 = vget_low_u8(q9u8);
+    d19u8 = vget_high_u8(q9u8);
+    d20u8 = vget_low_u8(q10u8);
+    d21u8 = vget_high_u8(q10u8);
+    d22u8 = vget_low_u8(q11u8);
+    d23u8 = vget_high_u8(q11u8);
+    d24u8 = vget_low_u8(q12u8);
+    d25u8 = vget_high_u8(q12u8);
+
+    for (i = 2; i > 0; i--) {
+        q3u16 = vmull_u8(d18u8, d0u8);
+        q4u16 = vmull_u8(d19u8, d0u8);
+        q5u16 = vmull_u8(d20u8, d0u8);
+        q6u16 = vmull_u8(d21u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+        q7u16 = vmull_u8(d21u8, d3u8);
+        q8u16 = vmull_u8(d22u8, d3u8);
+        q9u16 = vmull_u8(d23u8, d3u8);
+        q10u16 = vmull_u8(d24u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 = vqaddq_s16(q7s16, q3s16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q9s16 = vqaddq_s16(q9s16, q5s16);
+        q10s16 = vqaddq_s16(q10s16, q6s16);
+
+        d6u8 = vqrshrun_n_s16(q7s16, 7);
+        d7u8 = vqrshrun_n_s16(q8s16, 7);
+        d8u8 = vqrshrun_n_s16(q9s16, 7);
+        d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+        d18u8 = d22u8;
+        d19u8 = d23u8;
+        d20u8 = d24u8;
+        d21u8 = d25u8;
+        d22u8 = d26u8;
+        d23u8 = d27u8;
+        d24u8 = d28u8;
+        d25u8 = d29u8;
+        d26u8 = d30u8;
+
+        vst1_u8(dst_ptr, d6u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d7u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d8u8);
+        dst_ptr += dst_pitch;
+        vst1_u8(dst_ptr, d9u8);
+        dst_ptr += dst_pitch;
+    }
+    return;
+}
+
+void vp8_sixtap_predict16x16_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src, *src_tmp, *dst, *tmpp;
+    unsigned char tmp[336];
+    int i, j;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
+    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
+    uint8x8_t d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint8x16_t q3u8, q4u8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
+    uint16x8_t q11u16, q12u16, q13u16, q15u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
+    int16x8_t q11s16, q12s16, q13s16, q15s16;
+
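+    // Three paths: xoffset == 0 runs only the vertical (second-pass) filter,
+    // yoffset == 0 runs only the horizontal (first-pass) filter, and the
+    // general case filters 21 rows horizontally into the 336-byte (21x16)
+    // tmp buffer before filtering it vertically into dst.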
+    if (xoffset == 0) {  // secondpass_filter16x16_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src_tmp = src_ptr - src_pixels_per_line * 2;
+        for (i = 0; i < 2; i++) {
+            src = src_tmp + i * 8;
+            dst = dst_ptr + i * 8;
+            d18u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d19u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d20u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d21u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            d22u8 = vld1_u8(src);
+            src += src_pixels_per_line;
+            for (j = 0; j < 4; j++) {
+                d23u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d24u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d25u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+                d26u8 = vld1_u8(src);
+                src += src_pixels_per_line;
+
+                q3u16 = vmull_u8(d18u8, d0u8);
+                q4u16 = vmull_u8(d19u8, d0u8);
+                q5u16 = vmull_u8(d20u8, d0u8);
+                q6u16 = vmull_u8(d21u8, d0u8);
+
+                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+                q7u16 = vmull_u8(d21u8, d3u8);
+                q8u16 = vmull_u8(d22u8, d3u8);
+                q9u16 = vmull_u8(d23u8, d3u8);
+                q10u16 = vmull_u8(d24u8, d3u8);
+
+                q3s16 = vreinterpretq_s16_u16(q3u16);
+                q4s16 = vreinterpretq_s16_u16(q4u16);
+                q5s16 = vreinterpretq_s16_u16(q5u16);
+                q6s16 = vreinterpretq_s16_u16(q6u16);
+                q7s16 = vreinterpretq_s16_u16(q7u16);
+                q8s16 = vreinterpretq_s16_u16(q8u16);
+                q9s16 = vreinterpretq_s16_u16(q9u16);
+                q10s16 = vreinterpretq_s16_u16(q10u16);
+
+                q7s16 = vqaddq_s16(q7s16, q3s16);
+                q8s16 = vqaddq_s16(q8s16, q4s16);
+                q9s16 = vqaddq_s16(q9s16, q5s16);
+                q10s16 = vqaddq_s16(q10s16, q6s16);
+
+                d6u8 = vqrshrun_n_s16(q7s16, 7);
+                d7u8 = vqrshrun_n_s16(q8s16, 7);
+                d8u8 = vqrshrun_n_s16(q9s16, 7);
+                d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+                d18u8 = d22u8;
+                d19u8 = d23u8;
+                d20u8 = d24u8;
+                d21u8 = d25u8;
+                d22u8 = d26u8;
+
+                vst1_u8(dst, d6u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d7u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d8u8);
+                dst += dst_pitch;
+                vst1_u8(dst, d9u8);
+                dst += dst_pitch;
+            }
+        }
+        return;
+    }
+
+    // load first_pass filter
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+    // First pass: output_height lines x output_width columns (16x16 when
+    // used alone, 21x16 ahead of the second pass)
+    if (yoffset == 0) {  // firstpass_filter16x16_only
+        src = src_ptr - 2;
+        dst = dst_ptr;
+        for (i = 0; i < 8; i++) {
+            d6u8 = vld1_u8(src);
+            d7u8 = vld1_u8(src + 8);
+            d8u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+            d9u8 = vld1_u8(src);
+            d10u8 = vld1_u8(src + 8);
+            d11u8 = vld1_u8(src + 16);
+            src += src_pixels_per_line;
+
+            __builtin_prefetch(src);
+            __builtin_prefetch(src + src_pixels_per_line);
+
+            q6u16 = vmull_u8(d6u8, d0u8);
+            q7u16 = vmull_u8(d7u8, d0u8);
+            q8u16 = vmull_u8(d9u8, d0u8);
+            q9u16 = vmull_u8(d10u8, d0u8);
+
+            d20u8 = vext_u8(d6u8, d7u8, 1);
+            d21u8 = vext_u8(d9u8, d10u8, 1);
+            d22u8 = vext_u8(d7u8, d8u8, 1);
+            d23u8 = vext_u8(d10u8, d11u8, 1);
+            d24u8 = vext_u8(d6u8, d7u8, 4);
+            d25u8 = vext_u8(d9u8, d10u8, 4);
+            d26u8 = vext_u8(d7u8, d8u8, 4);
+            d27u8 = vext_u8(d10u8, d11u8, 4);
+            d28u8 = vext_u8(d6u8, d7u8, 5);
+            d29u8 = vext_u8(d9u8, d10u8, 5);
+
+            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
+            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
+            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
+            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
+            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
+            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
+            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
+            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
+            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
+
+            d20u8 = vext_u8(d7u8, d8u8, 5);
+            d21u8 = vext_u8(d10u8, d11u8, 5);
+            d22u8 = vext_u8(d6u8, d7u8, 2);
+            d23u8 = vext_u8(d9u8, d10u8, 2);
+            d24u8 = vext_u8(d7u8, d8u8, 2);
+            d25u8 = vext_u8(d10u8, d11u8, 2);
+            d26u8 = vext_u8(d6u8, d7u8, 3);
+            d27u8 = vext_u8(d9u8, d10u8, 3);
+            d28u8 = vext_u8(d7u8, d8u8, 3);
+            d29u8 = vext_u8(d10u8, d11u8, 3);
+
+            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
+            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
+            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
+            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
+            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
+
+            q10u16 = vmull_u8(d26u8, d3u8);
+            q11u16 = vmull_u8(d27u8, d3u8);
+            q12u16 = vmull_u8(d28u8, d3u8);
+            q15u16 = vmull_u8(d29u8, d3u8);
+
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+            q11s16 = vreinterpretq_s16_u16(q11u16);
+            q12s16 = vreinterpretq_s16_u16(q12u16);
+            q15s16 = vreinterpretq_s16_u16(q15u16);
+
+            q6s16 = vqaddq_s16(q6s16, q10s16);
+            q8s16 = vqaddq_s16(q8s16, q11s16);
+            q7s16 = vqaddq_s16(q7s16, q12s16);
+            q9s16 = vqaddq_s16(q9s16, q15s16);
+
+            d6u8 = vqrshrun_n_s16(q6s16, 7);
+            d7u8 = vqrshrun_n_s16(q7s16, 7);
+            d8u8 = vqrshrun_n_s16(q8s16, 7);
+            d9u8 = vqrshrun_n_s16(q9s16, 7);
+
+            q3u8 = vcombine_u8(d6u8, d7u8);
+            q4u8 = vcombine_u8(d8u8, d9u8);
+            vst1q_u8(dst, q3u8);
+            dst += dst_pitch;
+            vst1q_u8(dst, q4u8);
+            dst += dst_pitch;
+        }
+        return;
+    }
+
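+    // General case, first pass: seven iterations of three rows each filter
+    // 21 rows horizontally into tmp.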
+    src = src_ptr - 2 - src_pixels_per_line * 2;
+    tmpp = tmp;
+    for (i = 0; i < 7; i++) {
+        d6u8 = vld1_u8(src);
+        d7u8 = vld1_u8(src + 8);
+        d8u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d9u8 = vld1_u8(src);
+        d10u8 = vld1_u8(src + 8);
+        d11u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+        d12u8 = vld1_u8(src);
+        d13u8 = vld1_u8(src + 8);
+        d14u8 = vld1_u8(src + 16);
+        src += src_pixels_per_line;
+
+        __builtin_prefetch(src);
+        __builtin_prefetch(src + src_pixels_per_line);
+        __builtin_prefetch(src + src_pixels_per_line * 2);
+
+        q8u16 = vmull_u8(d6u8, d0u8);
+        q9u16 = vmull_u8(d7u8, d0u8);
+        q10u16 = vmull_u8(d9u8, d0u8);
+        q11u16 = vmull_u8(d10u8, d0u8);
+        q12u16 = vmull_u8(d12u8, d0u8);
+        q13u16 = vmull_u8(d13u8, d0u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 1);
+        d29u8 = vext_u8(d9u8, d10u8, 1);
+        d30u8 = vext_u8(d12u8, d13u8, 1);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
+        d28u8 = vext_u8(d7u8, d8u8, 1);
+        d29u8 = vext_u8(d10u8, d11u8, 1);
+        d30u8 = vext_u8(d13u8, d14u8, 1);
+        q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 4);
+        d29u8 = vext_u8(d9u8, d10u8, 4);
+        d30u8 = vext_u8(d12u8, d13u8, 4);
+        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
+        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
+        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
+        d28u8 = vext_u8(d7u8, d8u8, 4);
+        d29u8 = vext_u8(d10u8, d11u8, 4);
+        d30u8 = vext_u8(d13u8, d14u8, 4);
+        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
+        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
+        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 5);
+        d29u8 = vext_u8(d9u8, d10u8, 5);
+        d30u8 = vext_u8(d12u8, d13u8, 5);
+        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
+        d28u8 = vext_u8(d7u8, d8u8, 5);
+        d29u8 = vext_u8(d10u8, d11u8, 5);
+        d30u8 = vext_u8(d13u8, d14u8, 5);
+        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 2);
+        d29u8 = vext_u8(d9u8, d10u8, 2);
+        d30u8 = vext_u8(d12u8, d13u8, 2);
+        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
+        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
+        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
+        d28u8 = vext_u8(d7u8, d8u8, 2);
+        d29u8 = vext_u8(d10u8, d11u8, 2);
+        d30u8 = vext_u8(d13u8, d14u8, 2);
+        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
+        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
+        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
+
+        d28u8 = vext_u8(d6u8, d7u8, 3);
+        d29u8 = vext_u8(d9u8, d10u8, 3);
+        d30u8 = vext_u8(d12u8, d13u8, 3);
+        d15u8 = vext_u8(d7u8, d8u8, 3);
+        d31u8 = vext_u8(d10u8, d11u8, 3);
+        d6u8  = vext_u8(d13u8, d14u8, 3);
+        q4u16 = vmull_u8(d28u8, d3u8);
+        q5u16 = vmull_u8(d29u8, d3u8);
+        q6u16 = vmull_u8(d30u8, d3u8);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+        q12s16 = vreinterpretq_s16_u16(q12u16);
+        q8s16 = vqaddq_s16(q8s16, q4s16);
+        q10s16 = vqaddq_s16(q10s16, q5s16);
+        q12s16 = vqaddq_s16(q12s16, q6s16);
+
+        q6u16 = vmull_u8(d15u8, d3u8);
+        q7u16 = vmull_u8(d31u8, d3u8);
+        q3u16 = vmull_u8(d6u8, d3u8);
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q11s16 = vreinterpretq_s16_u16(q11u16);
+        q13s16 = vreinterpretq_s16_u16(q13u16);
+        q9s16 = vqaddq_s16(q9s16, q6s16);
+        q11s16 = vqaddq_s16(q11s16, q7s16);
+        q13s16 = vqaddq_s16(q13s16, q3s16);
+
+        d6u8 = vqrshrun_n_s16(q8s16, 7);
+        d7u8 = vqrshrun_n_s16(q9s16, 7);
+        d8u8 = vqrshrun_n_s16(q10s16, 7);
+        d9u8 = vqrshrun_n_s16(q11s16, 7);
+        d10u8 = vqrshrun_n_s16(q12s16, 7);
+        d11u8 = vqrshrun_n_s16(q13s16, 7);
+
+        vst1_u8(tmpp, d6u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d7u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d8u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d9u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d10u8);
+        tmpp += 8;
+        vst1_u8(tmpp, d11u8);
+        tmpp += 8;
+    }
+
+    // Second pass: 16x16
+    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+    d0s8 = vdup_lane_s8(dtmps8, 0);
+    d1s8 = vdup_lane_s8(dtmps8, 1);
+    d2s8 = vdup_lane_s8(dtmps8, 2);
+    d3s8 = vdup_lane_s8(dtmps8, 3);
+    d4s8 = vdup_lane_s8(dtmps8, 4);
+    d5s8 = vdup_lane_s8(dtmps8, 5);
+    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
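+    // Second pass proper: filter each 8-pixel-wide half of tmp vertically;
+    // as in the 8x8 case, a nine-row window (d18-d26) yields four output
+    // rows per inner iteration.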
+    for (i = 0; i < 2; i++) {
+        dst = dst_ptr + 8 * i;
+        tmpp = tmp + 8 * i;
+        d18u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d19u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d20u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d21u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        d22u8 = vld1_u8(tmpp);
+        tmpp += 16;
+        for (j = 0; j < 4; j++) {
+            d23u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d24u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d25u8 = vld1_u8(tmpp);
+            tmpp += 16;
+            d26u8 = vld1_u8(tmpp);
+            tmpp += 16;
+
+            q3u16 = vmull_u8(d18u8, d0u8);
+            q4u16 = vmull_u8(d19u8, d0u8);
+            q5u16 = vmull_u8(d20u8, d0u8);
+            q6u16 = vmull_u8(d21u8, d0u8);
+
+            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
+            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
+            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
+            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
+
+            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
+            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
+            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
+            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
+
+            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
+            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
+            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
+            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
+
+            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
+            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
+            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
+            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
+
+            q7u16 = vmull_u8(d21u8, d3u8);
+            q8u16 = vmull_u8(d22u8, d3u8);
+            q9u16 = vmull_u8(d23u8, d3u8);
+            q10u16 = vmull_u8(d24u8, d3u8);
+
+            q3s16 = vreinterpretq_s16_u16(q3u16);
+            q4s16 = vreinterpretq_s16_u16(q4u16);
+            q5s16 = vreinterpretq_s16_u16(q5u16);
+            q6s16 = vreinterpretq_s16_u16(q6u16);
+            q7s16 = vreinterpretq_s16_u16(q7u16);
+            q8s16 = vreinterpretq_s16_u16(q8u16);
+            q9s16 = vreinterpretq_s16_u16(q9u16);
+            q10s16 = vreinterpretq_s16_u16(q10u16);
+
+            q7s16 = vqaddq_s16(q7s16, q3s16);
+            q8s16 = vqaddq_s16(q8s16, q4s16);
+            q9s16 = vqaddq_s16(q9s16, q5s16);
+            q10s16 = vqaddq_s16(q10s16, q6s16);
+
+            d6u8 = vqrshrun_n_s16(q7s16, 7);
+            d7u8 = vqrshrun_n_s16(q8s16, 7);
+            d8u8 = vqrshrun_n_s16(q9s16, 7);
+            d9u8 = vqrshrun_n_s16(q10s16, 7);
+
+            d18u8 = d22u8;
+            d19u8 = d23u8;
+            d20u8 = d24u8;
+            d21u8 = d25u8;
+            d22u8 = d26u8;
+
+            vst1_u8(dst, d6u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d7u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d8u8);
+            dst += dst_pitch;
+            vst1_u8(dst, d9u8);
+            dst += dst_pitch;
+        }
+    }
+    return;
+}

+ 550 - 0
thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c

@@ -0,0 +1,550 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
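+// Normal loop filter: a filter mask is built from the pixel differences
+// across the edge (checked against limit/blimit) and a high-edge-variance
+// mask (checked against thresh); the masked 4-tap adjustment is then
+// applied to p1, p0, q0 and q1.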
+static INLINE void vp8_loop_filter_neon(
+        uint8x16_t qblimit,  // flimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3    = vabdq_u8(q9, q8);
+    q4    = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3    = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q10 = vdupq_n_u8(3);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                    vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vmovl_u8(vget_low_u8(q10));
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
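+    // ps0 = p0 + Filter2 and qs0 = q0 - Filter1, where
+    // Filter2 = (f + 3) >> 3 and Filter1 = (f + 4) >> 3.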
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
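+    // Outer taps: u = (Filter1 + 1) >> 1, cleared where the hev mask is set,
+    // adjusts p1 and q1.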
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    q0u8 = vdupq_n_u8(0x80);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+    src -= (pitch << 2);
+
+    q3 = vld1q_u8(src);
+    src += pitch;
+    q4 = vld1q_u8(src);
+    src += pitch;
+    q5 = vld1q_u8(src);
+    src += pitch;
+    q6 = vld1q_u8(src);
+    src += pitch;
+    q7 = vld1q_u8(src);
+    src += pitch;
+    q8 = vld1q_u8(src);
+    src += pitch;
+    q9 = vld1q_u8(src);
+    src += pitch;
+    q10 = vld1q_u8(src);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    src -= (pitch * 5);
+    vst1q_u8(src, q5);
+    src += pitch;
+    vst1q_u8(src, q6);
+    src += pitch;
+    vst1q_u8(src, q7);
+    src += pitch;
+    vst1q_u8(src, q8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    u -= (pitch << 2);
+    v -= (pitch << 2);
+
+    d6  = vld1_u8(u);
+    u += pitch;
+    d7  = vld1_u8(v);
+    v += pitch;
+    d8  = vld1_u8(u);
+    u += pitch;
+    d9  = vld1_u8(v);
+    v += pitch;
+    d10 = vld1_u8(u);
+    u += pitch;
+    d11 = vld1_u8(v);
+    v += pitch;
+    d12 = vld1_u8(u);
+    u += pitch;
+    d13 = vld1_u8(v);
+    v += pitch;
+    d14 = vld1_u8(u);
+    u += pitch;
+    d15 = vld1_u8(v);
+    v += pitch;
+    d16 = vld1_u8(u);
+    u += pitch;
+    d17 = vld1_u8(v);
+    v += pitch;
+    d18 = vld1_u8(u);
+    u += pitch;
+    d19 = vld1_u8(v);
+    v += pitch;
+    d20 = vld1_u8(u);
+    d21 = vld1_u8(v);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    u -= (pitch * 5);
+    vst1_u8(u, vget_low_u8(q5));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q6));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q7));
+    u += pitch;
+    vst1_u8(u, vget_low_u8(q8));
+
+    v -= (pitch * 5);
+    vst1_u8(v, vget_high_u8(q5));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q6));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q7));
+    v += pitch;
+    vst1_u8(v, vget_high_u8(q8));
+    return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+                             const uint8x8x4_t result) {
+#ifdef VPX_INCOMPATIBLE_GCC
+    /*
+     * uint8x8x4_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    20 21 22 23 | 24 25 26 27
+    30 31 32 33 | 34 35 36 37
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 04 05 24 25
+    02 03 22 23 | 06 07 26 27
+    10 11 30 31 | 14 15 34 35
+    12 13 32 33 | 16 17 36 37
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 04 14 24 34
+    01 11 21 31 | 05 15 25 35
+    02 12 22 32 | 06 16 26 36
+    03 13 23 33 | 07 17 27 37
+    */
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+                                          vreinterpret_u16_u8(result.val[2]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+                                          vreinterpret_u16_u8(result.val[3]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+    dst += pitch;
+    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#else
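+    // vst4_lane_u8 writes one interleaved (p1, p0, q0, q1) 4-byte column
+    // per destination row.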
+    vst4_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst4_lane_u8(dst, result, 7);
+#endif  // VPX_INCOMPATIBLE_GCC
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+        unsigned char *src,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh) {
+    unsigned char *s, *d;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    s = src - 4;
+    d6  = vld1_u8(s);
+    s += pitch;
+    d8  = vld1_u8(s);
+    s += pitch;
+    d10 = vld1_u8(s);
+    s += pitch;
+    d12 = vld1_u8(s);
+    s += pitch;
+    d14 = vld1_u8(s);
+    s += pitch;
+    d16 = vld1_u8(s);
+    s += pitch;
+    d18 = vld1_u8(s);
+    s += pitch;
+    d20 = vld1_u8(s);
+    s += pitch;
+    d7  = vld1_u8(s);
+    s += pitch;
+    d9  = vld1_u8(s);
+    s += pitch;
+    d11 = vld1_u8(s);
+    s += pitch;
+    d13 = vld1_u8(s);
+    s += pitch;
+    d15 = vld1_u8(s);
+    s += pitch;
+    d17 = vld1_u8(s);
+    s += pitch;
+    d19 = vld1_u8(s);
+    s += pitch;
+    d21 = vld1_u8(s);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
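+    // Transpose the block with staged u32/u16/u8 vtrnq operations so the
+    // vertical edge lies along rows and the horizontal-edge kernel applies.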
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+
+    d = src - 2;
+    write_4x8(d, pitch, q4ResultL);
+    d += pitch * 8;
+    write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+        unsigned char *u,
+        int pitch,
+        unsigned char blimit,
+        unsigned char limit,
+        unsigned char thresh,
+        unsigned char *v) {
+    unsigned char *us, *ud;
+    unsigned char *vs, *vd;
+    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+    uint8x16_t q5, q6, q7, q8, q9, q10;
+    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+    uint8x8x4_t q4ResultH, q4ResultL;
+
+    qblimit = vdupq_n_u8(blimit);
+    qlimit  = vdupq_n_u8(limit);
+    qthresh = vdupq_n_u8(thresh);
+
+    us = u - 4;
+    d6 = vld1_u8(us);
+    us += pitch;
+    d8 = vld1_u8(us);
+    us += pitch;
+    d10 = vld1_u8(us);
+    us += pitch;
+    d12 = vld1_u8(us);
+    us += pitch;
+    d14 = vld1_u8(us);
+    us += pitch;
+    d16 = vld1_u8(us);
+    us += pitch;
+    d18 = vld1_u8(us);
+    us += pitch;
+    d20 = vld1_u8(us);
+
+    vs = v - 4;
+    d7 = vld1_u8(vs);
+    vs += pitch;
+    d9 = vld1_u8(vs);
+    vs += pitch;
+    d11 = vld1_u8(vs);
+    vs += pitch;
+    d13 = vld1_u8(vs);
+    vs += pitch;
+    d15 = vld1_u8(vs);
+    vs += pitch;
+    d17 = vld1_u8(vs);
+    vs += pitch;
+    d19 = vld1_u8(vs);
+    vs += pitch;
+    d21 = vld1_u8(vs);
+
+    q3 = vcombine_u8(d6, d7);
+    q4 = vcombine_u8(d8, d9);
+    q5 = vcombine_u8(d10, d11);
+    q6 = vcombine_u8(d12, d13);
+    q7 = vcombine_u8(d14, d15);
+    q8 = vcombine_u8(d16, d17);
+    q9 = vcombine_u8(d18, d19);
+    q10 = vcombine_u8(d20, d21);
+
+    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+                       vreinterpretq_u16_u32(q2tmp2.val[0]));
+    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+                       vreinterpretq_u16_u32(q2tmp3.val[0]));
+    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+                       vreinterpretq_u16_u32(q2tmp2.val[1]));
+    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+                       vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+                       vreinterpretq_u8_u16(q2tmp5.val[0]));
+    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+                       vreinterpretq_u8_u16(q2tmp5.val[1]));
+    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+                       vreinterpretq_u8_u16(q2tmp7.val[0]));
+    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+                       vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+    q3 = q2tmp8.val[0];
+    q4 = q2tmp8.val[1];
+    q5 = q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+                         q5, q6, q7, q8, q9, q10,
+                         &q5, &q6, &q7, &q8);
+
+    q4ResultL.val[0] = vget_low_u8(q5);   // d10
+    q4ResultL.val[1] = vget_low_u8(q6);   // d12
+    q4ResultL.val[2] = vget_low_u8(q7);   // d14
+    q4ResultL.val[3] = vget_low_u8(q8);   // d16
+    ud = u - 2;
+    write_4x8(ud, pitch, q4ResultL);
+
+    q4ResultH.val[0] = vget_high_u8(q5);  // d11
+    q4ResultH.val[1] = vget_high_u8(q6);  // d13
+    q4ResultH.val[2] = vget_high_u8(q7);  // d15
+    q4ResultH.val[3] = vget_high_u8(q8);  // d17
+    vd = v - 2;
+    write_4x8(vd, pitch, q4ResultH);
+}

+ 22 - 0
thirdparty/libvpx/vp8/common/blockd.c

@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
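+/* Map each of the 25 blocks (16 Y, 4 U, 4 V, 1 Y2) to its entropy-context
+   slot in ENTROPY_CONTEXT_PLANES: y1 occupies 0-3, u 4-5, v 6-7, y2 is 8. */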
+const unsigned char vp8_block2left[25] =
+{
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp8_block2above[25] =
+{
+    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};

+ 312 - 0
thirdparty/libvpx/vp8/common/blockd.h

@@ -0,0 +1,312 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_BLOCKD_H_
+#define VP8_COMMON_BLOCKD_H_
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS   3
+#define MAX_MB_SEGMENTS         4
+
+#define MAX_REF_LF_DELTAS       4
+#define MAX_MODE_LF_DELTAS      4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA   0
+#define SEGMENT_ABSDATA     1
+
+typedef struct
+{
+    int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC    0
+#define PLANE_TYPE_Y2         1
+#define PLANE_TYPE_UV         2
+#define PLANE_TYPE_Y_WITH_DC  3
+
+
+typedef char ENTROPY_CONTEXT;
+typedef struct
+{
+    ENTROPY_CONTEXT y1[4];
+    ENTROPY_CONTEXT u[2];
+    ENTROPY_CONTEXT v[2];
+    ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+    Dest = (A)+(B);
+
+
+typedef enum
+{
+    KEY_FRAME = 0,
+    INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+    DC_PRED,            /* average of above and left pixels */
+    V_PRED,             /* vertical prediction */
+    H_PRED,             /* horizontal prediction */
+    TM_PRED,            /* TrueMotion prediction */
+    B_PRED,             /* block based prediction, each block has its own prediction mode */
+
+    NEARESTMV,
+    NEARMV,
+    ZEROMV,
+    NEWMV,
+    SPLITMV,
+
+    MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum
+{
+    MB_LVL_ALT_Q = 0,               /* Use alternate quantizer */
+    MB_LVL_ALT_LF = 1,              /* Use alternate loop filter value */
+    MB_LVL_MAX = 2                  /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ    0x01
+#define SEGMENT_ALT_LF  0x02
+
+#define VP8_YMODES  (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+    B_DC_PRED,          /* average of above and left pixels */
+    B_TM_PRED,
+
+    B_VE_PRED,           /* vertical prediction */
+    B_HE_PRED,           /* horizontal prediction */
+
+    B_LD_PRED,
+    B_RD_PRED,
+
+    B_VR_PRED,
+    B_VL_PRED,
+    B_HD_PRED,
+    B_HU_PRED,
+
+    LEFT4X4,
+    ABOVE4X4,
+    ZERO4X4,
+    NEW4X4,
+
+    B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+   modes for the Y blocks to the left and above us; for interframes, there
+   is a single probability table. */
+
+union b_mode_info
+{
+    B_PREDICTION_MODE as_mode;
+    int_mv mv;
+};
+
+typedef enum
+{
+    INTRA_FRAME = 0,
+    LAST_FRAME = 1,
+    GOLDEN_FRAME = 2,
+    ALTREF_FRAME = 3,
+    MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+    uint8_t mode, uv_mode;
+    uint8_t ref_frame;
+    uint8_t is_4x4;
+    int_mv mv;
+
+    uint8_t partitioning;
+    uint8_t mb_skip_coeff;                                /* does this mb have coefficients at all; 1 = no coefficients, 0 = tokens must be decoded */
+    uint8_t need_to_clamp_mvs;
+    uint8_t segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+    MB_MODE_INFO mbmi;
+    union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information that needs to be stored for the higher-resolution encoder */
+typedef struct
+{
+    MB_PREDICTION_MODE mode;
+    MV_REFERENCE_FRAME ref_frame;
+    int_mv mv;
+    int dissim;    /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information that needs to be stored for the
+ *  higher-resolution encoder */
+typedef struct
+{
+    FRAME_TYPE frame_type;
+    int is_frame_dropped;
+    // The frame rate for the lowest resolution.
+    double low_res_framerate;
+    /* The frame number of each reference frame */
+    unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+    // The video frame counter value for the key frame, for lowest resolution.
+    unsigned int key_frame_counter_value;
+    LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd
+{
+    short *qcoeff;
+    short *dqcoeff;
+    unsigned char  *predictor;
+    short *dequant;
+
+    int offset;
+    char *eob;
+
+    union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd
+{
+    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED(16, short, qcoeff[400]);
+    DECLARE_ALIGNED(16, short, dqcoeff[400]);
+    DECLARE_ALIGNED(16, char,  eobs[25]);
+
+    DECLARE_ALIGNED(16, short,  dequant_y1[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y1_dc[16]);
+    DECLARE_ALIGNED(16, short,  dequant_y2[16]);
+    DECLARE_ALIGNED(16, short,  dequant_uv[16]);
+
+    /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+    BLOCKD block[25];
+    int fullpixel_mask;
+
+    YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+    YV12_BUFFER_CONFIG dst;
+
+    MODE_INFO *mode_info_context;
+    int mode_info_stride;
+
+    FRAME_TYPE frame_type;
+
+    int up_available;
+    int left_available;
+
+    unsigned char *recon_above[3];
+    unsigned char *recon_left[3];
+    int recon_left_stride[2];
+
+    /* Y,U,V,Y2 */
+    ENTROPY_CONTEXT_PLANES *above_context;
+    ENTROPY_CONTEXT_PLANES *left_context;
+
+    /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+    unsigned char segmentation_enabled;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+    unsigned char update_mb_segmentation_map;
+
+    /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+    unsigned char update_mb_segmentation_data;
+
+    /* 0 (SEGMENT_DELTADATA) 1 (SEGMENT_ABSDATA): how segment feature data is coded. */
+    unsigned char mb_segement_abs_delta;
+
+    /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+    /* are enabled, and, when enabled, the probabilities used to decode the per-MB flags in MB_MODE_INFO */
+    vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];         /* Probability Tree used to code Segment number */
+
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            /* Segment parameters */
+
+    /* mode_based Loop filter adjustment */
+    unsigned char mode_ref_lf_delta_enabled;
+    unsigned char mode_ref_lf_delta_update;
+
+    /* Delta values have the range +/- MAX_LOOP_FILTER */
+    signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
+    signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
+    signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];                      /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+    signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                           /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+    /* Distance of MB away from frame edges */
+    int mb_to_left_edge;
+    int mb_to_right_edge;
+    int mb_to_top_edge;
+    int mb_to_bottom_edge;
+
+
+
+    vp8_subpix_fn_t  subpixel_predict;
+    vp8_subpix_fn_t  subpixel_predict8x4;
+    vp8_subpix_fn_t  subpixel_predict8x8;
+    vp8_subpix_fn_t  subpixel_predict16x16;
+
+    void *current_bc;
+
+    int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer can be used for other
+     * purposes.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_BLOCKD_H_

+ 197 - 0
thirdparty/libvpx/vp8/common/coefupdateprobs.h

@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
+#define VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Update probabilities for the nodes in the token entropy tree.
+   Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
+{
+    {
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+    {
+        {
+            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+        {
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+        },
+    },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_COEFUPDATEPROBS_H_

+ 48 - 0
thirdparty/libvpx/vp8/common/common.h

@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_COMMON_H_
+#define VP8_COMMON_COMMON_H_
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp8_copy( Dest, Src) { \
+        assert( sizeof( Dest) == sizeof( Src)); \
+        memcpy( Dest, Src, sizeof( Src)); \
+    }
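+
+/* Usage sketch (illustrative names): with two same-typed fixed arrays,
+   e.g. vp8_prob dst[8][3] and src[8][3], vp8_copy(dst, src) asserts the
+   sizes match and copies the whole array with a single memcpy. */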
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+        assert( sizeof( *Dest) == sizeof( *Src)); \
+        memcpy( Dest, Src, N * sizeof( *Src)); \
+    }
+
+#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *Dest));
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_COMMON_H_

+ 32 - 0
thirdparty/libvpx/vp8/common/copy_c.c

@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "./vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy a 32-pixel-wide (two macroblock) strip of 'height' rows to a buffer */
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,
+                    unsigned char *dst_ptr, int dst_stride,
+                    int height)
+{
+    int r;
+
+    for (r = 0; r < height; r++)
+    {
+        memcpy(dst_ptr, src_ptr, 32);
+
+        src_ptr += src_stride;
+        dst_ptr += dst_stride;
+    }
+}

+ 155 - 0
thirdparty/libvpx/vp8/common/debugmodes.c

@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+    int mb_row;
+    int mb_col;
+    int mb_index = 0;
+    FILE *mvs = fopen("mvs.stt", "a");
+
+    /* Bail out rather than dereference a NULL FILE* if the log file
+     * cannot be opened. */
+    if (mvs == NULL)
+        return;
+
+    /* print out the macroblock Y modes */
+    mb_index = 0;
+    fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    mb_index = 0;
+    fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+            mb_index++;
+        }
+
+        fprintf(mvs, "\n");
+        mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock UV modes */
+    mb_index = 0;
+    fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+
+            fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+    /* print out the block modes */
+    fprintf(mvs, "Mbs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+
+                if (mi[mb_index].mbmi.mode == B_PRED)
+                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+                else
+                    fprintf(mvs, "xx ");
+
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+    /* print out the macroblock mvs */
+    mb_index = 0;
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+
+    for (mb_row = 0; mb_row < rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cols; mb_col++)
+        {
+            fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+            mb_index++;
+        }
+
+        mb_index++;
+        fprintf(mvs, "\n");
+    }
+
+    fprintf(mvs, "\n");
+
+
+    /* print out the block modes */
+    fprintf(mvs, "MVs for Frame %d\n", frame);
+    {
+        int b_row;
+
+        for (b_row = 0; b_row < 4 * rows; b_row++)
+        {
+            int b_col;
+            int bindex;
+
+            for (b_col = 0; b_col < 4 * cols; b_col++)
+            {
+                mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+                bindex = (b_row & 3) * 4 + (b_col & 3);
+                fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+            }
+
+            fprintf(mvs, "\n");
+        }
+    }
+    fprintf(mvs, "\n");
+
+
+    fclose(mvs);
+}
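The index arithmetic above relies on the mode-info array being stored with a stride of cols + 1 (one border column per row); that is why mb_index is bumped once more after each row, and why the block loops compute their index with (cols + 1). A sketch of the mapping, with hypothetical helper names local to this example:

    #include <stdio.h>

    static int mb_index_of(int mb_row, int mb_col, int cols)
    {
        return mb_row * (cols + 1) + mb_col;   /* stride is cols + 1 */
    }

    static int b_index_of(int b_row, int b_col, int cols, int *bindex)
    {
        *bindex = (b_row & 3) * 4 + (b_col & 3);  /* 4x4 position in the MB */
        return mb_index_of(b_row >> 2, b_col >> 2, cols);
    }

    int main(void)
    {
        int bindex;
        /* 4x4 block (5, 6) in a 10-MB-wide frame: macroblock index 12,
         * sub-block 6. */
        printf("mb=%d b=%d\n", b_index_of(5, 6, 10, &bindex), bindex);
        return 0;
    }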

+ 200 - 0
thirdparty/libvpx/vp8/common/default_coef_probs.h

@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*Generated file, included by entropy.c*/
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] =
+{
+    { /* Block Type ( 0 ) */
+        { /* Coeff Band ( 0 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+            { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+            { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+            { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+            {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+            { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+            {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+            { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+            {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+            { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+            { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+            { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+            {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 1 ) */
+        { /* Coeff Band ( 0 )*/
+            { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+            { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+            {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+            { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+            {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+            {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+            {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+            { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+            {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+            {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+            {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+            { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+            {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+            { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+            {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 2 ) */
+        { /* Coeff Band ( 0 )*/
+            { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+            { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+            {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+            { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+            { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+            { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+            {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+            { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+            {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 3 ) */
+        { /* Coeff Band ( 0 )*/
+            { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+            { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+            {  61,  46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+            { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+            {  39,  77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+            { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+            {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+            { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+            {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+            { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+            {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+            { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+            {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+            { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+            {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    }
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_DEFAULT_COEF_PROBS_H_

+ 43 - 0
thirdparty/libvpx/vp8/common/dequantize.c

@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
+{
+    int i;
+    short *DQ  = d->dqcoeff;
+    short *Q   = d->qcoeff;
+
+    for (i = 0; i < 16; i++)
+    {
+        DQ[i] = Q[i] * DQC[i];
+    }
+}
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+
+    vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+
+    memset(input, 0, 32);
+
+}

+ 188 - 0
thirdparty/libvpx/vp8/common/entropy.c

@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, const unsigned char,
+                vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
+{
+    0,  1,  4,  8,
+    5,  2,  3,  6,
+    9, 12, 13, 10,
+    7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8, 13,
+    4,  9, 12, 14,
+   10, 11, 15, 16
+};
+
+/* vp8_default_zig_zag_mask generated with:
+
+    void vp8_init_scan_order_mask()
+    {
+        int i;
+
+        for (i = 0; i < 16; i++)
+        {
+            vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+        }
+
+    }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
+{
+     1,    2,    32,     64,
+     4,   16,   128,   4096,
+     8,  256,  2048,   8192,
+   512, 1024, 16384, -32768
+};
+
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
+{
+    -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
+    -ZERO_TOKEN, 4,                               /* 1 = ZERO */
+    -ONE_TOKEN, 6,                               /* 2 = ONE */
+    8, 12,                                      /* 3 = LOW_VAL */
+    -TWO_TOKEN, 10,                            /* 4 = TWO */
+    -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
+    14, 16,                                    /* 6 = HIGH_LOW */
+    -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
+    18, 20,                                   /* 8 = CAT_THREEFOUR */
+    -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
+    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
+};
+
+/* vp8_coef_encodings generated with:
+    vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+*/
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+{
+    {2, 2},
+    {6, 3},
+    {28, 5},
+    {58, 6},
+    {59, 6},
+    {60, 6},
+    {61, 6},
+    {124, 7},
+    {125, 7},
+    {126, 7},
+    {127, 7},
+    {0, 1}
+};
+
+/* Trees for extra bits.  Probabilities are constant and
+   do not depend on previously encoded bits */
+
+static const vp8_prob Pcat1[] = { 159};
+static const vp8_prob Pcat2[] = { 165, 145};
+static const vp8_prob Pcat3[] = { 173, 148, 140};
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp8_prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+
+/* tree index tables generated with:
+
+    void init_bit_tree(vp8_tree_index *p, int n)
+    {
+        int i = 0;
+
+        while (++i < n)
+        {
+            p[0] = p[1] = i << 1;
+            p += 2;
+        }
+
+        p[0] = p[1] = 0;
+    }
+
+    void init_bit_trees()
+    {
+        init_bit_tree(cat1, 1);
+        init_bit_tree(cat2, 2);
+        init_bit_tree(cat3, 3);
+        init_bit_tree(cat4, 4);
+        init_bit_tree(cat5, 5);
+        init_bit_tree(cat6, 11);
+    }
+*/
+
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+                                        14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
+
+const vp8_extra_bit_struct vp8_extra_bits[12] =
+{
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 1},
+    { 0, 0, 0, 2},
+    { 0, 0, 0, 3},
+    { 0, 0, 0, 4},
+    { cat1, Pcat1, 1, 5},
+    { cat2, Pcat2, 2, 7},
+    { cat3, Pcat3, 3, 11},
+    { cat4, Pcat4, 4, 19},
+    { cat5, Pcat5, 5, 35},
+    { cat6, Pcat6, 11, 67},
+    { 0, 0, 0, 0}
+};
+
+#include "default_coef_probs.h"
+
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+    memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
+}
+
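The vp8_tree_index arrays above all share one convention: entries come in pairs, a positive entry is the offset of the next pair, and a non-positive entry is a leaf holding the negated token. A minimal sketch of the walk, with the arithmetic decoder replaced by a plain bit array and a toy three-leaf tree (both assumptions of this example, not library data):

    #include <stdio.h>

    typedef int tree_index;                   /* stand-in for vp8_tree_index */
    enum { TOKEN_A = 0, TOKEN_B = 1, TOKEN_C = 2 };

    /* Same encoding as vp8_coef_tree: pairs of entries, positive = offset
     * of the next pair, non-positive = leaf storing -token. */
    static const tree_index toy_tree[4] = {
        -TOKEN_A, 2,          /* bit 0 -> A, bit 1 -> descend */
        -TOKEN_B, -TOKEN_C    /* bit 0 -> B, bit 1 -> C       */
    };

    /* Mirrors the shape of the decoder's tree read: one bit per node. */
    static int treed_read(const tree_index *t, const int *bits, int *pos)
    {
        tree_index i = 0;
        while ((i = t[i + bits[(*pos)++]]) > 0)
            ;
        return -i;
    }

    int main(void)
    {
        int bits[] = { 1, 1 };   /* descend right twice */
        int pos = 0;
        printf("token = %d\n", treed_read(toy_tree, bits, &pos));  /* 2 = TOKEN_C */
        return 0;
    }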

+ 109 - 0
thirdparty/libvpx/vp8/common/entropy.h

@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPY_H_
+#define VP8_COMMON_ENTROPY_H_
+
+#include "treecoder.h"
+#include "blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
+#define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
+#define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
+#define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
+#define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
+#define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct
+{
+    vp8_tree_p tree;
+    const vp8_prob *prob;
+    int Len;
+    int base_val;
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST   7
+
+#define MAX_PROB                255
+#define DCT_MAX_VALUE           2048
+
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+   position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is 3-valued measure of nearby complexity, that is,
+   the extent to which nearby coefficients are nonzero.  For the first
+   coefficient (DC, unless block type is 0), we look at the (already encoded)
+   blocks above and to the left of the current block.  The context index is
+   then the number (0, 1, or 2) of these blocks having nonzero coefficients.
+   After decoding a coefficient, the measure is roughly the size of the
+   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+   Note that the intuitive meaning of this measure changes as coefficients
+   are decoded, e.g., prior to the first token, a zero means that my neighbors
+   are empty while, after the first token, because of the use of end-of-block,
+   a zero means we just decoded a zero and hence guarantees that a non-zero
+   coefficient will appear later in this block.  However, this shift
+   in meaning is perfectly OK because our context depends also on the
+   coefficient band (and since zigzag positions 0, 1, and 2 are in
+   distinct bands). */
+
+/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
+#   define PREV_COEF_CONTEXTS       3
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
+
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPY_H_
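In practice, the two small index tables are what turn a coefficient position and the last decoded token into coordinates for the 4-D probability table. A sketch of the lookup; the table values are copied from entropy.c above, and the surrounding main() is illustrative only:

    #include <stdio.h>

    static const unsigned char coef_bands[16] =
        { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 };
    static const unsigned char prev_token_class[12] =
        { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0 };

    int main(void)
    {
        int pos = 4;          /* fifth coefficient in zigzag order */
        int prev_token = 1;   /* ONE_TOKEN was just decoded */

        /* Together with the block type, these select one row of
         * ENTROPY_NODES probabilities from
         * coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][...]. */
        printf("band=%d context=%d\n",
               coef_bands[pos], prev_token_class[prev_token]);   /* 6, 1 */
        return 0;
    }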

+ 171 - 0
thirdparty/libvpx/vp8/common/entropymode.c

@@ -0,0 +1,171 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
+{
+    int lez = (l->as_int == 0);
+    int aez = (a->as_int == 0);
+    int lea = (l->as_int == a->as_int);
+
+    if (lea && lez)
+        return SUBMVREF_LEFT_ABOVE_ZED;
+
+    if (lea)
+        return SUBMVREF_LEFT_ABOVE_SAME;
+
+    if (aez)
+        return SUBMVREF_ABOVE_ZED;
+
+    if (lez)
+        return SUBMVREF_LEFT_ZED;
+
+    return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
+
+const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
+{
+    { 147, 136, 18 },
+    { 106, 145, 1  },
+    { 179, 121, 1  },
+    { 223, 1  , 34 },
+    { 208, 1  , 1  }
+};
+
+
+
+const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
+{
+    {
+        0,  0,  0,  0,
+        0,  0,  0,  0,
+        1,  1,  1,  1,
+        1,  1,  1,  1,
+    },
+    {
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+    },
+    {
+        0,  0,  1,  1,
+        0,  0,  1,  1,
+        2,  2,  3,  3,
+        2,  2,  3,  3,
+    },
+    {
+        0,  1,  2,  3,
+        4,  5,  6,  7,
+        8,  9,  10, 11,
+        12, 13, 14, 15,
+    }
+};
+
+const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] =     /* INTRAMODECONTEXTNODE value */
+{
+    -B_DC_PRED, 2,                             /* 0 = DC_NODE */
+    -B_TM_PRED, 4,                            /* 1 = TM_NODE */
+    -B_VE_PRED, 6,                           /* 2 = VE_NODE */
+    8, 12,                                  /* 3 = COM_NODE */
+    -B_HE_PRED, 10,                        /* 4 = HE_NODE */
+    -B_RD_PRED, -B_VR_PRED,               /* 5 = RD_NODE */
+    -B_LD_PRED, 14,                        /* 6 = LD_NODE */
+    -B_VL_PRED, 16,                      /* 7 = VL_NODE */
+    -B_HD_PRED, -B_HU_PRED             /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+   explicitly-programmed predecessors. */
+
+const vp8_tree_index vp8_ymode_tree[8] =
+{
+    -DC_PRED, 2,
+    4, 6,
+    -V_PRED, -H_PRED,
+    -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] =
+{
+    -B_PRED, 2,
+    4, 6,
+    -DC_PRED, -V_PRED,
+    -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_uv_mode_tree[6] =
+{
+    -DC_PRED, 2,
+    -V_PRED, 4,
+    -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_mbsplit_tree[6] =
+{
+    -3, 2,
+    -2, 4,
+    -0, -1
+};
+
+const vp8_tree_index vp8_mv_ref_tree[8] =
+{
+    -ZEROMV, 2,
+    -NEARESTMV, 4,
+    -NEARMV, 6,
+    -NEWMV, -SPLITMV
+};
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] =
+{
+    -LEFT4X4, 2,
+    -ABOVE4X4, 4,
+    -ZERO4X4, -NEW4X4
+};
+
+const vp8_tree_index vp8_small_mvtree [14] =
+{
+    2, 8,
+    4, 6,
+    -0, -1,
+    -2, -3,
+    10, 12,
+    -4, -5,
+    -6, -7
+};
+
+void vp8_init_mbmode_probs(VP8_COMMON *x)
+{
+    memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+    memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+    memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{
+    memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
+
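Each vp8_mbsplit above assigns a partition id to each of the 16 4x4 blocks of a macroblock, so finding the motion vector that a given block uses is a flat table lookup. A sketch using a local copy of the first (top/bottom) split; main() is illustrative only:

    #include <stdio.h>

    /* Local copy of vp8_mbsplits[0]: top half = partition 0, bottom = 1. */
    static const int mbsplit_top_bottom[16] = {
        0, 0, 0, 0,
        0, 0, 0, 0,
        1, 1, 1, 1,
        1, 1, 1, 1
    };

    int main(void)
    {
        int b_row = 2, b_col = 1;   /* 4x4 block inside the macroblock */
        printf("partition = %d\n",
               mbsplit_top_bottom[b_row * 4 + b_col]);   /* 1 */
        return 0;
    }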

+ 88 - 0
thirdparty/libvpx/vp8/common/entropymode.h

@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMODE_H_
+#define VP8_COMMON_ENTROPYMODE_H_
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum
+{
+    SUBMVREF_NORMAL,
+    SUBMVREF_LEFT_ZED,
+    SUBMVREF_ABOVE_ZED,
+    SUBMVREF_LEFT_ABOVE_SAME,
+    SUBMVREF_LEFT_ABOVE_ZED
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
+
+
+extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
+
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index  vp8_ymode_tree[];
+extern const vp8_tree_index  vp8_kf_ymode_tree[];
+extern const vp8_tree_index  vp8_uv_mode_tree[];
+
+extern const vp8_tree_index  vp8_mbsplit_tree[];
+extern const vp8_tree_index  vp8_mv_ref_tree[];
+extern const vp8_tree_index  vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+[VP8_BINTRAMODES-1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
+void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPYMODE_H_

+ 49 - 0
thirdparty/libvpx/vp8/common/entropymv.c

@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
+const MV_CONTEXT vp8_mv_update_probs[2] =
+{
+    {{
+        237,
+        246,
+        253, 253, 254, 254, 254, 254, 254,
+        254, 254, 254, 254, 254, 250, 250, 252, 254, 254
+    }},
+    {{
+        231,
+        243,
+        245, 253, 254, 254, 254, 254, 254,
+        254, 254, 254, 254, 254, 251, 251, 254, 254, 254
+    }}
+};
+const MV_CONTEXT vp8_default_mv_context[2] =
+{
+    {{
+        /* row */
+        162,                                        /* is short */
+        128,                                        /* sign */
+        225, 146, 172, 147, 214,  39, 156,          /* short tree */
+        128, 129, 132,  75, 145, 178, 206, 239, 254, 254 /* long bits */
+    }},
+
+
+
+    {{
+        /* same for column */
+        164,                                        /* is short */
+        128,
+        204, 170, 119, 235, 140, 230, 228,
+        128, 130, 130,  74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+    }}
+};

+ 52 - 0
thirdparty/libvpx/vp8/common/entropymv.h

@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ENTROPYMV_H_
+#define VP8_COMMON_ENTROPYMV_H_
+
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum
+{
+    mv_max  = 1023,              /* max absolute value of a MV component */
+    MVvals = (2 * mv_max) + 1,   /* # possible values "" */
+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
+
+    mvlong_width = 10,       /* Large MVs have 10-bit magnitudes */
+    mvnum_short = 8,         /* magnitudes 0 through 7 */
+
+    /* probability offsets for coding each MV component */
+
+    mvpis_short = 0,         /* short (<= 7) vs long (>= 8) */
+    MVPsign,                /* sign for non-zero */
+    MVPshort,               /* 8 short values = 7-position tree */
+
+    MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+    MVPcount = MVPbits + mvlong_width    /* (with independent probabilities) */
+};
+
+typedef struct mv_context
+{
+    vp8_prob prob[MVPcount];  /* often come in row, col pairs */
+} MV_CONTEXT;
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ENTROPYMV_H_
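The enum doubles as a map of MV_CONTEXT.prob: one array of 19 probabilities per MV component. A sketch that mirrors the offsets (the enum values are recomputed locally so the example stands alone):

    #include <stdio.h>

    enum {
        mvnum_short  = 8,
        mvlong_width = 10,
        mvpis_short  = 0,                           /* short vs long   */
        MVPsign      = 1,                           /* sign            */
        MVPshort     = 2,                           /* 7-node tree     */
        MVPbits      = MVPshort + mvnum_short - 1,  /* = 9, long bits  */
        MVPcount     = MVPbits + mvlong_width       /* = 19 total      */
    };

    int main(void)
    {
        /* prob[0]: short/long, prob[1]: sign, prob[2..8]: short-magnitude
         * tree, prob[9..18]: one probability per long-magnitude bit. */
        printf("MVPbits=%d MVPcount=%d\n", MVPbits, MVPcount);
        return 0;
    }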

+ 188 - 0
thirdparty/libvpx/vp8/common/extend.c

@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane
+(
+    unsigned char *s, /* source */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
+    int h,            /* height */
+    int w,            /* width */
+    int et,           /* extend top border */
+    int el,           /* extend left border */
+    int eb,           /* extend bottom border */
+    int er            /* extend right border */
+)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+    int linesize;
+
+    /* copy the left and right most columns out */
+    src_ptr1 = s;
+    src_ptr2 = s + w - 1;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;
+
+    for (i = 0; i < h; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], el);
+        memcpy(dest_ptr1 + el, src_ptr1, w);
+        memset(dest_ptr2, src_ptr2[0], er);
+        src_ptr1  += sp;
+        src_ptr2  += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
+    }
+
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;
+
+    for (i = 0; i < et; i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, linesize);
+        dest_ptr1 += dp;
+    }
+
+    for (i = 0; i < eb; i++)
+    {
+        memcpy(dest_ptr2, src_ptr2, linesize);
+        dest_ptr2 += dp;
+    }
+}
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
+{
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);
+
+    et = dst->border >> 1;
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw)
+{
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
+    int src_y_offset = srcy * src->y_stride + srcx;
+    int dst_y_offset = srcy * dst->y_stride + srcx;
+    int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+    int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+    /* If the side is not touching the boundary then don't extend. */
+    if (srcy)
+      et = 0;
+    if (srcx)
+      el = 0;
+    if (srcy + srch != src->y_height)
+      eb = 0;
+    if (srcx + srcw != src->y_width)
+      er = 0;
+
+    copy_and_extend_plane(src->y_buffer + src_y_offset,
+                          src->y_stride,
+                          dst->y_buffer + dst_y_offset,
+                          dst->y_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+
+    et = (et + 1) >> 1;
+    el = (el + 1) >> 1;
+    eb = (eb + 1) >> 1;
+    er = (er + 1) >> 1;
+    srch = (srch + 1) >> 1;
+    srcw = (srcw + 1) >> 1;
+
+    copy_and_extend_plane(src->u_buffer + src_uv_offset,
+                          src->uv_stride,
+                          dst->u_buffer + dst_uv_offset,
+                          dst->uv_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer + src_uv_offset,
+                          src->uv_stride,
+                          dst->v_buffer + dst_uv_offset,
+                          dst->uv_stride,
+                          srch, srcw,
+                          et, el, eb, er);
+}
+
+
+/* note the extension is only for the last row, for intra prediction purposes */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+                       unsigned char *YPtr,
+                       unsigned char *UPtr,
+                       unsigned char *VPtr)
+{
+    int i;
+
+    YPtr += ybf->y_stride * 14;
+    UPtr += ybf->uv_stride * 6;
+    VPtr += ybf->uv_stride * 6;
+
+    for (i = 0; i < 4; i++)
+    {
+        YPtr[i] = YPtr[-1];
+        UPtr[i] = UPtr[-1];
+        VPtr[i] = VPtr[-1];
+    }
+
+    YPtr += ybf->y_stride;
+    UPtr += ybf->uv_stride;
+    VPtr += ybf->uv_stride;
+
+    for (i = 0; i < 4; i++)
+    {
+        YPtr[i] = YPtr[-1];
+        UPtr[i] = UPtr[-1];
+        VPtr[i] = VPtr[-1];
+    }
+}
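Per row, the border extension is just edge replication: memset the left and right borders with the outermost pixels, memcpy the row itself. A standalone sketch of that single-row step (buffer sizes are invented for the example):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned char row[2 + 4 + 2];        /* 2-pixel border each side */
        unsigned char *d = row + 2;
        const unsigned char src[4] = { 10, 20, 30, 40 };

        memset(d - 2, src[0], 2);   /* left border  = leftmost pixel  */
        memcpy(d, src, 4);          /* the row itself                 */
        memset(d + 4, src[3], 2);   /* right border = rightmost pixel */

        printf("%d %d %d\n", row[0], row[2], row[7]);   /* 10 10 40 */
        return 0;
    }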

+ 33 - 0
thirdparty/libvpx/vp8/common/extend.h

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_EXTEND_H_
+#define VP8_COMMON_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst,
+                                         int srcy, int srcx,
+                                         int srch, int srcw);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_EXTEND_H_

+ 493 - 0
thirdparty/libvpx/vp8/common/filter.c

@@ -0,0 +1,493 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "filter.h"
+#include "./vp8_rtcd.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+    { 128,   0 },
+    { 112,  16 },
+    {  96,  32 },
+    {  80,  48 },
+    {  64,  64 },
+    {  48,  80 },
+    {  32,  96 },
+    {  16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+    { 0,  0,  128,    0,   0,  0 },         /* note that the 1/8-pel positions are as per an alpha = -0.5 bicubic */
+    { 0, -6,  123,   12,  -1,  0 },
+    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
+    { 0, -9,   93,   50,  -6,  0 },
+    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
+    { 0, -6,   50,   93,  -9,  0 },
+    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
+    { 0, -1,   12,  123,  -6,  0 },
+};
+
+static void filter_block2d_first_pass
+(
+    unsigned char *src_ptr,
+    int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = Temp;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_width;
+    }
+}
+
+static void filter_block2d_second_pass
+(
+    int *src_ptr,
+    unsigned char *output_ptr,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = (unsigned char)Temp;
+            src_ptr++;
+        }
+
+        /* Start next row */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_pitch;
+    }
+}
+
+
+static void filter_block2d
+(
+    unsigned char  *src_ptr,
+    unsigned char  *output_ptr,
+    unsigned int src_pixels_per_line,
+    int output_pitch,
+    const short  *HFilter,
+    const short  *VFilter
+)
+{
+    int FData[9*4]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[21*24];   /* Temp data buffer used in filtering */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+    /* then filter vertically... */
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output for use by the second pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply bilinear filter */
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr += src_stride - width;
+        dst_ptr += width;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT16  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+    unsigned short *src_ptr,
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int  i, j;
+    int  Temp;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
+                   (VP8_FILTER_WEIGHT / 2);
+            dst_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+            src_ptr++;
+        }
+
+        /* Next row... */
+        dst_ptr += dst_pitch;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT16  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT16  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
+ *
+ *  OUTPUTS       : UINT8  *dst_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
+    unsigned int   dst_pitch,
+    const short   *HFilter,
+    const short   *VFilter,
+    int            Width,
+    int            Height
+)
+{
+
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically... */
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+#if 0
+    {
+        int i;
+        unsigned char temp1[16];
+        unsigned char temp2[16];
+
+        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+        for (i = 0; i < 16; i++)
+        {
+            if (temp1[i] != temp2[i])
+            {
+                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+            }
+        }
+    }
+#endif
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
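Both the 6-tap and bilinear paths share one fixed-point scheme: the taps sum to VP8_FILTER_WEIGHT (128), a half-weight rounding term is added, the sum is shifted right by VP8_FILTER_SHIFT (7), and the result is clamped to 0..255. A sketch of one output pixel of the half-pel 6-tap filter (source pixels are invented; the taps are vp8_sub_pel_filters[4] from above):

    #include <stdio.h>

    #define FILTER_WEIGHT 128   /* VP8_FILTER_WEIGHT */
    #define FILTER_SHIFT  7     /* VP8_FILTER_SHIFT  */

    int main(void)
    {
        const short taps[6] = { 3, -16, 77, 77, -16, 3 };   /* half-pel */
        const unsigned char src[6] = { 90, 100, 110, 120, 130, 140 };
        int k, temp = FILTER_WEIGHT >> 1;   /* rounding */

        for (k = 0; k < 6; k++)
            temp += src[k] * taps[k];

        temp >>= FILTER_SHIFT;
        if (temp < 0)   temp = 0;    /* clamp to 0..255 */
        if (temp > 255) temp = 255;

        printf("filtered = %d\n", temp);   /* 115 */
        return 0;
    }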

+ 32 - 0
thirdparty/libvpx/vp8/common/filter.h

@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FILTER_H_
+#define VP8_COMMON_FILTER_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
+extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_FILTER_H_

+ 193 - 0
thirdparty/libvpx/vp8/common/findnearmv.c

@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = {
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+   Note that we only consider one 4x4 subblock from each candidate 16x16
+   macroblock.   */
+void vp8_find_near_mvs
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv *nearest,
+    int_mv *nearby,
+    int_mv *best_mv,
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+)
+{
+    const MODE_INFO *above = here - xd->mode_info_stride;
+    const MODE_INFO *left = here - 1;
+    const MODE_INFO *aboveleft = above - 1;
+    int_mv            near_mvs[4];
+    int_mv           *mv = near_mvs;
+    int             *cntx = cnt;
+    enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+    /* Zero accumulators */
+    mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+    cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+    /* Process above */
+    if (above->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (above->mbmi.mv.as_int)
+        {
+            (++mv)->as_int = above->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
+            ++cntx;
+        }
+
+        *cntx += 2;
+    }
+
+    /* Process left */
+    if (left->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (left->mbmi.mv.as_int)
+        {
+            int_mv this_mv;
+
+            this_mv.as_int = left->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+            if (this_mv.as_int != mv->as_int)
+            {
+                (++mv)->as_int = this_mv.as_int;
+                ++cntx;
+            }
+
+            *cntx += 2;
+        }
+        else
+            cnt[CNT_INTRA] += 2;
+    }
+
+    /* Process above left */
+    if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+    {
+        if (aboveleft->mbmi.mv.as_int)
+        {
+            int_mv this_mv;
+
+            this_mv.as_int = aboveleft->mbmi.mv.as_int;
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+            if (this_mv.as_int != mv->as_int)
+            {
+                (++mv)->as_int = this_mv.as_int;
+                ++cntx;
+            }
+
+            *cntx += 1;
+        }
+        else
+            cnt[CNT_INTRA] += 1;
+    }
+
+    /* If we have three distinct MV's ... */
+    if (cnt[CNT_SPLITMV])
+    {
+        /* See if above-left MV can be merged with NEAREST */
+        if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+            cnt[CNT_NEAREST] += 1;
+    }
+
+    cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+                        + (left->mbmi.mode == SPLITMV)) * 2
+                       + (aboveleft->mbmi.mode == SPLITMV);
+
+    /* Swap near and nearest if necessary */
+    if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+    {
+        int tmp;
+        tmp = cnt[CNT_NEAREST];
+        cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+        cnt[CNT_NEAR] = tmp;
+        tmp = near_mvs[CNT_NEAREST].as_int;
+        near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+        near_mvs[CNT_NEAR].as_int = tmp;
+    }
+
+    /* Use near_mvs[0] to store the "best" MV */
+    if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+        near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+    /* Set up return values */
+    best_mv->as_int = near_mvs[0].as_int;
+    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+    nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+    inv->as_mv.row = src->as_mv.row * -1;
+    inv->as_mv.col = src->as_mv.col * -1;
+    vp8_clamp_mv2(inv, xd);
+    vp8_clamp_mv2(src, xd);
+}
+
+
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+)
+{
+    int sign_bias = ref_frame_sign_bias[refframe];
+
+    vp8_find_near_mvs(xd,
+                      here,
+                      &mode_mv_sb[sign_bias][NEARESTMV],
+                      &mode_mv_sb[sign_bias][NEARMV],
+                      &best_mv_sb[sign_bias],
+                      cnt,
+                      refframe,
+                      ref_frame_sign_bias);
+
+    invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+                         &mode_mv_sb[sign_bias][NEARESTMV], xd);
+    invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+                         &mode_mv_sb[sign_bias][NEARMV], xd);
+    invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
+                         &best_mv_sb[sign_bias], xd);
+
+    return sign_bias;
+}
+
+
+vp8_prob *vp8_mv_ref_probs(
+    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+    p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+    p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+    p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+    p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+    /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
+    return p;
+}
+
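
A worked example of the voting above: the above and left neighbours each carry a weight of 2 and the above-left a weight of 1; intra neighbours contribute no votes, while inter neighbours with a zero MV vote for slot CNT_INTRA, which doubles as the zero-MV slot. If above and left are inter blocks sharing the same nonzero MV and the above-left is inter with a zero MV, cnt[CNT_NEAREST] ends at 4 and cnt[CNT_INTRA] at 1, so best_mv becomes the shared MV, nearest returns it, and nearby stays at the zero MV.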

+ 195 - 0
thirdparty/libvpx/vp8/common/findnearmv.h

@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_FINDNEARMV_H_
+#define VP8_COMMON_FINDNEARMV_H_
+
+#include "./vpx_config.h"
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
+                           int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        mvp->as_mv.row *= -1;
+        mvp->as_mv.col *= -1;
+    }
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+    if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge,
+                                int mb_to_right_edge, int mb_to_top_edge,
+                                int mb_to_bottom_edge)
+{
+    mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+        mb_to_left_edge : mv->as_mv.col;
+    mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+        mb_to_right_edge : mv->as_mv.col;
+    mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+        mb_to_top_edge : mv->as_mv.row;
+    mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+        mb_to_bottom_edge : mv->as_mv.row;
+}
+static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+                                               int mb_to_right_edge,
+                                               int mb_to_top_edge,
+                                               int mb_to_bottom_edge)
+{
+    unsigned int need_to_clamp;
+    need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+    need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+    need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+    need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
+    return need_to_clamp;
+}
+
+void vp8_find_near_mvs
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv *nearest, int_mv *nearby, int_mv *best,
+    int near_mv_ref_cts[4],
+    int refframe,
+    int *ref_frame_sign_bias
+);
+
+
+int vp8_find_near_mvs_bias
+(
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    int_mv mode_mv_sb[2][MB_MODE_COUNT],
+    int_mv best_mv_sb[2],
+    int cnt[4],
+    int refframe,
+    int *ref_frame_sign_bias
+);
+
+
+vp8_prob *vp8_mv_ref_probs(
+    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;
+
+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 4;
+    }
+
+    return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+                                      int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 16;
+    }
+
+    return (cur_mb->bmi + (b - 4))->mv.as_int;
+}
+static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 3)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
+                                                 int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 12)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 4)->as_mode;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_FINDNEARMV_H_
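
A note on the clamping margins above: LEFT_TOP_MARGIN and RIGHT_BOTTOM_MARGIN are (16 << 3) = 128 in the 1/8-pel units used for the mb_to_*_edge deltas, i.e. 16 whole pixels. A clamped motion vector may therefore still point up to 16 pixels outside the visible frame, which is safe because the reconstruction buffers carry an extended border.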

+ 106 - 0
thirdparty/libvpx/vp8/common/generic/systemdependent.c

@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/systemdependent.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+#if _WIN32_WINNT >= 0x0501
+        SYSTEM_INFO sysinfo;
+        GetNativeSystemInfo(&sysinfo);
+#else
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+#endif
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#elif defined(__OS2__)
+    {
+        ULONG proc_id;
+        ULONG status;
+
+        core_count = 0;
+        for (proc_id = 1; ; proc_id++)
+        {
+            if (DosGetProcessorStatus(proc_id, &status))
+                break;
+
+            if (status == PROC_ONLINE)
+                core_count++;
+        }
+    }
+#else
+    /* other platforms */
+#endif
+
+    return core_count > 0 ? core_count : 1;
+}
+#endif
+
+void vp8_clear_system_state_c(void) {}
+void vp8_clear_system_state_c(void) {}
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#else
+    (void)ctx;
+#endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+    ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+    ctx->cpu_caps = x86_simd_caps();
+#endif
+}

+ 51 - 0
thirdparty/libvpx/vp8/common/header.h

@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_HEADER_H_
+#define VP8_COMMON_HEADER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 24 bits total */
+typedef struct
+{
+    unsigned int type: 1;
+    unsigned int version: 3;
+    unsigned int show_frame: 1;
+
+    /* Allow 2^20 bytes = 8 megabits for first partition */
+
+    unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+    unsigned int frame_number;
+    unsigned int update_gold: 1;
+    unsigned int uses_gold: 1;
+    unsigned int update_last: 1;
+    unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_HEADER_H_
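
Because C bitfield layout is implementation-defined, decoders assemble this 3-byte tag manually rather than reading a VP8_HEADER straight off the wire. A standalone sketch of the layout described above (bytes assembled least-significant first, per the VP8 bitstream; not the libvpx parsing code):

    #include <stdint.h>

    /* Sketch: unpack the 24-bit VP8 frame tag into its fields. */
    static void parse_frame_tag_sketch(const uint8_t d[3],
                                       unsigned *type, unsigned *version,
                                       unsigned *show_frame,
                                       unsigned *first_part_size)
    {
        uint32_t raw = d[0] | ((uint32_t)d[1] << 8) | ((uint32_t)d[2] << 16);
        *type            = raw & 1;        /* 0 = key frame */
        *version         = (raw >> 1) & 7;
        *show_frame      = (raw >> 4) & 1;
        *first_part_size = raw >> 5;       /* 19 bits, up to 2^19 - 1 bytes */
    }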

+ 90 - 0
thirdparty/libvpx/vp8/common/idct_blk.c

@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+                            unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride);
+
+void vp8_dequant_idct_add_y_block_c
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dst, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+                memset(q, 0, 2 * sizeof(q[0]));
+            }
+
+            q   += 16;
+            dst += 4;
+        }
+
+        dst += 4*stride - 16;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstu, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+                memset(q, 0, 2 * sizeof(q[0]));
+            }
+
+            q    += 16;
+            dstu += 4;
+        }
+
+        dstu += 4*stride - 8;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1)
+                vp8_dequant_idct_add_c (q, dq, dstv, stride);
+            else
+            {
+                vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+                memset(q, 0, 2 * sizeof(q[0]));
+            }
+
+            q    += 16;
+            dstv += 4;
+        }
+
+        dstv += 4*stride - 8;
+    }
+}
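
The eobs values consumed above are per-4x4-subblock end-of-block indices: 0 or 1 means at most the DC coefficient is present, so the cheap DC-only add is taken and the touched coefficients are cleared for the next block, while anything larger goes through the full dequantize-and-IDCT path, which zeroes the coefficient block itself.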

+ 205 - 0
thirdparty/libvpx/vp8/common/idctllm.c

@@ -0,0 +1,205 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of a 16-bit fixed-point version of two multiply
+ * constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use the trick
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2      = 35468;
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    int i;
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[2];
+        b1 = ip[0] - ip[2];
+
+        temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+        temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+        temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+        d1 = temp1 + temp2;
+
+
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        ip += shortpitch;
+        op += shortpitch;
+    }
+
+    ip = output;
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = ip[c] + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+        ip += 4;
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+                            int pred_stride, unsigned char *dst_ptr,
+                            int dst_stride)
+{
+    int a1 = ((input_dc + 4) >> 3);
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = a1 + pred_ptr[c] ;
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dst_ptr[c] = (unsigned char) a ;
+        }
+
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+    short output[16];
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0] = a1 + b1;
+        op[4] = c1 + d1;
+        op[8] = a1 - b1;
+        op[12] = d1 - c1;
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[3];
+        b1 = ip[1] + ip[2];
+        c1 = ip[1] - ip[2];
+        d1 = ip[0] - ip[3];
+
+        a2 = a1 + b1;
+        b2 = c1 + d1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        op[0] = (a2 + 3) >> 3;
+        op[1] = (b2 + 3) >> 3;
+        op[2] = (c2 + 3) >> 3;
+        op[3] = (d2 + 3) >> 3;
+
+        ip += 4;
+        op += 4;
+    }
+
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = output[i];
+    }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+    int i;
+    int a1;
+
+    a1 = ((input[0] + 3) >> 3);
+    for(i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = a1;
+    }
+}
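
As a sanity check on the two Q16 constants above: sqrt(2)*cos(pi/8) is about 1.3065630 and sqrt(2)*sin(pi/8) about 0.5411961, so round((1.3065630 - 1) * 65536) = 20091 and round(0.5411961 * 65536) = 35468. A throwaway verification, not part of the library (assumes the common M_PI extension from <math.h>):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double c = sqrt(2.0) * cos(M_PI / 8.0);  /* ~1.3065630 */
        double s = sqrt(2.0) * sin(M_PI / 8.0);  /* ~0.5411961 */
        /* %.0f rounds to nearest: prints "20091 35468" */
        printf("%.0f %.0f\n", (c - 1.0) * 65536.0, s * 65536.0);
        return 0;
    }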

+ 70 - 0
thirdparty/libvpx/vp8/common/invtrans.h

@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_INVTRANS_H_
+#define VP8_COMMON_INVTRANS_H_
+
+#include "./vpx_config.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* eob adjust: the idct can only be skipped if both the dc and the eob are zero */
+    int js;
+    for(js = 0; js < 16; js++)
+    {
+        if((eobs[js] == 0) && (diff[0] != 0))
+            eobs[js]++;
+        diff+=16;
+    }
+}
+
+static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+{
+    short *DQC = xd->dequant_y1;
+
+    if (xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1)
+        {
+            vp8_short_inv_walsh4x4
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        else
+        {
+            vp8_short_inv_walsh4x4_1
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        eob_adjust(xd->eobs, xd->qcoeff);
+
+        DQC = xd->dequant_y1_dc;
+    }
+    vp8_dequant_idct_add_y_block
+                    (xd->qcoeff, DQC,
+                     xd->dst.y_buffer,
+                     xd->dst.y_stride, xd->eobs);
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_INVTRANS_H_

+ 113 - 0
thirdparty/libvpx/vp8/common/loopfilter.h

@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_LOOPFILTER_H_
+#define VP8_COMMON_LOOPFILTER_H_
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER             63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION      8
+
+typedef enum
+{
+    NORMAL_LOOPFILTER = 0,
+    SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct
+{
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+    unsigned char lvl[4][4][4];
+    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+    unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{
+    const unsigned char * mblim;
+    const unsigned char * blim;
+    const unsigned char * lim;
+    const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+    unsigned char *u,   /* source pointer */
+    int p,              /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+                           int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_LOOPFILTER_H_

+ 430 - 0
thirdparty/libvpx/vp8/common/loopfilter_filters.c

@@ -0,0 +1,430 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static signed char vp8_signed_char_clamp(int t)
+{
+    t = (t < -128 ? -128 : t);
+    t = (t > 127 ? 127 : t);
+    return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+                            uc p3, uc p2, uc p1, uc p0,
+                            uc q0, uc q1, uc q2, uc q3)
+{
+    signed char mask = 0;
+    mask |= (abs(p3 - p2) > limit);
+    mask |= (abs(p2 - p1) > limit);
+    mask |= (abs(p1 - p0) > limit);
+    mask |= (abs(q1 - q0) > limit);
+    mask |= (abs(q2 - q1) > limit);
+    mask |= (abs(q3 - q2) > limit);
+    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit);
+    return mask - 1;
+}
+
+/* is there a high-variance internal edge? ( 11111111 yes, 00000000 no) */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+    signed char hev = 0;
+    hev  |= (abs(p1 - p0) > thresh) * -1;
+    hev  |= (abs(q1 - q0) > thresh) * -1;
+    return hev;
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+        uc *op0, uc *oq0, uc *oq1)
+
+{
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char filter_value, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value &= hev;
+
+    /* inner taps */
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3;
+     * if it equals 4 we adjust by -1 to account for the fact that we'd
+     * round 3 the other way
+     */
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    u = vp8_signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    filter_value = Filter1;
+
+    /* outer tap adjustments */
+    filter_value += 1;
+    filter_value >>= 1;
+    filter_value &= ~hev;
+
+    u = vp8_signed_char_clamp(qs1 - filter_value);
+    *oq1 = u ^ 0x80;
+    u = vp8_signed_char_clamp(ps1 + filter_value);
+    *op1 = u ^ 0x80;
+
+}
+void vp8_loop_filter_horizontal_edge_c
+(
+    unsigned char *s,
+    int p, /* pitch */
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    int  hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+    do
+    {
+        mask = vp8_filter_mask(limit[0], blimit[0],
+                               s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+                               s[0*p], s[1*p], s[2*p], s[3*p]);
+
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+        vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+        ++s;
+    }
+    while (++i < count * 8);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    int  hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+    do
+    {
+        mask = vp8_filter_mask(limit[0], blimit[0],
+                               s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+        vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+        s += p;
+    }
+    while (++i < count * 8);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev,
+                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{
+    signed char s, u;
+    signed char filter_value, Filter1, Filter2;
+    signed char ps2 = (signed char) * op2 ^ 0x80;
+    signed char ps1 = (signed char) * op1 ^ 0x80;
+    signed char ps0 = (signed char) * op0 ^ 0x80;
+    signed char qs0 = (signed char) * oq0 ^ 0x80;
+    signed char qs1 = (signed char) * oq1 ^ 0x80;
+    signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter_value = vp8_signed_char_clamp(ps1 - qs1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0));
+    filter_value &= mask;
+
+    Filter2 = filter_value;
+    Filter2 &= hev;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+    Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+    qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+    ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+    /* only apply wider filter if not high edge variance */
+    filter_value &= ~hev;
+    Filter2 = filter_value;
+
+    /* roughly 3/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+    s = vp8_signed_char_clamp(qs0 - u);
+    *oq0 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps0 + u);
+    *op0 = s ^ 0x80;
+
+    /* roughly 2/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+    s = vp8_signed_char_clamp(qs1 - u);
+    *oq1 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps1 + u);
+    *op1 = s ^ 0x80;
+
+    /* roughly 1/7th difference across boundary */
+    u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+    s = vp8_signed_char_clamp(qs2 - u);
+    *oq2 = s ^ 0x80;
+    s = vp8_signed_char_clamp(ps2 + u);
+    *op2 = s ^ 0x80;
+}
+
+void vp8_mbloop_filter_horizontal_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    signed char hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+    do
+    {
+
+        mask = vp8_filter_mask(limit[0], blimit[0],
+                               s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+                               s[0*p], s[1*p], s[2*p], s[3*p]);
+
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+        vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
+
+        ++s;
+    }
+    while (++i < count * 8);
+
+}
+
+
+void vp8_mbloop_filter_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
+    int count
+)
+{
+    signed char hev = 0; /* high edge variance */
+    signed char mask = 0;
+    int i = 0;
+
+    do
+    {
+
+        mask = vp8_filter_mask(limit[0], blimit[0],
+                               s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+        vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+
+        s += p;
+    }
+    while (++i < count * 8);
+
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+{
+/* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ *  (void) limit;
+ */
+    signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
+    return mask;
+}
+
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+    signed char filter_value, Filter1, Filter2;
+    signed char p1 = (signed char) * op1 ^ 0x80;
+    signed char p0 = (signed char) * op0 ^ 0x80;
+    signed char q0 = (signed char) * oq0 ^ 0x80;
+    signed char q1 = (signed char) * oq1 ^ 0x80;
+    signed char u;
+
+    filter_value = vp8_signed_char_clamp(p1 - q1);
+    filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0));
+    filter_value &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    Filter1 = vp8_signed_char_clamp(filter_value + 4);
+    Filter1 >>= 3;
+    u = vp8_signed_char_clamp(q0 - Filter1);
+    *oq0  = u ^ 0x80;
+
+    Filter2 = vp8_signed_char_clamp(filter_value + 3);
+    Filter2 >>= 3;
+    u = vp8_signed_char_clamp(p0 + Filter2);
+    *op0 = u ^ 0x80;
+}
+
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit
+)
+{
+    signed char mask = 0;
+    int i = 0;
+
+    do
+    {
+        mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
+        ++s;
+    }
+    while (++i < 16);
+}
+
+void vp8_loop_filter_simple_vertical_edge_c
+(
+    unsigned char *s,
+    int p,
+    const unsigned char *blimit
+)
+{
+    signed char mask = 0;
+    int i = 0;
+
+    do
+    {
+        mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+        vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
+        s += p;
+    }
+    while (++i < 16);
+
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
+{
+    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
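
The "roughly 3/7th / 2/7th / 1/7th" comments in vp8_mbfilter refer to the total change across the edge: the correction u is applied with opposite signs on the two sides, so the net shifts are 2*27/128, 2*18/128, and 2*9/128, close to 3/7, 2/7, and 1/7. The (63 + Filter2 * k) >> 7 form is integer rounding of Filter2 * k / 128, as this standalone illustration shows:

    #include <stdio.h>

    int main(void)
    {
        int F = 40;  /* an example clamped filter value */
        printf("%d %d %d\n",
               (63 + F * 27) >> 7,   /* 8  ~ round(40 * 27 / 128.0) */
               (63 + F * 18) >> 7,   /* 6  ~ round(40 * 18 / 128.0) */
               (63 + F * 9)  >> 7);  /* 3  ~ round(40 * 9  / 128.0) */
        return 0;
    }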

+ 68 - 0
thirdparty/libvpx/vp8/common/mbpitch.c

@@ -0,0 +1,68 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x)
+{
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+        }
+    }
+
+    for (r = 0; r < 2; r++)
+    {
+        for (c = 0; c < 2; c++)
+        {
+            x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+
+        }
+    }
+
+    for (r = 0; r < 2; r++)
+    {
+        for (c = 0; c < 2; c++)
+        {
+            x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+
+        }
+    }
+
+    for (r = 0; r < 25; r++)
+    {
+        x->block[r].qcoeff  = x->qcoeff  + r * 16;
+        x->block[r].dqcoeff = x->dqcoeff + r * 16;
+        x->block[r].eob     = x->eobs + r;
+    }
+}
+
+void vp8_build_block_doffsets(MACROBLOCKD *x)
+{
+    int block;
+
+    for (block = 0; block < 16; block++) /* y blocks */
+    {
+        x->block[block].offset =
+            (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
+    }
+
+    for (block = 16; block < 20; block++) /* U and V blocks */
+    {
+        x->block[block+4].offset =
+        x->block[block].offset =
+            ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
+    }
+}

+ 40 - 0
thirdparty/libvpx/vp8/common/modecont.c

@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] =
+{
+    {
+        /* 0 */
+        7,     1,     1,   143,
+    },
+    {
+        /* 1 */
+        14,    18,    14,   107,
+    },
+    {
+        /* 2 */
+        135,    64,    57,    68,
+    },
+    {
+        /* 3 */
+        60,    56,   128,    65,
+    },
+    {
+        /* 4 */
+        159,   134,   128,    34,
+    },
+    {
+        /* 5 */
+        234,   188,   128,    28,
+    },
+};

+ 25 - 0
thirdparty/libvpx/vp8/common/modecont.h

@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MODECONT_H_
+#define VP8_COMMON_MODECONT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const int vp8_mode_contexts[6][4];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_MODECONT_H_

+ 36 - 0
thirdparty/libvpx/vp8/common/mv.h

@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_MV_H_
+#define VP8_COMMON_MV_H_
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct
+{
+    short row;
+    short col;
+} MV;
+
+typedef union int_mv
+{
+    uint32_t  as_int;
+    MV        as_mv;
+} int_mv;        /* facilitates faster equality tests and copies */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_MV_H_
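
The union overlays the two 16-bit components on one 32-bit word, so the MV equality tests and copies used heavily in findnearmv.c compile down to single integer operations. A standalone illustration (types renamed so they do not clash with the real headers):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { short row, col; } mv_sketch;
    typedef union { uint32_t as_int; mv_sketch as_mv; } int_mv_sketch;

    int main(void)
    {
        int_mv_sketch a, b;
        a.as_mv.row = -3;
        a.as_mv.col = 7;
        b.as_int = a.as_int;                  /* one 32-bit copy moves both */
        printf("%d\n", a.as_int == b.as_int); /* one-compare equality: 1 */
        return 0;
    }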

+ 185 - 0
thirdparty/libvpx/vp8/common/onyxc_int.h

@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXC_INT_H_
+#define VP8_COMMON_ONYXC_INT_H_
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts
+{
+    vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+    vp8_prob ymode_prob [VP8_YMODES-1];   /* interframe intra mode probs */
+    vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+    vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+    vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+    MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum
+{
+    ONE_PARTITION  = 0,
+    TWO_PARTITION  = 1,
+    FOUR_PARTITION = 2,
+    EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+    RECON_CLAMP_REQUIRED        = 0,
+    RECON_CLAMP_NOTREQUIRED     = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common
+
+{
+    struct vpx_internal_error_info  error;
+
+    DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+    DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+    int Width;
+    int Height;
+    int horiz_scale;
+    int vert_scale;
+
+    CLAMP_TYPE  clamp_type;
+
+    YV12_BUFFER_CONFIG *frame_to_show;
+
+    YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+    int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+    int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+    YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+    YV12_BUFFER_CONFIG post_proc_buffer;
+    YV12_BUFFER_CONFIG post_proc_buffer_int;
+    int post_proc_buffer_int_used;
+    unsigned char *pp_limits_buffer;   /* post-processing filter coefficients */
+#endif
+
+    FRAME_TYPE last_frame_type;  /* Save last frame's frame type for motion search. */
+    FRAME_TYPE frame_type;
+
+    int show_frame;
+
+    int frame_flags;
+    int MBs;
+    int mb_rows;
+    int mb_cols;
+    int mode_info_stride;
+
+    /* profile settings */
+    int mb_no_coeff_skip;
+    int no_lpf;
+    int use_bilinear_mc_filter;
+    int full_pixel;
+
+    int base_qindex;
+
+    int y1dc_delta_q;
+    int y2dc_delta_q;
+    int y2ac_delta_q;
+    int uvdc_delta_q;
+    int uvac_delta_q;
+
+    /* We allocate a MODE_INFO struct for each macroblock, together with
+       an extra row on top and column on the left to simplify prediction. */
+
+    MODE_INFO *mip; /* Base of allocated array */
+    MODE_INFO *mi;  /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+    MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+    MODE_INFO *prev_mi;  /* 'mi' from last frame (points into prev_mip) */
+#endif
+    MODE_INFO *show_frame_mi;  /* MODE_INFO for the last decoded frame
+                                  to show */
+    LOOPFILTERTYPE filter_type;
+
+    loop_filter_info_n lf_info;
+
+    int filter_level;
+    int last_sharpness_level;
+    int sharpness_level;
+
+    int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */
+    int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */
+    int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */
+
+    int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */
+    int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+    int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+
+    int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
+
+    /* Y,U,V,Y2 */
+    ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
+    ENTROPY_CONTEXT_PLANES left_context;  /* (up to) 4 contexts "" */
+
+    FRAME_CONTEXT lfc; /* last frame entropy */
+    FRAME_CONTEXT fc;  /* this frame entropy */
+
+    unsigned int current_video_frame;
+
+    int version;
+
+    TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+    VP8_HEADER oh;
+#endif
+#if CONFIG_POSTPROC_VISUALIZER
+    double bitrate;
+    double framerate;
+#endif
+
+#if CONFIG_MULTITHREAD
+    int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+    struct postproc_state  postproc_state;
+#endif
+    int cpu_caps;
+} VP8_COMMON;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_ONYXC_INT_H_

+ 63 - 0
thirdparty/libvpx/vp8/common/onyxd.h

@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_ONYXD_H_
+#define VP8_COMMON_ONYXD_H_
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+    struct VP8D_COMP;
+
+    typedef struct
+    {
+        int     Width;
+        int     Height;
+        int     Version;
+        int     postprocess;
+        int     max_threads;
+        int     error_concealment;
+    } VP8D_CONFIG;
+
+    typedef enum
+    {
+        VP8D_OK = 0
+    } VP8D_SETTING;
+
+    void vp8dx_initialize(void);
+
+    void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
+
+    int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
+
+    int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
+                                      size_t size, const uint8_t *dest,
+                                      int64_t time_stamp);
+    int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+
+    vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif  // VP8_COMMON_ONYXD_H_

+ 49 - 0
thirdparty/libvpx/vp8/common/ppflags.h

@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_PPFLAGS_H_
+#define VP8_COMMON_PPFLAGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+enum
+{
+    VP8D_NOFILTERING            = 0,
+    VP8D_DEBLOCK                = 1<<0,
+    VP8D_DEMACROBLOCK           = 1<<1,
+    VP8D_ADDNOISE               = 1<<2,
+    VP8D_DEBUG_TXT_FRAME_INFO   = 1<<3,
+    VP8D_DEBUG_TXT_MBLK_MODES   = 1<<4,
+    VP8D_DEBUG_TXT_DC_DIFF      = 1<<5,
+    VP8D_DEBUG_TXT_RATE_INFO    = 1<<6,
+    VP8D_DEBUG_DRAW_MV          = 1<<7,
+    VP8D_DEBUG_CLR_BLK_MODES    = 1<<8,
+    VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+    VP8D_MFQE                   = 1<<10
+};
+
+typedef struct
+{
+    int post_proc_flag;
+    int deblocking_level;
+    int noise_level;
+    int display_ref_frame_flag;
+    int display_mb_modes_flag;
+    int display_b_modes_flag;
+    int display_mv_flag;
+} vp8_ppflags_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_PPFLAGS_H_

+ 135 - 0
thirdparty/libvpx/vp8/common/quant_common.c

@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =
+{
+    4,    5,    6,    7,    8,    9,   10,   10,   11,   12,   13,   14,   15,   16,   17,   17,
+    18,   19,   20,   20,   21,   21,   22,   22,   23,   23,   24,   25,   25,   26,   27,   28,
+    29,   30,   31,   32,   33,   34,   35,   36,   37,   37,   38,   39,   40,   41,   42,   43,
+    44,   45,   46,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,
+    59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,   72,   73,   74,
+    75,   76,   76,   77,   78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,   89,
+    91,   93,   95,   96,   98,  100,  101,  102,  104,  106,  108,  110,  112,  114,  116,  118,
+    122,  124,  126,  128,  130,  132,  134,  136,  138,  140,  143,  145,  148,  151,  154,  157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =
+{
+    4,    5,    6,    7,    8,    9,   10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
+    20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,   34,   35,
+    36,   37,   38,   39,   40,   41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51,
+    52,   53,   54,   55,   56,   57,   58,   60,   62,   64,   66,   68,   70,   72,   74,   76,
+    78,   80,   82,   84,   86,   88,   90,   92,   94,   96,   98,  100,  102,  104,  106,  108,
+    110,  112,  114,  116,  119,  122,  125,  128,  131,  134,  137,  140,  143,  146,  149,  152,
+    155,  158,  161,  164,  167,  170,  173,  177,  181,  185,  189,  193,  197,  201,  205,  209,
+    213,  217,  221,  225,  229,  234,  239,  245,  249,  254,  259,  264,  269,  274,  279,  284,
+};
+
+
+int vp8_dc_quant(int QIndex, int Delta)
+{
+    int retval;
+
+    QIndex = QIndex + Delta;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    retval = dc_qlookup[ QIndex ];
+    return retval;
+}
+
+int vp8_dc2quant(int QIndex, int Delta)
+{
+    int retval;
+
+    QIndex = QIndex + Delta;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    retval = dc_qlookup[ QIndex ] * 2;
+    return retval;
+}
+
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+    int retval;
+
+    QIndex = QIndex + Delta;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    retval = dc_qlookup[ QIndex ];
+
+    if (retval > 132)
+        retval = 132;
+
+    return retval;
+}
+
+int vp8_ac_yquant(int QIndex)
+{
+    int retval;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    retval = ac_qlookup[ QIndex ];
+    return retval;
+}
+
+int vp8_ac2quant(int QIndex, int Delta)
+{
+    int retval;
+
+    QIndex = QIndex + Delta;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+     * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+     * word size. */
+    retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
+
+    if (retval < 8)
+        retval = 8;
+
+    return retval;
+}
+
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+    int retval;
+
+    QIndex = QIndex + Delta;
+
+    if (QIndex > 127)
+        QIndex = 127;
+    else if (QIndex < 0)
+        QIndex = 0;
+
+    retval = ac_qlookup[ QIndex ];
+    return retval;
+}

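The fixed-point comment in vp8_ac2quant is exhaustively checkable: the largest ac_qlookup entry is 284, and for every x in that range both constants reproduce integer x*155/100. A standalone sanity check (not part of the library):

    #include <assert.h>

    /* Verify the claim in vp8_ac2quant for the whole table range. */
    static void check_155_percent_identity(void)
    {
        int x;
        for (x = 0; x <= 284; x++)
        {
            assert((x * 155) / 100 == (x * 101581) >> 16);
            assert((x * 155) / 100 == (x * 6349) >> 12);
        }
    }
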
+ 34 - 0
thirdparty/libvpx/vp8/common/quant_common.h

@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_QUANT_COMMON_H_
+#define VP8_COMMON_QUANT_COMMON_H_
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_QUANT_COMMON_H_

+ 544 - 0
thirdparty/libvpx/vp8/common/reconinter.c

@@ -0,0 +1,544 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp8_copy_mem16x16_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    int r;
+
+    for (r = 0; r < 16; r++)
+    {
+        memcpy(dst, src, 16);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem8x8_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    int r;
+
+    for (r = 0; r < 8; r++)
+    {
+        memcpy(dst, src, 8);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem8x4_c(
+    unsigned char *src,
+    int src_stride,
+    unsigned char *dst,
+    int dst_stride)
+{
+    int r;
+
+    for (r = 0; r < 4; r++)
+    {
+        memcpy(dst, src, 8);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+    int r;
+    unsigned char *pred_ptr = d->predictor;
+    unsigned char *ptr;
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+    {
+        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+    }
+    else
+    {
+        for (r = 0; r < 4; r++)
+        {
+            pred_ptr[0]  = ptr[0];
+            pred_ptr[1]  = ptr[1];
+            pred_ptr[2]  = ptr[2];
+            pred_ptr[3]  = ptr[3];
+            pred_ptr     += pitch;
+            ptr         += pre_stride;
+        }
+    }
+}
+
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+    unsigned char *ptr;
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+    {
+        x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+    }
+    else
+    {
+        vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
+    }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+    unsigned char *ptr;
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+    {
+        x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+    }
+    else
+    {
+        vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
+    }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+    int r;
+    unsigned char *ptr;
+    ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+    {
+        sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+    }
+    else
+    {
+        for (r = 0; r < 4; r++)
+        {
+          dst[0]  = ptr[0];
+          dst[1]  = ptr[1];
+          dst[2]  = ptr[2];
+          dst[3]  = ptr[3];
+          dst     += dst_stride;
+          ptr     += pre_stride;
+        }
+    }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
+{
+    unsigned char *uptr, *vptr;
+    unsigned char *upred_ptr = &x->predictor[256];
+    unsigned char *vpred_ptr = &x->predictor[320];
+
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+    int offset;
+    int pre_stride = x->pre.uv_stride;
+
+    /* calc uv motion vectors */
+    mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+    mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+    mv_row /= 2;
+    mv_col /= 2;
+    mv_row &= x->fullpixel_mask;
+    mv_col &= x->fullpixel_mask;
+
+    offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
+
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+        x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+    }
+    else
+    {
+        vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+        vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+    }
+}
+
+/*encoder only*/
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+    int i, j;
+    int pre_stride = x->pre.uv_stride;
+    unsigned char *base_pre;
+
+    /* build uv mvs */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            int yoffset = i * 8 + j * 2;
+            int uoffset = 16 + i * 2 + j;
+            int voffset = 20 + i * 2 + j;
+
+            int temp;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.row
+                   + x->block[yoffset+1].bmi.mv.as_mv.row
+                   + x->block[yoffset+4].bmi.mv.as_mv.row
+                   + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.col
+                   + x->block[yoffset+1].bmi.mv.as_mv.col
+                   + x->block[yoffset+4].bmi.mv.as_mv.col
+                   + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+            x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+        }
+    }
+
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+        }
+    }
+
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+        }
+    }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                         unsigned char *dst_y,
+                                         int dst_ystride)
+{
+    unsigned char *ptr_base;
+    unsigned char *ptr;
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+    int pre_stride = x->pre.y_stride;
+
+    ptr_base = x->pre.y_buffer;
+    ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+    if ((mv_row | mv_col) & 7)
+    {
+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+                                 dst_y, dst_ystride);
+    }
+    else
+    {
+        vp8_copy_mem16x16(ptr, pre_stride, dst_y,
+            dst_ystride);
+    }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    /* If the MV points so far into the UMV border that no visible pixels
+     * are used for reconstruction, the subpel part of the MV can be
+     * discarded and the MV limited to 16 pixels with equivalent results.
+     *
+     * This limit kicks in at 19 pixels for the top and left edges, for
+     * the 16 pixels plus 3 taps right of the central pixel when subpel
+     * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+     * left of the central pixel when filtering.
+     */
+    if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
+        mv->col = xd->mb_to_left_edge - (16 << 3);
+    else if (mv->col > xd->mb_to_right_edge + (18 << 3))
+        mv->col = xd->mb_to_right_edge + (16 << 3);
+
+    if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
+        mv->row = xd->mb_to_top_edge - (16 << 3);
+    else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
+        mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+    mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ?
+        (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+    mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ?
+        (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+    mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ?
+        (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+    mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ?
+        (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride)
+{
+    int offset;
+    unsigned char *ptr;
+    unsigned char *uptr, *vptr;
+
+    int_mv _16x16mv;
+
+    unsigned char *ptr_base = x->pre.y_buffer;
+    int pre_stride = x->pre.y_stride;
+
+    _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
+
+    if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+    {
+        clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+    }
+
+    ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+
+    if ( _16x16mv.as_int & 0x00070007)
+    {
+        x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+    }
+    else
+    {
+        vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+    }
+
+    /* calc uv motion vectors */
+    _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+    _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+    _16x16mv.as_mv.row /= 2;
+    _16x16mv.as_mv.col /= 2;
+    _16x16mv.as_mv.row &= x->fullpixel_mask;
+    _16x16mv.as_mv.col &= x->fullpixel_mask;
+
+    pre_stride >>= 1;
+    offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
+
+    if ( _16x16mv.as_int & 0x00070007)
+    {
+        x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+        x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7,  _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+    }
+    else
+    {
+        vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+        vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+    }
+}
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+    int i;
+    unsigned char *base_dst = x->dst.y_buffer;
+    unsigned char *base_pre = x->pre.y_buffer;
+
+    if (x->mode_info_context->mbmi.partitioning < 3)
+    {
+        BLOCKD *b;
+        int dst_stride = x->dst.y_stride;
+
+        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+        x->block[10].bmi = x->mode_info_context->bmi[10];
+        if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+        {
+            clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
+            clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+        }
+
+        b = &x->block[ 0];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[ 2];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[ 8];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+        b = &x->block[10];
+        build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+    }
+    else
+    {
+        for (i = 0; i < 16; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+            int dst_stride = x->dst.y_stride;
+
+            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+            if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+            {
+                clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
+                clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
+            }
+
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+            else
+            {
+                build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+                build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            }
+        }
+    }
+
+    base_dst = x->dst.u_buffer;
+    base_pre = x->pre.u_buffer;
+    for (i = 16; i < 20; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+        int dst_stride = x->dst.uv_stride;
+
+        /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+        else
+        {
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+        }
+    }
+
+    base_dst = x->dst.v_buffer;
+    base_pre = x->pre.v_buffer;
+    for (i = 20; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+        int dst_stride = x->dst.uv_stride;
+
+        /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+        else
+        {
+            build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+            build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+        }
+    }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *x)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            int yoffset = i * 8 + j * 2;
+            int uoffset = 16 + i * 2 + j;
+            int voffset = 20 + i * 2 + j;
+
+            int temp;
+
+            temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+            temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+            temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8);
+
+            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+            if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+                clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
+
+            x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+        }
+    }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
+{
+    if (xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+                                           xd->dst.u_buffer, xd->dst.v_buffer,
+                                           xd->dst.y_stride, xd->dst.uv_stride);
+    }
+    else
+    {
+        build_4x4uvmvs(xd);
+        build_inter4x4_predictors_mb(xd);
+    }
+}

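The chroma-MV arithmetic above uses two branchless rounding idioms worth unpacking: adding `1 | (mv >> (bits-1))` before a truncating /2 rounds the halved MV away from zero, and adding `4 + ((sum >> (bits-1)) * 8)` before /8 does the same for the four-way average in build_4x4uvmvs(). A standalone illustration:

    #include <limits.h>

    /* Round-away-from-zero halving, as in the 16x16 chroma-MV derivation:
     * the OR contributes +1 when mv >= 0 and -1 when mv < 0, so
     * 5 -> 3, -5 -> -3, while even values stay exact (4 -> 2, -4 -> -2). */
    static int halve_mv_rounded(int mv)
    {
        mv += 1 | (mv >> (sizeof(mv) * CHAR_BIT - 1));
        return mv / 2;
    }

    /* Round-away-from-zero average of four MV components, as in
     * build_4x4uvmvs(): the sign-dependent adjustment is +4 or 4-8 = -4,
     * so e.g. 12 -> 2, -12 -> -2, 11 -> 1. */
    static int average4_rounded(int sum)
    {
        sum += 4 + ((sum >> (sizeof(sum) * CHAR_BIT - 1)) * 8);
        return sum / 8;
    }
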
+ 43 - 0
thirdparty/libvpx/vp8/common/reconinter.h

@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTER_H_
+#define VP8_COMMON_RECONINTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+                                               unsigned char *dst_y,
+                                               unsigned char *dst_u,
+                                               unsigned char *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+
+
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                                unsigned char *dst_y,
+                                                int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         unsigned char *base_pre,
+                                         int pre_stride,
+                                         vp8_subpix_fn_t sppf);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTER_H_

+ 117 - 0
thirdparty/libvpx/vp8/common/reconintra.c

@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+#include "blockd.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+
+enum {
+    SIZE_16,
+    SIZE_8,
+    NUM_SIZES,
+};
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[4][NUM_SIZES];
+static intra_pred_fn dc_pred[2][2][NUM_SIZES];
+
+static void vp8_init_intra_predictors_internal(void)
+{
+#define INIT_SIZE(sz) \
+    pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \
+    pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \
+    pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \
+ \
+    dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \
+    dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \
+    dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \
+    dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz
+
+    INIT_SIZE(16);
+    INIT_SIZE(8);
+    vp8_init_intra4x4_predictors_internal();
+}
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char * yabove_row,
+                                      unsigned char * yleft,
+                                      int left_stride,
+                                      unsigned char * ypred_ptr,
+                                      int y_stride)
+{
+    MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode;
+    DECLARE_ALIGNED(16, uint8_t, yleft_col[16]);
+    int i;
+    intra_pred_fn fn;
+
+    for (i = 0; i < 16; i++)
+    {
+        yleft_col[i] = yleft[i* left_stride];
+    }
+
+    if (mode == DC_PRED)
+    {
+        fn = dc_pred[x->left_available][x->up_available][SIZE_16];
+    }
+    else
+    {
+        fn = pred[mode][SIZE_16];
+    }
+
+    fn(ypred_ptr, y_stride, yabove_row, yleft_col);
+}
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride)
+{
+    MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
+    unsigned char uleft_col[8];
+    unsigned char vleft_col[8];
+    int i;
+    intra_pred_fn fn;
+
+    for (i = 0; i < 8; i++)
+    {
+        uleft_col[i] = uleft[i * left_stride];
+        vleft_col[i] = vleft[i * left_stride];
+    }
+
+    if (uvmode == DC_PRED)
+    {
+        fn = dc_pred[x->left_available][x->up_available][SIZE_8];
+    }
+    else
+    {
+        fn = pred[uvmode][SIZE_8];
+    }
+
+    fn(upred_ptr, pred_stride, uabove_row, uleft_col);
+    fn(vpred_ptr, pred_stride, vabove_row, vleft_col);
+}
+
+void vp8_init_intra_predictors(void)
+{
+    once(vp8_init_intra_predictors_internal);
+}

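The dc_pred table above encodes neighbor availability in its first two indices, so DC prediction degrades gracefully when context is missing. The lookup dc_pred[left_available][up_available][sz] is equivalent to this explicit selection (illustrative only):

    static intra_pred_fn select_dc_predictor(int left_available,
                                             int up_available, int sz)
    {
        if (left_available && up_available)
            return dc_pred[1][1][sz];  /* average above row and left column */
        if (up_available)
            return dc_pred[0][1][sz];  /* average the above row only */
        if (left_available)
            return dc_pred[1][0][sz];  /* average the left column only */
        return dc_pred[0][0][sz];      /* no neighbors: flat 128 fill */
    }
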
+ 44 - 0
thirdparty/libvpx/vp8/common/reconintra.h

@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA_H_
+#define VP8_COMMON_RECONINTRA_H_
+
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+                                      unsigned char *yabove_row,
+                                      unsigned char *yleft,
+                                      int left_stride,
+                                      unsigned char *ypred_ptr,
+                                      int y_stride);
+
+void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x,
+                                       unsigned char * uabove_row,
+                                       unsigned char * vabove_row,
+                                       unsigned char * uleft,
+                                       unsigned char * vleft,
+                                       int left_stride,
+                                       unsigned char * upred_ptr,
+                                       unsigned char * vpred_ptr,
+                                       int pred_stride);
+
+void vp8_init_intra_predictors(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTRA_H_

+ 54 - 0
thirdparty/libvpx/vp8/common/reconintra4x4.c

@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[10];
+
+void vp8_init_intra4x4_predictors_internal(void)
+{
+    pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+    pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+    pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+    pred[B_HE_PRED] = vpx_he_predictor_4x4;
+    pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
+    pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+    pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+    pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
+    pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+    pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+
+void vp8_intra4x4_predict(unsigned char *above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left)
+{
+    unsigned char Left[4];
+    unsigned char Aboveb[12], *Above = Aboveb + 4;
+
+    Left[0] = yleft[0];
+    Left[1] = yleft[left_stride];
+    Left[2] = yleft[2 * left_stride];
+    Left[3] = yleft[3 * left_stride];
+    memcpy(Above, above, 8);
+    Above[-1] = top_left;
+
+    pred[b_mode](dst, dst_stride, Above, Left);
+}

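The 12-byte Aboveb scratch buffer is sized so that Above, which points 4 bytes in, has a valid index -1 for the top-left pixel while Above[0..7] hold the above and above-right rows. A picture of the layout built by vp8_intra4x4_predict:

    /*   Aboveb:  [ pad pad pad TL | A0 A1 A2 A3 | R0 R1 R2 R3 ]
     *   index:     -4  -3  -2  -1    0  1  2  3    4  5  6  7
     *
     * memcpy(Above, above, 8) fills A0..R3; Above[-1] = top_left fills TL.
     * Directional modes such as B_LD_PRED (vpx_d45e_predictor_4x4) read the
     * above-right pixels R0..R3, which is why 8 bytes are copied for a
     * block only 4 pixels wide. */
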
+ 48 - 0
thirdparty/libvpx/vp8/common/reconintra4x4.h

@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+                                              unsigned char *above_right_src)
+{
+    int dst_stride = xd->dst.y_stride;
+    unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+
+    unsigned int *src_ptr = (unsigned int *)above_right_src;
+    unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+    unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+    unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+    *dst_ptr0 = *src_ptr;
+    *dst_ptr1 = *src_ptr;
+    *dst_ptr2 = *src_ptr;
+}
+
+void vp8_intra4x4_predict(unsigned char *Above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left);
+
+void vp8_init_intra4x4_predictors_internal(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_RECONINTRA4X4_H_

+ 19 - 0
thirdparty/libvpx/vp8/common/rtcd.c

@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vp8_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+
+void vp8_rtcd()
+{
+    once(setup_rtcd_internal);
+}

+ 39 - 0
thirdparty/libvpx/vp8/common/setupintrarecon.c

@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+
+    /* set up the borders of the new frame for intra prediction */
+    memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+    for (i = 0; i < ybf->y_height; i++)
+        ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
+
+    memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    for (i = 0; i < ybf->uv_height; i++)
+        ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+    memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    for (i = 0; i < ybf->uv_height; i++)
+        ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+}
+
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
+{
+    memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+    memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+    memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+}

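The constants are fixed by the VP8 bitstream (RFC 6386): samples above the visible frame predict as 127 and samples to its left as 129, so encoder and decoder reconstruct intra blocks at the frame edge from identical data. After vp8_setup_intra_recon() the luma plane looks like:

    /*   127 127 127 127 ...      <- row at y_buffer - 1 - y_stride
     *   129 | p00 p01 p02 ...
     *   129 | p10 p11 p12 ...    <- y_buffer[i*y_stride - 1] for each row i
     *   129 | p20 p21 p22 ...
     */
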
+ 45 - 0
thirdparty/libvpx/vp8/common/setupintrarecon.h

@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SETUPINTRARECON_H_
+#define VP8_COMMON_SETUPINTRARECON_H_
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
+                                          unsigned char *u_buffer,
+                                          unsigned char *v_buffer,
+                                          int y_stride,
+                                          int uv_stride)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+        y_buffer[y_stride *i] = (unsigned char) 129;
+
+    for (i = 0; i < 8; i++)
+        u_buffer[uv_stride *i] = (unsigned char) 129;
+
+    for (i = 0; i < 8; i++)
+        v_buffer[uv_stride *i] = (unsigned char) 129;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SETUPINTRARECON_H_

+ 34 - 0
thirdparty/libvpx/vp8/common/swapyv12buffer.c

@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
+{
+    unsigned char *temp;
+
+    temp = last_frame->buffer_alloc;
+    last_frame->buffer_alloc = new_frame->buffer_alloc;
+    new_frame->buffer_alloc = temp;
+
+    temp = last_frame->y_buffer;
+    last_frame->y_buffer = new_frame->y_buffer;
+    new_frame->y_buffer = temp;
+
+    temp = last_frame->u_buffer;
+    last_frame->u_buffer = new_frame->u_buffer;
+    new_frame->u_buffer = temp;
+
+    temp = last_frame->v_buffer;
+    last_frame->v_buffer = new_frame->v_buffer;
+    new_frame->v_buffer = temp;
+}

+ 27 - 0
thirdparty/libvpx/vp8/common/swapyv12buffer.h

@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
+#define VP8_COMMON_SWAPYV12BUFFER_H_
+
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SWAPYV12BUFFER_H_

+ 27 - 0
thirdparty/libvpx/vp8/common/systemdependent.h

@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
+
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SYSTEMDEPENDENT_H_

+ 232 - 0
thirdparty/libvpx/vp8/common/threading.h

@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION unsigned int __stdcall
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+    DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+typedef struct
+{
+    HEV  event;
+    HMTX wait_mutex;
+    HMTX count_mutex;
+    int  count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+    DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+                      value > 0 ? TRUE : FALSE);
+    DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+    DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+    sem->count = value;
+
+    return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+    DosRequestMutexSem(sem->wait_mutex, -1);
+
+    DosWaitEventSem(sem->event, -1);
+
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    sem->count--;
+    if (sem->count == 0)
+    {
+        ULONG post_count;
+
+        DosResetEventSem(sem->event, &post_count);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    DosReleaseMutexSem(sem->wait_mutex);
+
+    return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    if (sem->count < 32768)
+    {
+        sem->count++;
+        DosPostEventSem(sem->event);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+    DosCloseEventSem(sem->event);
+    DosCloseMutexSem(sem->wait_mutex);
+    DosCloseMutexSem(sem->count_mutex);
+
+    return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#include "vpx_util/vpx_thread.h"
+
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+    const int kMaxTryLocks = 4000;
+    int locked = 0;
+    int i;
+
+    for (i = 0; i < kMaxTryLocks; ++i) {
+        if (!pthread_mutex_trylock(mutex)) {
+            locked = 1;
+            break;
+        }
+    }
+
+    if (!locked)
+        pthread_mutex_lock(mutex);
+}
+
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+    int ret;
+    mutex_lock(mutex);
+    ret = *p;
+    pthread_mutex_unlock(mutex);
+    return ret;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+                             const int *last_row_current_mb_col,
+                             const int nsync) {
+    while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+        x86_pause_hint();
+        thread_sleep(0);
+    }
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+    mutex_lock(mutex);
+    *p = v;
+    pthread_mutex_unlock(mutex);
+}
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_THREADING_H_

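sync_read() and protected_write() exist to support row-parallel decoding: the thread on macroblock row r spins (with PAUSE hints) until the thread on row r-1 has advanced at least nsync macroblocks past its own column, then publishes its progress for the row below. A sketch of a worker built on the helpers above (the worker shape is illustrative, not the library's actual thread entry point):

    static void decode_mb_row(pthread_mutex_t *prev_row_mutex,
                              const int *prev_row_col,
                              pthread_mutex_t *this_row_mutex,
                              int *this_row_col,
                              int mb_cols, int nsync)
    {
        int mb_col;

        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
            /* Block until the row above is nsync macroblocks ahead. */
            sync_read(prev_row_mutex, mb_col, prev_row_col, nsync);

            /* ... decode macroblock (r, mb_col) here ... */

            /* Publish progress so the row below can proceed. */
            protected_write(this_row_mutex, this_row_col, mb_col);
        }
    }
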
+ 143 - 0
thirdparty/libvpx/vp8/common/treecoder.c

@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+    struct vp8_token_struct *const p,
+    vp8_tree t,
+    int i,
+    int v,
+    int L
+)
+{
+    v += v;
+    ++L;
+
+    do
+    {
+        const vp8_tree_index j = t[i++];
+
+        if (j <= 0)
+        {
+            p[-j].value = v;
+            p[-j].Len = L;
+        }
+        else
+            tree2tok(p, t, j, v, L);
+    }
+    while (++v & 1);
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
+{
+    tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+                                 int offset)
+{
+    tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ]
+)
+{
+    const int tree_len = n - 1;
+    int t = 0;
+
+#if CONFIG_DEBUG
+    assert(tree_len);
+#endif
+
+    do
+    {
+        branch_ct[t][0] = branch_ct[t][1] = 0;
+    }
+    while (++t < tree_len);
+
+    t = 0;
+
+    do
+    {
+        int L = tok[t].Len;
+        const int enc = tok[t].value;
+        const unsigned int ct = num_events[t];
+
+        vp8_tree_index i = 0;
+
+        do
+        {
+            const int b = (enc >> --L) & 1;
+            const int j = i >> 1;
+#if CONFIG_DEBUG
+            assert(j < tree_len  &&  0 <= L);
+#endif
+
+            branch_ct [j] [b] += ct;
+            i = tree[ i + b];
+        }
+        while (i > 0);
+
+#if CONFIG_DEBUG
+        assert(!L);
+#endif
+    }
+    while (++t < n);
+}
+
+
+void vp8_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    unsigned int Pfac,
+    int rd
+)
+{
+    const int tree_len = n - 1;
+    int t = 0;
+
+    branch_counts(n, tok, tree, branch_ct, num_events);
+
+    do
+    {
+        const unsigned int *const c = branch_ct[t];
+        const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+        assert(tot < (1 << 24));        /* no overflow below */
+#endif
+
+        if (tot)
+        {
+            const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+            probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+        }
+        else
+            probs[t] = vp8_prob_half;
+    }
+    while (++t < tree_len);
+}

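One node of vp8_tree_probs_from_distribution, worked through by hand. Suppose branch_ct[t] = {30, 10} after branch_counts(), with Pfac = 256 and rd = 1:

    /*   tot = 30 + 10 = 40
     *   p   = (30*256 + 40/2) / 40 = (7680 + 20) / 40 = 192
     *
     * probs[t] = 192: the 0-branch is coded with probability 192/256 = 0.75.
     * The clamp to [1, 255] keeps both branches representable; a vp8_prob of
     * 0 (or 256) would make one branch impossible for the 8-bit bool-coder
     * to signal. */
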
+ 98 - 0
thirdparty/libvpx/vp8/common/treecoder.h

@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_TREECODER_H_
+#define VP8_COMMON_TREECODER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+#define vp8_complement(x) (255 - (x))
+
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vp8_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even;  processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
+
+
+typedef const struct vp8_token_struct
+{
+    int value;
+    int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+                                 int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+   for the associated binary encoding tree.  Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+   probability updates. */
+
+void vp8_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    unsigned int Pfactor,
+    int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+    int n,                      /* n = size of alphabet */
+    vp8_token tok               [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs          [ /* n-1 */ ],
+    unsigned int branch_ct       [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    c_bool_coder_spec *s
+);
+
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_TREECODER_H_

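A concrete instance of the compact-array convention described above, for an illustrative 3-symbol alphabet with A = 0, B = 1, C = 2 coded as "0", "10", "11":

    static const vp8_tree_index abc_tree[4] =
    {
        -0, 2,    /* node 0: bit 0 ends at A (= -0), bit 1 -> node at index 2 */
        -1, -2    /* node 2: bit 0 ends at B (= -1), bit 1 ends at C (= -2)   */
    };

    /* vp8_tokens_from_tree(toks, abc_tree) fills the {value, Len} pairs
     * { 0, 1 }, { 2, 2 }, { 3, 2 } -- the same shape as the encoding
     * tables in vp8_entropymodedata.h below. */
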
+ 254 - 0
thirdparty/libvpx/vp8/common/vp8_entropymodedata.h

@@ -0,0 +1,254 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Generated file, included by entropymode.c */
+
+
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 28, 5 },
+    { 30, 5 },
+    { 58, 6 },
+    { 59, 6 },
+    { 62, 6 },
+    { 126, 7 },
+    { 127, 7 }
+};
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
+{
+    { 0, 1 },
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
+{
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 },
+    { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
+{
+    { 6, 3 },
+    { 7, 3 },
+    { 2, 2 },
+    { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
+{
+    { 2, 2 },
+    { 6, 3 },
+    { 0, 1 },
+    { 14, 4 },
+    { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
+{
+    { 0, 1 },
+    { 2, 2 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_small_mvencodings[8] =
+{
+    { 0, 3 },
+    { 1, 3 },
+    { 2, 3 },
+    { 3, 3 },
+    { 4, 3 },
+    { 5, 3 },
+    { 6, 3 },
+    { 7, 3 }
+};
+
+const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
+{
+    112, 86, 140, 37
+};
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
+{
+    145, 156, 163, 128
+};
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
+{
+    162, 101, 204
+};
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
+{
+    142, 114, 183
+};
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
+{
+    120, 90, 79, 133, 87, 85, 80, 111, 151
+};
+
+
+
+const vp8_prob vp8_kf_bmode_prob
+[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
+{
+    {
+        { 231, 120,  48,  89, 115, 113, 120, 152, 112 },
+        { 152, 179,  64, 126, 170, 118,  46,  70,  95 },
+        { 175,  69, 143,  80,  85,  82,  72, 155, 103 },
+        {  56,  58,  10, 171, 218, 189,  17,  13, 152 },
+        { 144,  71,  10,  38, 171, 213, 144,  34,  26 },
+        { 114,  26,  17, 163,  44, 195,  21,  10, 173 },
+        { 121,  24,  80, 195,  26,  62,  44,  64,  85 },
+        { 170,  46,  55,  19, 136, 160,  33, 206,  71 },
+        {  63,  20,   8, 114, 114, 208,  12,   9, 226 },
+        {  81,  40,  11,  96, 182,  84,  29,  16,  36 }
+    },
+    {
+        { 134, 183,  89, 137,  98, 101, 106, 165, 148 },
+        {  72, 187, 100, 130, 157, 111,  32,  75,  80 },
+        {  66, 102, 167,  99,  74,  62,  40, 234, 128 },
+        {  41,  53,   9, 178, 241, 141,  26,   8, 107 },
+        { 104,  79,  12,  27, 217, 255,  87,  17,   7 },
+        {  74,  43,  26, 146,  73, 166,  49,  23, 157 },
+        {  65,  38, 105, 160,  51,  52,  31, 115, 128 },
+        {  87,  68,  71,  44, 114,  51,  15, 186,  23 },
+        {  47,  41,  14, 110, 182, 183,  21,  17, 194 },
+        {  66,  45,  25, 102, 197, 189,  23,  18,  22 }
+    },
+    {
+        {  88,  88, 147, 150,  42,  46,  45, 196, 205 },
+        {  43,  97, 183, 117,  85,  38,  35, 179,  61 },
+        {  39,  53, 200,  87,  26,  21,  43, 232, 171 },
+        {  56,  34,  51, 104, 114, 102,  29,  93,  77 },
+        { 107,  54,  32,  26,  51,   1,  81,  43,  31 },
+        {  39,  28,  85, 171,  58, 165,  90,  98,  64 },
+        {  34,  22, 116, 206,  23,  34,  43, 166,  73 },
+        {  68,  25, 106,  22,  64, 171,  36, 225, 114 },
+        {  34,  19,  21, 102, 132, 188,  16,  76, 124 },
+        {  62,  18,  78,  95,  85,  57,  50,  48,  51 }
+    },
+    {
+        { 193, 101,  35, 159, 215, 111,  89,  46, 111 },
+        {  60, 148,  31, 172, 219, 228,  21,  18, 111 },
+        { 112, 113,  77,  85, 179, 255,  38, 120, 114 },
+        {  40,  42,   1, 196, 245, 209,  10,  25, 109 },
+        { 100,  80,   8,  43, 154,   1,  51,  26,  71 },
+        {  88,  43,  29, 140, 166, 213,  37,  43, 154 },
+        {  61,  63,  30, 155,  67,  45,  68,   1, 209 },
+        { 142,  78,  78,  16, 255, 128,  34, 197, 171 },
+        {  41,  40,   5, 102, 211, 183,   4,   1, 221 },
+        {  51,  50,  17, 168, 209, 192,  23,  25,  82 }
+    },
+    {
+        { 125,  98,  42,  88, 104,  85, 117, 175,  82 },
+        {  95,  84,  53,  89, 128, 100, 113, 101,  45 },
+        {  75,  79, 123,  47,  51, 128,  81, 171,   1 },
+        {  57,  17,   5,  71, 102,  57,  53,  41,  49 },
+        { 115,  21,   2,  10, 102, 255, 166,  23,   6 },
+        {  38,  33,  13, 121,  57,  73,  26,   1,  85 },
+        {  41,  10,  67, 138,  77, 110,  90,  47, 114 },
+        { 101,  29,  16,  10,  85, 128, 101, 196,  26 },
+        {  57,  18,  10, 102, 102, 213,  34,  20,  43 },
+        { 117,  20,  15,  36, 163, 128,  68,   1,  26 }
+    },
+    {
+        { 138,  31,  36, 171,  27, 166,  38,  44, 229 },
+        {  67,  87,  58, 169,  82, 115,  26,  59, 179 },
+        {  63,  59,  90, 180,  59, 166,  93,  73, 154 },
+        {  40,  40,  21, 116, 143, 209,  34,  39, 175 },
+        {  57,  46,  22,  24, 128,   1,  54,  17,  37 },
+        {  47,  15,  16, 183,  34, 223,  49,  45, 183 },
+        {  46,  17,  33, 183,   6,  98,  15,  32, 183 },
+        {  65,  32,  73, 115,  28, 128,  23, 128, 205 },
+        {  40,   3,   9, 115,  51, 192,  18,   6, 223 },
+        {  87,  37,   9, 115,  59,  77,  64,  21,  47 }
+    },
+    {
+        { 104,  55,  44, 218,   9,  54,  53, 130, 226 },
+        {  64,  90,  70, 205,  40,  41,  23,  26,  57 },
+        {  54,  57, 112, 184,   5,  41,  38, 166, 213 },
+        {  30,  34,  26, 133, 152, 116,  10,  32, 134 },
+        {  75,  32,  12,  51, 192, 255, 160,  43,  51 },
+        {  39,  19,  53, 221,  26, 114,  32,  73, 255 },
+        {  31,   9,  65, 234,   2,  15,   1, 118,  73 },
+        {  88,  31,  35,  67, 102,  85,  55, 186,  85 },
+        {  56,  21,  23, 111,  59, 205,  45,  37, 192 },
+        {  55,  38,  70, 124,  73, 102,   1,  34,  98 }
+    },
+    {
+        { 102,  61,  71,  37,  34,  53,  31, 243, 192 },
+        {  69,  60,  71,  38,  73, 119,  28, 222,  37 },
+        {  68,  45, 128,  34,   1,  47,  11, 245, 171 },
+        {  62,  17,  19,  70, 146,  85,  55,  62,  70 },
+        {  75,  15,   9,   9,  64, 255, 184, 119,  16 },
+        {  37,  43,  37, 154, 100, 163,  85, 160,   1 },
+        {  63,   9,  92, 136,  28,  64,  32, 201,  85 },
+        {  86,   6,  28,   5,  64, 255,  25, 248,   1 },
+        {  56,   8,  17, 132, 137, 255,  55, 116, 128 },
+        {  58,  15,  20,  82, 135,  57,  26, 121,  40 }
+    },
+    {
+        { 164,  50,  31, 137, 154, 133,  25,  35, 218 },
+        {  51, 103,  44, 131, 131, 123,  31,   6, 158 },
+        {  86,  40,  64, 135, 148, 224,  45, 183, 128 },
+        {  22,  26,  17, 131, 240, 154,  14,   1, 209 },
+        {  83,  12,  13,  54, 192, 255,  68,  47,  28 },
+        {  45,  16,  21,  91,  64, 222,   7,   1, 197 },
+        {  56,  21,  39, 155,  60, 138,  23, 102, 213 },
+        {  85,  26,  85,  85, 128, 128,  32, 146, 171 },
+        {  18,  11,   7,  63, 144, 171,   4,   4, 246 },
+        {  35,  27,  10, 146, 174, 171,  12,  26, 128 }
+    },
+    {
+        { 190,  80,  35,  99, 180,  80, 126,  54,  45 },
+        {  85, 126,  47,  87, 176,  51,  41,  20,  32 },
+        { 101,  75, 128, 139, 118, 146, 116, 128,  85 },
+        {  56,  41,  15, 176, 236,  85,  37,   9,  62 },
+        { 146,  36,  19,  30, 171, 255,  97,  27,  20 },
+        {  71,  30,  17, 119, 118, 255,  17,  18, 138 },
+        { 101,  38,  60, 138,  55,  70,  43,  26, 142 },
+        { 138,  45,  61,  62, 219,   1,  81, 188,  64 },
+        {  32,  41,  20, 117, 151, 142,  20,  21, 163 },
+        { 112,  19,  12,  61, 195, 128,  48,   4,  24 }
+    }
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_VP8_ENTROPYMODEDATA_H_

+ 661 - 0
thirdparty/libvpx/vp8/common/vp8_loopfilter.c

@@ -0,0 +1,661 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+    int filt_lvl;
+
+    for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+    {
+        if (filt_lvl >= 40)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+        }
+        else if (filt_lvl >= 20)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+        }
+        else if (filt_lvl >= 15)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+        }
+        else
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+        }
+    }
+
+    lfi->mode_lf_lut[DC_PRED] = 1;
+    lfi->mode_lf_lut[V_PRED] = 1;
+    lfi->mode_lf_lut[H_PRED] = 1;
+    lfi->mode_lf_lut[TM_PRED] = 1;
+    lfi->mode_lf_lut[B_PRED]  = 0;
+
+    lfi->mode_lf_lut[ZEROMV]  = 1;
+    lfi->mode_lf_lut[NEARESTMV] = 2;
+    lfi->mode_lf_lut[NEARMV] = 2;
+    lfi->mode_lf_lut[NEWMV] = 2;
+    lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl)
+{
+    int i;
+
+    /* For each possible value for the loop filter fill out limits */
+    for (i = 0; i <= MAX_LOOP_FILTER; i++)
+    {
+        int filt_lvl = i;
+        int block_inside_limit = 0;
+
+        /* Set loop filter parameters that control sharpness. */
+        block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+        block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
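+        /* (sharpness_lvl > 0) and (sharpness_lvl > 4) evaluate to 0 or 1,
+         * so the limit is halved once for any nonzero sharpness and again
+         * for sharpness above 4. */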
+
+        if (sharpness_lvl > 0)
+        {
+            if (block_inside_limit > (9 - sharpness_lvl))
+                block_inside_limit = (9 - sharpness_lvl);
+        }
+
+        if (block_inside_limit < 1)
+            block_inside_limit = 1;
+
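+        /* e.g. filt_lvl 10, sharpness 0: lim = 10, blim = 2*10 + 10 = 30,
+         * mblim = 2*(10+2) + 10 = 34 */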
+        memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+        memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+               SIMD_WIDTH);
+    }
+}
+
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+    loop_filter_info_n *lfi = &cm->lf_info;
+    int i;
+
+    /* init limits for given sharpness */
+    vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    /* init LUT for lvl and hev thr picking */
+    lf_init_lut(lfi);
+
+    /* init hev threshold const vectors */
+    for(i = 0; i < 4 ; i++)
+    {
+        memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+    }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+                                MACROBLOCKD *mbd,
+                                int default_filt_lvl)
+{
+    int seg,  /* segment number */
+        ref,  /* index in ref_lf_deltas */
+        mode; /* index in mode_lf_deltas */
+
+    loop_filter_info_n *lfi = &cm->lf_info;
+
+    /* update limits if sharpness has changed */
+    if(cm->last_sharpness_level != cm->sharpness_level)
+    {
+        vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        int lvl_seg = default_filt_lvl;
+        int lvl_ref, lvl_mode;
+
+        /* Note the baseline filter values for each segment */
+        if (mbd->segmentation_enabled)
+        {
+            /* Abs value */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            {
+                lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            }
+            else  /* Delta Value */
+            {
+                lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            }
+            lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
+        }
+
+        if (!mbd->mode_ref_lf_delta_enabled)
+        {
+            /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; encoder always uses deltas
+             */
+            memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            continue;
+        }
+
+        /* INTRA_FRAME */
+        ref = INTRA_FRAME;
+
+        /* Apply delta for reference frame */
+        lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+        /* Apply delta for Intra modes */
+        mode = 0; /* B_PRED */
+        /* Only B_PRED, the sub-block intra mode, has its own mode delta;
+         * the remaining intra modes use lvl_ref unchanged below */
+        lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+        /* clamp */
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        mode = 1; /* all the rest of Intra modes */
+        /* clamp */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0;
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        /* LAST, GOLDEN, ALT */
+        for(ref = 1; ref < MAX_REF_FRAMES; ref++)
+        {
+            /* Apply delta for reference frame */
+            lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref];
+
+            /* Apply delta for Inter modes */
+            for (mode = 1; mode < 4; mode++)
+            {
+                lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+                /* clamp */
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0;
+
+                lfi->lvl[seg][ref][mode] = lvl_mode;
+            }
+        }
+    }
+}
+
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+            lfi.mblim = lfi_n->mblim[filter_level];
+            lfi.blim = lfi_n->blim[filter_level];
+            lfi.lim = lfi_n->lim[filter_level];
+            lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
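+            /* edge filter order: MB left edge, interior vertical edges,
+             * MB top edge, interior horizontal edges */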
+            if (mb_col > 0)
+                vp8_loop_filter_mbv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_mbh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    (void)post_uvstride;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            if (mb_col > 0)
+                vp8_loop_filter_simple_mbv
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bv
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_simple_mbh
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bh
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
+
+void vp8_loop_filter_frame(VP8_COMMON *cm,
+                           MACROBLOCKD *mbd,
+                           int frame_type)
+{
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int mb_row;
+    int mb_col;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
+
+    int filter_level;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int post_y_stride = post->y_stride;
+    int post_uv_stride = post->uv_stride;
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+    /* Set up the buffer pointers */
+    y_ptr = post->y_buffer;
+    u_ptr = post->u_buffer;
+    v_ptr = post->v_buffer;
+
+    /* vp8_filter each macro block */
+    if (cm->filter_type == NORMAL_LOOPFILTER)
+    {
+        for (mb_row = 0; mb_row < mb_rows; mb_row++)
+        {
+            for (mb_col = 0; mb_col < mb_cols; mb_col++)
+            {
+                int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                                mode_info_context->mbmi.mode != SPLITMV &&
+                                mode_info_context->mbmi.mb_skip_coeff);
+
+                const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+                const int seg = mode_info_context->mbmi.segment_id;
+                const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+                filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+                if (filter_level)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_mbh
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+                }
+
+                y_ptr += 16;
+                u_ptr += 8;
+                v_ptr += 8;
+
+                mode_info_context++;     /* step to next MB */
+            }
+            y_ptr += post_y_stride  * 16 - post->y_width;
+            u_ptr += post_uv_stride *  8 - post->uv_width;
+            v_ptr += post_uv_stride *  8 - post->uv_width;
+
+            mode_info_context++;         /* Skip border mb */
+
+        }
+    }
+    else /* SIMPLE_LOOPFILTER */
+    {
+        for (mb_row = 0; mb_row < mb_rows; mb_row++)
+        {
+            for (mb_col = 0; mb_col < mb_cols; mb_col++)
+            {
+                int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                                mode_info_context->mbmi.mode != SPLITMV &&
+                                mode_info_context->mbmi.mb_skip_coeff);
+
+                const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+                const int seg = mode_info_context->mbmi.segment_id;
+                const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+                filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+                if (filter_level)
+                {
+                    const unsigned char * mblim = lfi_n->mblim[filter_level];
+                    const unsigned char * blim = lfi_n->blim[filter_level];
+
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post_y_stride, mblim);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post_y_stride, blim);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_simple_mbh
+                        (y_ptr, post_y_stride, mblim);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post_y_stride, blim);
+                }
+
+                y_ptr += 16;
+                u_ptr += 8;
+                v_ptr += 8;
+
+                mode_info_context++;     /* step to next MB */
+            }
+            y_ptr += post_y_stride  * 16 - post->y_width;
+            u_ptr += post_uv_stride *  8 - post->uv_width;
+            v_ptr += post_uv_stride *  8 - post->uv_width;
+
+            mode_info_context++;         /* Skip border mb */
+
+        }
+    }
+}
+
+void vp8_loop_filter_frame_yonly
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl
+)
+{
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+    unsigned char *y_ptr;
+    int mb_row;
+    int mb_col;
+
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int filter_level;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+    /* Set up the buffer pointers */
+    y_ptr = post->y_buffer;
+
+    /* vp8_filter each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);
+
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+            if (filter_level)
+            {
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_mbh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        vp8_loop_filter_simple_mbh
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
+            }
+
+            y_ptr += 16;
+            mode_info_context ++;        /* step to next MB */
+
+        }
+
+        y_ptr += post->y_stride  * 16 - post->y_width;
+        mode_info_context ++;            /* Skip border mb */
+    }
+
+}
+
+void vp8_loop_filter_partial_frame
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl
+)
+{
+    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+    unsigned char *y_ptr;
+    int mb_row;
+    int mb_col;
+    int mb_cols = post->y_width >> 4;
+    int mb_rows = post->y_height >> 4;
+
+    int linestocopy;
+
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
+    int filter_level;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    const MODE_INFO *mode_info_context;
+
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif
+
+    /* Initialize the loop filter for this frame. */
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+    /* number of MB rows to use in partial filtering */
+    linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
+
+    /* Set up the buffer pointers; partial image starts at ~middle of frame */
+    y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+    mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
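+    /* y_height >> 5 is the MB row at half the frame height; the mi array
+     * stride is mb_cols + 1 to include the border MB on each row */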
+
+    /* vp8_filter each macro block */
+    for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++)
+    {
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
+        {
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                           mode_info_context->mbmi.mode != SPLITMV &&
+                           mode_info_context->mbmi.mb_skip_coeff);
+
+            const int mode_index =
+                lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+            if (filter_level)
+            {
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+                    if (mb_col > 0)
+                        vp8_loop_filter_mbv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bv
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    vp8_loop_filter_mbh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_bh
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        vp8_loop_filter_simple_mbv
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bv
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    vp8_loop_filter_simple_mbh
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        vp8_loop_filter_simple_bh
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
+            }
+
+            y_ptr += 16;
+            mode_info_context += 1;      /* step to next MB */
+        }
+
+        y_ptr += post->y_stride  * 16 - post->y_width;
+        mode_info_context += 1;          /* Skip border mb */
+    }
+}

+ 93 - 0
thirdparty/libvpx/vp8/common/x86/copy_sse2.asm

@@ -0,0 +1,93 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;dst_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;dst_stride
+        movsxd          rcx,        dword ptr arg(4) ;height
+
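+        ; Copy four 32-byte rows per iteration; sources are read unaligned
+        ; (movdqu) while the destination is written aligned (movdqa), so
+        ; dst_ptr must be 16-byte aligned.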
+.block_copy_sse2_loopx4:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,        [rsi+rax*2]
+
+        movdqu          xmm4,       XMMWORD PTR [rsi]
+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]
+
+        lea             rsi,    [rsi+rax*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        movdqa          XMMWORD PTR [rdi + rdx], xmm2
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3
+
+        lea             rdi,    [rdi+rdx*2]
+
+        movdqa          XMMWORD PTR [rdi], xmm4
+        movdqa          XMMWORD PTR [rdi + 16], xmm5
+        movdqa          XMMWORD PTR [rdi + rdx], xmm6
+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7
+
+        lea             rdi,    [rdi+rdx*2]
+
+        sub             rcx,     4
+        cmp             rcx,     4
+        jge             .block_copy_sse2_loopx4
+
+        cmp             rcx, 0
+        je              .copy_is_done
+
+.block_copy_sse2_loop:
+        movdqu          xmm0,       XMMWORD PTR [rsi]
+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
+        lea             rsi,    [rsi+rax]
+
+        movdqa          XMMWORD PTR [rdi], xmm0
+        movdqa          XMMWORD PTR [rdi + 16], xmm1
+        lea             rdi,    [rdi+rdx]
+
+        sub             rcx,     1
+        jne             .block_copy_sse2_loop
+
+.copy_is_done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+ 146 - 0
thirdparty/libvpx/vp8/common/x86/copy_sse3.asm

@@ -0,0 +1,146 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
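+; Map the function arguments onto a common set of register names for each
+; ABI: 32-bit (args on the stack), Win64, and SysV AMD64.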
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+  %define     src_ptr       rsi
+  %define     src_stride    rax
+  %define     ref_ptr       rdi
+  %define     ref_stride    rdx
+  %define     end_ptr       rcx
+  %define     ret_var       rbx
+  %define     result_ptr    arg(4)
+  %define     max_sad       arg(4)
+  %define     height        dword ptr arg(4)
+    push        rbp
+    mov         rbp,        rsp
+    push        rsi
+    push        rdi
+    push        rbx
+
+    mov         rsi,        arg(0)              ; src_ptr
+    mov         rdi,        arg(2)              ; ref_ptr
+
+    movsxd      rax,        dword ptr arg(1)    ; src_stride
+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
+%else
+  %if LIBVPX_YASM_WIN64
+    SAVE_XMM 7, u
+    %define     src_ptr     rcx
+    %define     src_stride  rdx
+    %define     ref_ptr     r8
+    %define     ref_stride  r9
+    %define     end_ptr     r10
+    %define     ret_var     r11
+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
+    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
+  %else
+    %define     src_ptr     rdi
+    %define     src_stride  rsi
+    %define     ref_ptr     rdx
+    %define     ref_stride  rcx
+    %define     end_ptr     r9
+    %define     ret_var     r10
+    %define     result_ptr  r8
+    %define     max_sad     r8
+    %define     height      r8
+  %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+  %define     src_ptr
+  %define     src_stride
+  %define     ref_ptr
+  %define     ref_stride
+  %define     end_ptr
+  %define     ret_var
+  %define     result_ptr
+  %define     max_sad
+  %define     height
+
+%if ABI_IS_32BIT
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    pop         rbp
+%else
+  %if LIBVPX_YASM_WIN64
+    RESTORE_XMM
+  %endif
+%endif
+    ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *dst_ptr,
+;    int  dst_stride,
+;    int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+    STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+        lea             end_ptr,    [src_ptr+src_stride*2]
+
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
+        movdqu          xmm4,       XMMWORD PTR [end_ptr]
+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
+
+        lea             src_ptr,    [src_ptr+src_stride*4]
+
+        lea             end_ptr,    [ref_ptr+ref_stride*2]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+        movdqa          XMMWORD PTR [end_ptr], xmm4
+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+        lea             ref_ptr,    [ref_ptr+ref_stride*4]
+
+        sub             height,     4
+        cmp             height,     4
+        jge             .block_copy_sse3_loopx4
+
+        ; Check whether more rows remain to be copied.
+        cmp             height, 0
+        je              .copy_is_done
+
+.block_copy_sse3_loop:
+        movdqu          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
+        lea             src_ptr,    [src_ptr+src_stride]
+
+        movdqa          XMMWORD PTR [ref_ptr], xmm0
+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
+        lea             ref_ptr,    [ref_ptr+ref_stride]
+
+        sub             height,     1
+        jne             .block_copy_sse3_loop
+
+.copy_is_done:
+    STACK_FRAME_DESTROY_X3

+ 258 - 0
thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm

@@ -0,0 +1,258 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov       rsi, arg(0) ;sq
+        mov       rdi, arg(1) ;dq
+        mov       rax, arg(2) ;q
+
+        movq      mm1, [rsi]
+        pmullw    mm1, [rax+0]            ; dequantize: coeffs 0-3 *= q
+        movq      [rdi], mm1
+
+        movq      mm1, [rsi+8]
+        pmullw    mm1, [rax+8]            ; coeffs 4-7 *= q
+        movq      [rdi+8], mm1
+
+        movq      mm1, [rsi+16]
+        pmullw    mm1, [rax+16]           ; coeffs 8-11 *= q
+        movq      [rdi+16], mm1
+
+        movq      mm1, [rsi+24]
+        pmullw    mm1, [rax+24]           ; coeffs 12-15 *= q
+        movq      [rdi+24], mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input,            0
+;short *dq,               1
+;unsigned char *dest,     2
+;int stride)              3
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+        mov         rax,    arg(0) ;input
+        mov         rdx,    arg(1) ;dq
+
+
+        movq        mm0,    [rax   ]
+        pmullw      mm0,    [rdx]
+
+        movq        mm1,    [rax +8]
+        pmullw      mm1,    [rdx +8]
+
+        movq        mm2,    [rax+16]
+        pmullw      mm2,    [rdx+16]
+
+        movq        mm3,    [rax+24]
+        pmullw      mm3,    [rdx+24]
+
+        mov         rdx,    arg(2) ;dest
+
+        pxor        mm7,    mm7
+
+
+        movq        [rax],   mm7
+        movq        [rax+8], mm7
+
+        movq        [rax+16],mm7
+        movq        [rax+24],mm7
+
+
+        movsxd      rdi,            dword ptr arg(3) ;stride
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        movq        mm3,            mm5             ; 33 23 13 03
+
+        psubw       mm0,            mm2             ; b1= 0-2
+        paddw       mm2,            mm2             ;
+
+        movq        mm5,            mm1
+        paddw       mm2,            mm0             ; a1 =0+2
+
+        pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+        movq        mm7,            mm3             ;
+        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       mm7,            mm5             ; c1
+
+        movq        mm5,            mm1
+        movq        mm4,            mm3
+
+        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+        paddw       mm5,            mm1
+
+        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+        paddw       mm3,            mm4
+
+        paddw       mm3,            mm5             ; d1
+        paddw       mm0,            [GLOBAL(fours)]
+
+        paddw       mm2,            [GLOBAL(fours)]
+        movq        mm6,            mm2             ; a1
+
+        movq        mm4,            mm0             ; b1
+        paddw       mm2,            mm3             ;0
+
+        paddw       mm4,            mm7             ;1
+        psubw       mm0,            mm7             ;2
+
+        psubw       mm6,            mm3             ;3
+        psraw       mm2,            3
+
+        psraw       mm0,            3
+        psraw       mm4,            3
+
+        psraw       mm6,            3
+
+        movq        mm1,            mm2             ; 03 02 01 00
+        movq        mm3,            mm4             ; 23 22 21 20
+
+        punpcklwd   mm1,            mm0             ; 11 01 10 00
+        punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+        punpcklwd   mm3,            mm6             ; 31 21 30 20
+        punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+        movq        mm0,            mm1             ; 11 01 10 00
+        movq        mm5,            mm2             ; 13 03 12 02
+
+        punpckldq   mm0,            mm3             ; 30 20 10 00
+        punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+        punpckldq   mm2,            mm4             ; 32 22 12 02
+        punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+        pxor        mm7,            mm7
+
+        movd        mm4,            [rdx]
+        punpcklbw   mm4,            mm7
+        paddsw      mm0,            mm4
+        packuswb    mm0,            mm7
+        movd        [rdx],          mm0
+
+        movd        mm4,            [rdx+rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm1,            mm4
+        packuswb    mm1,            mm7
+        movd        [rdx+rdi],      mm1
+
+        movd        mm4,            [rdx+2*rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm2,            mm4
+        packuswb    mm2,            mm7
+        movd        [rdx+rdi*2],    mm2
+
+        add         rdx,            rdi
+
+        movd        mm4,            [rdx+2*rdi]
+        punpcklbw   mm4,            mm7
+        paddsw      mm5,            mm4
+        packuswb    mm5,            mm7
+        movd        [rdx+rdi*2],    mm5
+
+    ; begin epilog
+    pop rdi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B
+align 16
+fours:
+    times 4 dw 0x0004

+ 35 - 0
thirdparty/libvpx/vp8/common/x86/filter_x86.c

@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/x86/filter_x86.h"
+
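+/* 7-bit fixed point bilinear taps: each pair of taps sums to 128, and
+ * entry i holds the filter for a sub-pixel offset of i/8. */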
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+    { 128, 128, 128, 128,   0,   0,   0,   0 },
+    { 112, 112, 112, 112,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  96,  96,  96,  96 },
+    {  16,  16,  16,  16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+    { 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0 },
+    { 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96 },
+    {  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};

+ 33 - 0
thirdparty/libvpx/vp8/common/x86/filter_x86.h

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_X86_FILTER_X86_H_
+#define VP8_COMMON_X86_FILTER_X86_H_
+
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+
+/* duplicated 4x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
+
+/* duplicated 8x */
+extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_X86_FILTER_X86_H_

+ 128 - 0
thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c

@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+    short *sq = (short *) d->qcoeff;
+    short *dq = (short *) d->dqcoeff;
+
+    vp8_dequantize_b_impl_mmx(sq, dq, DQC);
+}
+
+void vp8_dequant_idct_add_y_block_mmx
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
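+    /* Per 4x4 block: eob > 1 means AC coefficients are present, so the
+     * full dequant + IDCT runs; eob == 1 means only the DC coefficient
+     * is set, so the cheaper DC-only add is used and the coefficient
+     * storage is zeroed by hand. */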
+    for (i = 0; i < 4; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
+            memset(q, 0, 2 * sizeof(q[0]));
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+                                      dst+4, stride);
+            memset(q + 16, 0, 2 * sizeof(q[0]));
+        }
+
+        if (eobs[2] > 1)
+            vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+        else if (eobs[2] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+                                      dst+8, stride);
+            memset(q + 32, 0, 2 * sizeof(q[0]));
+        }
+
+        if (eobs[3] > 1)
+            vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+        else if (eobs[3] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+                                      dst+12, stride);
+            memset(q + 48, 0, 2 * sizeof(q[0]));
+        }
+
+        q    += 64;
+        dst  += 4*stride;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
+            memset(q, 0, 2 * sizeof(q[0]));
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+                                      dstu+4, stride);
+            memset(q + 16, 0, 2 * sizeof(q[0]));
+        }
+
+        q    += 32;
+        dstu += 4*stride;
+        eobs += 2;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        if (eobs[0] > 1)
+            vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+        else if (eobs[0] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
+            memset(q, 0, 2 * sizeof(q[0]));
+        }
+
+        if (eobs[1] > 1)
+            vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+        else if (eobs[1] == 1)
+        {
+            vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+                                      dstv+4, stride);
+            memset(q + 16, 0, 2 * sizeof(q[0]));
+        }
+
+        q    += 32;
+        dstv += 4*stride;
+        eobs += 2;
+    }
+}

+ 89 - 0
thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c

@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+            (short *q, short *dq,
+             unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+            (short *q, short *dq,
+             unsigned char *dst, int dst_stride);
+
+void vp8_dequant_idct_add_y_block_sse2
+            (short *q, short *dq,
+             unsigned char *dst, int stride, char *eobs)
+{
+    int i;
+
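+    /* eobs is read two bytes at a time; each byte holds one block's eob
+     * count. An eob of 1 (DC only) has no bits under the 0xfefe mask, so
+     * the mask tests whether either block of the pair needs the full
+     * IDCT rather than the DC-only path. */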
+    for (i = 0; i < 4; i++)
+    {
+        if (((short *)(eobs))[0])
+        {
+            if (((short *)(eobs))[0] & 0xfefe)
+                vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+            else
+                vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+        }
+        if (((short *)(eobs))[1])
+        {
+            if (((short *)(eobs))[1] & 0xfefe)
+                vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+            else
+                vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+        }
+        q    += 64;
+        dst  += stride*4;
+        eobs += 4;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2
+            (short *q, short *dq,
+             unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    if (((short *)(eobs))[0])
+    {
+        if (((short *)(eobs))[0] & 0xfefe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+    }
+    q    += 32;
+    dstu += stride*4;
+
+    if (((short *)(eobs))[1])
+    {
+        if (((short *)(eobs))[1] & 0xfefe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+    }
+    q    += 32;
+
+    if (((short *)(eobs))[2])
+    {
+        if (((short *)(eobs))[2] & 0xfefe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+    }
+    q    += 32;
+    dstv += stride*4;
+
+    if (((short *)(eobs))[3])
+    {
+        if (((short *)(eobs))[3] & 0xfefe)
+            vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+        else
+            vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+    }
+}

+ 295 - 0
thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm

@@ -0,0 +1,295 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; *        1.   sqrt(2) * cos (pi/8)
+; *        2.   sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; *        x * a = x + x*(a-1)
+; * so
+; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit version is 35468, which is bigger
+; * than 32767, so in a signed 16 bit multiply it becomes a negative
+; * number:
+; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
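+
+; A worked example (editorial illustration, x = 1000):
+;        x_c1sqr2less1 = 0x4E7B = 20091 ~= (sqrt(2)*cos(pi/8) - 1) * 65536
+;        x + (x*20091 >> 16) = 1000 + 306 = 1306 ~= 1000 * 1.30656
+;        x_s1sqr2 = 0x8A8C = 35468 ~= sqrt(2)*sin(pi/8) * 65536; pmulhw
+;        treats it as the signed value 35468 - 65536 = -30068, so
+;        (x*(-30068) >> 16) + x = -459 + 1000 = 541 ~= 1000 * 0.54120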
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax,    arg(0)              ;input
+    mov         rsi,    arg(1)              ;pred
+
+    movq        mm0,    [rax   ]
+    movq        mm1,    [rax+ 8]
+    movq        mm2,    [rax+16]
+    movq        mm3,    [rax+24]
+
+%if 0
+    pxor        mm7,    mm7
+    movq        [rax],   mm7
+    movq        [rax+8], mm7
+    movq        [rax+16],mm7
+    movq        [rax+24],mm7
+%endif
+    movsxd      rax,    dword ptr arg(2)    ;pitch
+    mov         rdx,    arg(3)              ;dest
+    movsxd      rdi,    dword ptr arg(4)    ;stride
+
+
+    psubw       mm0,            mm2             ; b1= 0-2
+    paddw       mm2,            mm2             ;
+
+    movq        mm5,            mm1
+    paddw       mm2,            mm0             ; a1 =0+2
+
+    pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    movq        mm7,            mm3             ;
+    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       mm7,            mm5             ; c1
+
+    movq        mm5,            mm1
+    movq        mm4,            mm3
+
+    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+    paddw       mm5,            mm1
+
+    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+    paddw       mm3,            mm4
+
+    paddw       mm3,            mm5             ; d1
+    movq        mm6,            mm2             ; a1
+
+    movq        mm4,            mm0             ; b1
+    paddw       mm2,            mm3             ;0
+
+    paddw       mm4,            mm7             ;1
+    psubw       mm0,            mm7             ;2
+
+    psubw       mm6,            mm3             ;3
+
+    movq        mm1,            mm2             ; 03 02 01 00
+    movq        mm3,            mm4             ; 23 22 21 20
+
+    punpcklwd   mm1,            mm0             ; 11 01 10 00
+    punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+    punpcklwd   mm3,            mm6             ; 31 21 30 20
+    punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+    movq        mm0,            mm1             ; 11 01 10 00
+    movq        mm5,            mm2             ; 13 03 12 02
+
+    punpckldq   mm0,            mm3             ; 30 20 10 00
+    punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+    punpckldq   mm2,            mm4             ; 32 22 12 02
+    punpckhdq   mm5,            mm4             ; 33 23 13 03
+
+    movq        mm3,            mm5             ; 33 23 13 03
+
+    psubw       mm0,            mm2             ; b1= 0-2
+    paddw       mm2,            mm2             ;
+
+    movq        mm5,            mm1
+    paddw       mm2,            mm0             ; a1 =0+2
+
+    pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
+    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
+
+    movq        mm7,            mm3             ;
+    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
+
+    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
+    psubw       mm7,            mm5             ; c1
+
+    movq        mm5,            mm1
+    movq        mm4,            mm3
+
+    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
+    paddw       mm5,            mm1
+
+    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
+    paddw       mm3,            mm4
+
+    paddw       mm3,            mm5             ; d1
+    paddw       mm0,            [GLOBAL(fours)]
+
+    paddw       mm2,            [GLOBAL(fours)]
+    movq        mm6,            mm2             ; a1
+
+    movq        mm4,            mm0             ; b1
+    paddw       mm2,            mm3             ;0
+
+    paddw       mm4,            mm7             ;1
+    psubw       mm0,            mm7             ;2
+
+    psubw       mm6,            mm3             ;3
+    psraw       mm2,            3
+
+    psraw       mm0,            3
+    psraw       mm4,            3
+
+    psraw       mm6,            3
+
+    movq        mm1,            mm2             ; 03 02 01 00
+    movq        mm3,            mm4             ; 23 22 21 20
+
+    punpcklwd   mm1,            mm0             ; 11 01 10 00
+    punpckhwd   mm2,            mm0             ; 13 03 12 02
+
+    punpcklwd   mm3,            mm6             ; 31 21 30 20
+    punpckhwd   mm4,            mm6             ; 33 23 32 22
+
+    movq        mm0,            mm1             ; 11 01 10 00
+    movq        mm5,            mm2             ; 13 03 12 02
+
+    punpckldq   mm0,            mm3             ; 30 20 10 00
+    punpckhdq   mm1,            mm3             ; 31 21 11 01
+
+    punpckldq   mm2,            mm4             ; 32 22 12 02
+    punpckhdq   mm5,            mm4             ; 33 23 13 03
+
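+    ; reconstruct: add each residual row to the prediction row
+    ; (pred/pitch), saturate to bytes and store to dest/stride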
+    pxor        mm7,            mm7
+
+    movd        mm4,            [rsi]
+    punpcklbw   mm4,            mm7
+    paddsw      mm0,            mm4
+    packuswb    mm0,            mm7
+    movd        [rdx],          mm0
+
+    movd        mm4,            [rsi+rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm1,            mm4
+    packuswb    mm1,            mm7
+    movd        [rdx+rdi],      mm1
+
+    movd        mm4,            [rsi+2*rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm2,            mm4
+    packuswb    mm2,            mm7
+    movd        [rdx+rdi*2],    mm2
+
+    add         rdx,            rdi
+    add         rsi,            rax
+
+    movd        mm4,            [rsi+2*rax]
+    punpcklbw   mm4,            mm7
+    paddsw      mm5,            mm4
+    packuswb    mm5,            mm7
+    movd        [rdx+rdi*2],    mm5
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
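+; DC-only inverse transform: every pixel of the 4x4 prediction block is
+; offset by the rounded DC term (input_dc + 4) >> 3 and the saturated
+; result is written to dst_ptr.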
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    ; end prolog
+
+        movd        mm5,            arg(0) ;input_dc
+        mov         rax,            arg(1) ;pred_ptr
+        movsxd      rdx,            dword ptr arg(2) ;pred_stride
+
+        pxor        mm0,            mm0
+
+        paddw       mm5,            [GLOBAL(fours)]
+        lea         rcx,            [rdx + rdx*2]
+
+        psraw       mm5,            3
+
+        punpcklwd   mm5,            mm5
+
+        punpckldq   mm5,            mm5
+
+        movd        mm1,            [rax]
+        movd        mm2,            [rax+rdx]
+        movd        mm3,            [rax+2*rdx]
+        movd        mm4,            [rax+rcx]
+
+        mov         rax,            arg(3) ;dst_ptr -- destination
+        movsxd      rdx,            dword ptr arg(4) ;dst_stride
+
+        punpcklbw   mm1,            mm0
+        paddsw      mm1,            mm5
+        packuswb    mm1,            mm0              ; pack and unpack to saturate
+        lea         rcx,            [rdx + rdx*2]
+
+        punpcklbw   mm2,            mm0
+        paddsw      mm2,            mm5
+        packuswb    mm2,            mm0              ; pack and unpack to saturate
+
+        punpcklbw   mm3,            mm0
+        paddsw      mm3,            mm5
+        packuswb    mm3,            mm0              ; pack and unpack to saturate
+
+        punpcklbw   mm4,            mm0
+        paddsw      mm4,            mm5
+        packuswb    mm4,            mm0              ; pack and unpack to saturate
+
+        movd        [rax],          mm1
+        movd        [rax+rdx],      mm2
+        movd        [rax+2*rdx],    mm3
+        movd        [rax+rcx],      mm4
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
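+; iDCT multipliers in Q16 format for use with pmulhw:
+;   0x8A8C = 35468 ~ 65536 * sqrt(2) * sin(pi/8)
+;   0x4E7B = 20091 ~ 65536 * (sqrt(2) * cos(pi/8) - 1)
+; fours is the +4 rounding bias applied before the >>3 shift.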
+align 16
+x_s1sqr2:
+    times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 4 dw 0x4E7B
+align 16
+fours:
+    times 4 dw 0x0004

+ 708 - 0
thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm

@@ -0,0 +1,708 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *dst  - 2
+;   int dst_stride      - 3
+; )
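+; DC-only path for two horizontally adjacent 4x4 blocks: each block's DC
+; coefficient (qcoeff[0] and qcoeff[32]; blocks are 16 shorts apart) is
+; multiplied by dequant[0], rounded with (x + 4) >> 3, splatted across
+; its half of the register and added to four 8-pixel rows of dst.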
+
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rdx,            arg(1) ; dequant
+        mov         rax,            arg(0) ; qcoeff
+
+        movd        xmm4,           [rax]
+        movd        xmm5,           [rdx]
+
+        pinsrw      xmm4,           [rax+32],   4
+        pinsrw      xmm5,           [rdx],      4
+
+        pmullw      xmm4,           xmm5
+
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
+
+    ; clear coeffs
+        movd        [rax],          xmm5
+        movd        [rax+32],       xmm5
+    ; (pshuflw/pshufhw below splat each DC; pshufb could do it in one op on SSSE3)
+        mov         rax,            arg(2) ; dst
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+        pshuflw     xmm4,           xmm4,       00000000b
+        pshufhw     xmm4,           xmm4,       00000000b
+
+        lea         rcx,            [rdx + rdx*2]
+        paddw       xmm4,           [GLOBAL(fours)]
+
+        psraw       xmm4,           3
+
+        movq        xmm0,           [rax]
+        movq        xmm1,           [rax+rdx]
+        movq        xmm2,           [rax+2*rdx]
+        movq        xmm3,           [rax+rcx]
+
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rax],          xmm0
+        movq        [rax + rdx],    xmm1
+
+        lea         rax,            [rax + 2*rdx]
+
+        movq        [rax],          xmm2
+        movq        [rax + rdx],    xmm3
+
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *dst  - 2
+;   int dst_stride      - 3
+; )
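+; Full 4x4 inverse DCT for two horizontally adjacent blocks.  The rows
+; are repacked so each xmm register holds the corresponding row of both
+; blocks side by side, letting the two transforms run in parallel.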
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; load both blocks' coefficients and dequantize them;
+    ; the qcoeff buffer is cleared once the loads are done
+        mov         rax,            arg(0) ; qcoeff
+        mov         rdx,            arg(1)  ; dequant
+        mov         rdi,            arg(2) ; dst
+
+
+    ; Zero out xmm7, for use unpacking
+        pxor        xmm7,           xmm7
+
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movq        xmm4,           [rdi]
+        movq        xmm5,           [rdi+rdx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rdi+2*rdx]
+        movq        xmm5,           [rdi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+        movq        [rdi + rdx*2],  xmm2
+        movq        [rdi + rcx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *dst  - 2
+;   int dst_stride      - 3
+;   short *dc           - 4
+; )
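+; Same DC-only reconstruction as vp8_idct_dequant_0_2x_sse2, except the
+; two DC values arrive already dequantized through the dc pointer (the
+; output of the inverse Walsh-Hadamard transform of the Y2 block).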
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+    ; special case when 2 blocks have 0 or 1 coeffs
+    ; the DC terms come from the separate dc array, so qcoeff is not read
+        mov         rax,            arg(0) ; qcoeff
+
+        mov         rdi,            arg(2) ; dst
+        mov         rdx,            arg(4) ; dc
+
+    ; Zero out xmm5, for use unpacking
+        pxor        xmm5,           xmm5
+
+    ; load the two 16-bit DC values as one doubleword
+        movd        xmm4,           [rdx]
+
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+        lea         rcx, [rdx + rdx*2]
+    ; Load up predict blocks
+        movq        xmm0,           [rdi]
+        movq        xmm1,           [rdi+rdx*1]
+        movq        xmm2,           [rdi+rdx*2]
+        movq        xmm3,           [rdi+rcx]
+
+    ; Duplicate and expand dc across
+        punpcklwd   xmm4,           xmm4
+        punpckldq   xmm4,           xmm4
+
+    ; Rounding to dequant and downshift
+        paddw       xmm4,           [GLOBAL(fours)]
+        psraw       xmm4,           3
+
+    ; Predict buffer needs to be expanded from bytes to words
+        punpcklbw   xmm0,           xmm5
+        punpcklbw   xmm1,           xmm5
+        punpcklbw   xmm2,           xmm5
+        punpcklbw   xmm3,           xmm5
+
+    ; Add to predict buffer
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm4
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm4
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm5
+        packuswb    xmm1,           xmm5
+        packuswb    xmm2,           xmm5
+        packuswb    xmm3,           xmm5
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+        movq        [rdi + rdx*2],  xmm2
+        movq        [rdi + rcx],    xmm3
+
+    ; begin epilog
+    pop         rdi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+;   short *qcoeff       - 0
+;   short *dequant      - 1
+;   unsigned char *dst  - 2
+;   int dst_stride      - 3
+;   short *dc           - 4
+; )
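+; Full 2x iDCT as above, except the two DC coefficients are taken from
+; the dc pointer (pinsrw into words 0 and 4 of the row-0 register)
+; instead of from qcoeff.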
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rdi
+    ; end prolog
+
+    ; load both blocks' coefficients and dequantize them; the DC
+    ; terms are overwritten from the dc array further down
+        mov         rax,            arg(0) ; qcoeff
+        mov         rdx,            arg(1)  ; dequant
+
+        mov         rdi,            arg(2) ; dst
+
+    ; Zero out xmm7, for use unpacking
+        pxor        xmm7,           xmm7
+
+
+    ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ;   to spit out sensible data
+        movdqa      xmm0,           [rax]
+        movdqa      xmm2,           [rax+16]
+        movdqa      xmm1,           [rax+32]
+        movdqa      xmm3,           [rax+48]
+
+    ; Clear out coeffs
+        movdqa      [rax],          xmm7
+        movdqa      [rax+16],       xmm7
+        movdqa      [rax+32],       xmm7
+        movdqa      [rax+48],       xmm7
+
+    ; dequantize qcoeff buffer
+        pmullw      xmm0,           [rdx]
+        pmullw      xmm2,           [rdx+16]
+        pmullw      xmm1,           [rdx]
+        pmullw      xmm3,           [rdx+16]
+
+    ; DC component
+        mov         rdx,            arg(4)
+
+    ; repack so block 0 row x and block 1 row x are together
+        movdqa      xmm4,           xmm0
+        punpckldq   xmm0,           xmm1
+        punpckhdq   xmm4,           xmm1
+
+        pshufd      xmm0,           xmm0,       11011000b
+        pshufd      xmm1,           xmm4,       11011000b
+
+        movdqa      xmm4,           xmm2
+        punpckldq   xmm2,           xmm3
+        punpckhdq   xmm4,           xmm3
+
+        pshufd      xmm2,           xmm2,       11011000b
+        pshufd      xmm3,           xmm4,       11011000b
+
+    ; insert DC component
+        pinsrw      xmm0,           [rdx],      0
+        pinsrw      xmm0,           [rdx+2],    4
+
+    ; first pass
+        psubw       xmm0,           xmm2        ; b1 = 0-2
+        paddw       xmm2,           xmm2        ;
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0        ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5        ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5        ; d1
+        movdqa      xmm6,           xmm2        ; a1
+
+        movdqa      xmm4,           xmm0        ; b1
+        paddw       xmm2,           xmm3        ;0
+
+        paddw       xmm4,           xmm7        ;1
+        psubw       xmm0,           xmm7        ;2
+
+        psubw       xmm6,           xmm3        ;3
+
+    ; transpose for the second pass
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+    ; second pass
+        psubw       xmm0,           xmm2            ; b1 = 0-2
+        paddw       xmm2,           xmm2
+
+        movdqa      xmm5,           xmm1
+        paddw       xmm2,           xmm0            ; a1 = 0+2
+
+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
+
+        movdqa      xmm7,           xmm3
+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
+
+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
+        psubw       xmm7,           xmm5            ; c1
+
+        movdqa      xmm5,           xmm1
+        movdqa      xmm4,           xmm3
+
+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
+        paddw       xmm5,           xmm1
+
+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
+        paddw       xmm3,           xmm4
+
+        paddw       xmm3,           xmm5            ; d1
+        paddw       xmm0,           [GLOBAL(fours)]
+
+        paddw       xmm2,           [GLOBAL(fours)]
+        movdqa      xmm6,           xmm2            ; a1
+
+        movdqa      xmm4,           xmm0            ; b1
+        paddw       xmm2,           xmm3            ;0
+
+        paddw       xmm4,           xmm7            ;1
+        psubw       xmm0,           xmm7            ;2
+
+        psubw       xmm6,           xmm3            ;3
+        psraw       xmm2,           3
+
+        psraw       xmm0,           3
+        psraw       xmm4,           3
+
+        psraw       xmm6,           3
+
+    ; transpose to save
+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
+
+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
+
+
+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
+
+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
+
+
+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
+
+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
+
+        pshufd      xmm0,           xmm2,       11011000b
+        pshufd      xmm2,           xmm1,       11011000b
+
+        pshufd      xmm1,           xmm5,       11011000b
+        pshufd      xmm3,           xmm7,       11011000b
+
+        pxor        xmm7,           xmm7
+
+    ; Load up predict blocks
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+        movq        xmm4,           [rdi]
+        movq        xmm5,           [rdi+rdx]
+        lea         rcx,            [rdx + rdx*2]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm0,           xmm4
+        paddw       xmm1,           xmm5
+
+        movq        xmm4,           [rdi+rdx*2]
+        movq        xmm5,           [rdi+rcx]
+
+        punpcklbw   xmm4,           xmm7
+        punpcklbw   xmm5,           xmm7
+
+        paddw       xmm2,           xmm4
+        paddw       xmm3,           xmm5
+
+.finish:
+
+    ; pack up before storing
+        packuswb    xmm0,           xmm7
+        packuswb    xmm1,           xmm7
+        packuswb    xmm2,           xmm7
+        packuswb    xmm3,           xmm7
+
+    ; Load destination stride before writing out,
+    ;   doesn't need to persist
+        movsxd      rdx,            dword ptr arg(3) ; dst_stride
+
+    ; store blocks back out
+        movq        [rdi],          xmm0
+        movq        [rdi + rdx],    xmm1
+
+        lea         rdi,            [rdi + 2*rdx]
+
+        movq        [rdi],          xmm2
+        movq        [rdi + rdx],    xmm3
+
+
+    ; begin epilog
+    pop         rdi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
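+; Same Q16 iDCT constants and rounding bias as idctllm_mmx.asm, widened
+; to eight words for the 128-bit registers.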
+align 16
+fours:
+    times 8 dw 0x0004
+align 16
+x_s1sqr2:
+    times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+    times 8 dw 0x4E7B

+ 140 - 0
thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm

@@ -0,0 +1,140 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
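+; Inverse 4x4 Walsh-Hadamard transform of the Y2 (DC) block.  Each of
+; the 16 results is written to the DC position of one coefficient block,
+; hence the stores at 32-byte (16-short) intervals in the output.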
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    ; end prolog
+
+    mov         rdx, arg(0)
+    mov         rax, 30003h
+
+    movq        mm0, [rdx + 0]    ;ip[0]
+    movq        mm1, [rdx + 8]    ;ip[4]
+    movq        mm7, rax
+
+    movq        mm2, [rdx + 16]   ;ip[8]
+    movq        mm3, [rdx + 24]   ;ip[12]
+    punpcklwd   mm7, mm7          ;0003000300030003h
+    mov         rdx, arg(1)
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+
+    paddw       mm4, mm3          ;ip[0] + ip[12] aka a1
+    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq        mm6, mm4          ;temp a1
+    paddw       mm4, mm5          ;a1 + b1
+    psubw       mm6, mm5          ;a1 - b1
+
+    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw       mm1, mm2          ;ip[4] - ip[8] aka c1
+
+    movq        mm5, mm0          ;temp d1
+    paddw       mm0, mm1          ;d1 + c1
+    psubw       mm5, mm1          ;d1 - c1
+
+    ; 03 02 01 00
+    ; 13 12 11 10
+    ; 23 22 21 20
+    ; 33 32 31 30
+
+    movq        mm3, mm4          ; 03 02 01 00
+    punpcklwd   mm4, mm0          ; 11 01 10 00
+    punpckhwd   mm3, mm0          ; 13 03 12 02
+
+    movq        mm1, mm6          ; 23 22 21 20
+    punpcklwd   mm6, mm5          ; 31 21 30 20
+    punpckhwd   mm1, mm5          ; 33 23 32 22
+
+    movq        mm0, mm4          ; 11 01 10 00
+    movq        mm2, mm3          ; 13 03 12 02
+
+    punpckldq   mm0, mm6          ; 30 20 10 00 aka ip[0]
+    punpckhdq   mm4, mm6          ; 31 21 11 01 aka ip[4]
+
+    punpckldq   mm2, mm1          ; 32 22 12 02 aka ip[8]
+    punpckhdq   mm3, mm1          ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+    movq        mm1, mm0
+    movq        mm5, mm4
+    paddw       mm1, mm3          ;ip[0] + ip[12] aka a1
+    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
+
+    movq        mm6, mm1          ;temp a1
+    paddw       mm1, mm5          ;a1 + b1
+    psubw       mm6, mm5          ;a1 - b1
+    paddw       mm1, mm7
+    paddw       mm6, mm7
+    psraw       mm1, 3
+    psraw       mm6, 3
+
+    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
+    psubw       mm4, mm2          ;ip[4] - ip[8] aka c1
+
+    movq        mm5, mm0          ;temp d1
+    paddw       mm0, mm4          ;d1 + c1
+    psubw       mm5, mm4          ;d1 - c1
+    paddw       mm0, mm7
+    paddw       mm5, mm7
+    psraw       mm0, 3
+    psraw       mm5, 3
+;~~~~~~~~~~~~~~~~~~~~~
+
+    movd        eax, mm1
+    movd        ecx, mm0
+    psrlq       mm0, 32
+    psrlq       mm1, 32
+    mov         word ptr[rdx+32*0], ax
+    mov         word ptr[rdx+32*1], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*4], ax
+    mov         word ptr[rdx+32*5], cx
+    movd        eax, mm1
+    movd        ecx, mm0
+    mov         word ptr[rdx+32*8], ax
+    mov         word ptr[rdx+32*9], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*12], ax
+    mov         word ptr[rdx+32*13], cx
+
+    movd        eax, mm6
+    movd        ecx, mm5
+    psrlq       mm5, 32
+    psrlq       mm6, 32
+    mov         word ptr[rdx+32*2], ax
+    mov         word ptr[rdx+32*3], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*6], ax
+    mov         word ptr[rdx+32*7], cx
+    movd        eax, mm6
+    movd        ecx, mm5
+    mov         word ptr[rdx+32*10], ax
+    mov         word ptr[rdx+32*11], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*14], ax
+    mov         word ptr[rdx+32*15], cx
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+

+ 121 - 0
thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm

@@ -0,0 +1,121 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
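+; SSE2 version of the inverse Walsh-Hadamard transform above; each xmm
+; register holds two rows, so every 1-D pass is a shuffle plus one
+; add/subtract butterfly pair.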
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    ; end prolog
+
+    mov         rcx, arg(0)
+    mov         rdx, arg(1)
+    mov         rax, 30003h
+
+    movdqa      xmm0, [rcx + 0]     ;ip[4] ip[0]
+    movdqa      xmm1, [rcx + 16]    ;ip[12] ip[8]
+
+
+    pshufd      xmm2, xmm1, 4eh     ;ip[8] ip[12]
+    movdqa      xmm3, xmm0          ;ip[4] ip[0]
+
+    paddw       xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw       xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa      xmm4, xmm0
+    punpcklqdq  xmm0, xmm3          ;d1 a1
+    punpckhqdq  xmm4, xmm3          ;c1 b1
+
+    movdqa      xmm1, xmm4          ;c1 b1
+    paddw       xmm4, xmm0          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw       xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa      xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd   xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd   xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa      xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd   xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd   xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    movd        xmm0, eax
+    pshufd      xmm2, xmm1, 4eh     ;ip[8] ip[12]
+    movdqa      xmm3, xmm4          ;ip[4] ip[0]
+
+    pshufd      xmm0, xmm0, 0       ;03 03 03 03 03 03 03 03
+
+    paddw       xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw       xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa      xmm5, xmm4
+    punpcklqdq  xmm4, xmm3          ;d1 a1
+    punpckhqdq  xmm5, xmm3          ;c1 b1
+
+    movdqa      xmm1, xmm5          ;c1 b1
+    paddw       xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw       xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+    paddw       xmm5, xmm0
+    paddw       xmm4, xmm0
+    psraw       xmm5, 3
+    psraw       xmm4, 3
+
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    psrldq      xmm5, 4
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*0], ax
+    mov         word ptr[rdx+32*2], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*4], ax
+    mov         word ptr[rdx+32*6], cx
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    psrldq      xmm5, 4
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*8], ax
+    mov         word ptr[rdx+32*10], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*12], ax
+    mov         word ptr[rdx+32*14], cx
+
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    psrldq      xmm5, 4
+    psrldq      xmm4, 4
+    mov         word ptr[rdx+32*1], ax
+    mov         word ptr[rdx+32*3], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*5], ax
+    mov         word ptr[rdx+32*7], cx
+    movd        eax, xmm5
+    movd        ecx, xmm4
+    mov         word ptr[rdx+32*9], ax
+    mov         word ptr[rdx+32*11], cx
+    shr         eax, 16
+    shr         ecx, 16
+    mov         word ptr[rdx+32*13], ax
+    mov         word ptr[rdx+32*15], cx
+
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+ 815 - 0
thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm

@@ -0,0 +1,815 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+        ; %1 value not preserved
+        ; %2 value preserved
+        ; output in %1
+        movdqa      scratch1, %2            ; v2
+
+        psubusb     scratch1, %1            ; v2 - v1
+        psubusb     %1, %2                  ; v1 - v2
+        por         %1, scratch1            ; abs(v2 - v1)
+%endmacro
+
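+; Compute the filter masks for 16 pixels at once.  %1 returns the filter
+; mask: 0xFF where every neighbouring-pixel delta is within [limit] and
+; 2*|p0-q0| + |p1-q1|/2 is within [blimit].  %5 returns ~hev: 0xFF where
+; |p1-p0| and |q1-q0| are both within [thresh].  The optional 9th
+; argument supplies |p1-p0| carried over from the previous edge, where
+; it was produced as that edge's |q2-q3|.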
+%macro LF_FILTER_HEV_MASK 8-9
+
+        LF_ABS      %1, %2                  ; abs(p3 - p2)
+        LF_ABS      %2, %3                  ; abs(p2 - p1)
+        pmaxub      %1, %2                  ; accumulate mask
+%if %0 == 8
+        movdqa      scratch2, %3            ; save p1
+        LF_ABS      scratch2, %4            ; abs(p1 - p0)
+%endif
+        LF_ABS      %4, %5                  ; abs(p0 - q0)
+        LF_ABS      %5, %6                  ; abs(q0 - q1)
+%if %0 == 8
+        pmaxub      %5, scratch2            ; accumulate hev
+%else
+        pmaxub      %5, %9
+%endif
+        pmaxub      %1, %5                  ; accumulate mask
+
+        LF_ABS      %3, %6                  ; abs(p1 - q1)
+        LF_ABS      %6, %7                  ; abs(q1 - q2)
+        pmaxub      %1, %6                  ; accumulate mask
+        LF_ABS      %7, %8                  ; abs(q2 - q3)
+        pmaxub      %1, %7                  ; accumulate mask
+
+        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
+        pand        %3, [GLOBAL(tfe)]
+        psrlw       %3, 1                   ; abs(p1 - q1) / 2
+        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+        psubusb     %1, [limit]
+        psubusb     %4, [blimit]
+        por         %1, %4
+        pcmpeqb     %1, zero                ; mask
+
+        psubusb     %5, [thresh]
+        pcmpeqb     %5, zero                ; ~hev
+%endmacro
+
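+; Apply the normal-edge filter to p1..q1 (%1-%4) under mask %5 and ~hev
+; %6.  The base filter is clamp((ps1 - qs1, hev lanes only) + 3*(qs0 -
+; ps0)); q0 is adjusted by clamp(filter + 4) >> 3, p0 by clamp(filter +
+; 3) >> 3, and the outer taps p1/q1 by (Filter1 + 1) >> 1 in the lanes
+; where hev is not set.  The pcmpgtb/psrlw/pand/por sequences emulate a
+; per-byte arithmetic shift.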
+%macro LF_FILTER 6
+        ; %1-%4: p1-q1
+        ; %5: mask
+        ; %6: hev
+
+        movdqa      scratch2, %6            ; save hev
+
+        pxor        %1, [GLOBAL(t80)]       ; ps1
+        pxor        %4, [GLOBAL(t80)]       ; qs1
+        movdqa      scratch1, %1
+        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
+        pandn       scratch2, scratch1      ; vp8_filter &= hev
+
+        pxor        %2, [GLOBAL(t80)]       ; ps0
+        pxor        %3, [GLOBAL(t80)]       ; qs0
+        movdqa      scratch1, %3
+        psubsb      scratch1, %2            ; qs0 - ps0
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+        pand        %5, scratch2            ; &= mask
+
+        movdqa      scratch2, %5
+        paddsb      %5, [GLOBAL(t4)]        ; Filter1
+        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
+
+        ; Filter1 >> 3
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5
+        psrlw       %5, 3
+        pand        scratch1, [GLOBAL(te0)]
+        pand        %5, [GLOBAL(t1f)]
+        por         %5, scratch1
+
+        psubsb      %3, %5                  ; qs0 - Filter1
+        pxor        %3, [GLOBAL(t80)]
+
+        ; Filter2 >> 3
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, scratch2
+        psrlw       scratch2, 3
+        pand        scratch1, [GLOBAL(te0)]
+        pand        scratch2, [GLOBAL(t1f)]
+        por         scratch2, scratch1
+
+        paddsb      %2, scratch2            ; ps0 + Filter2
+        pxor        %2, [GLOBAL(t80)]
+
+        ; outer tap adjustments
+        paddsb      %5, [GLOBAL(t1)]
+        movdqa      scratch1, zero
+        pcmpgtb     scratch1, %5
+        psrlw       %5, 1
+        pand        scratch1, [GLOBAL(t80)]
+        pand        %5, [GLOBAL(t7f)]
+        por         %5, scratch1
+        pand        %5, %6                  ; vp8_filter &= ~hev
+
+        psubsb      %4, %5                  ; qs1 - vp8_filter
+        pxor        %4, [GLOBAL(t80)]
+
+        paddsb      %1, %5                  ; ps1 + vp8_filter
+        pxor        %1, [GLOBAL(t80)]
+%endmacro
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
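+; Filter the three horizontal block edges inside a 16x16 luma
+; macroblock (at rows 4, 8 and 12), 16 pixels per edge.  Each edge is
+; one LF_FILTER_HEV_MASK / LF_FILTER pair; the |q2-q3| by-product of
+; each mask computation is reused as |p1-p0| for the next edge.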
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 11
+    push    r12
+    push    r13
+    mov     thresh, arg(4)
+%else
+    %define src      rdi ; src_ptr
+    %define stride   rsi ; src_pixel_step
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define i0       [src]
+    %define i1       [spp]
+    %define i2       [src + 2 * stride]
+    %define i3       [spp + 2 * stride]
+    %define i4       [src + 4 * stride]
+    %define i5       [spp + 4 * stride]
+    %define i6       [src + 2 * stride3]
+    %define i7       [spp + 2 * stride3]
+    %define i8       [src + 8 * stride]
+    %define i9       [spp + 8 * stride]
+    %define i10      [src + 2 * stride5]
+    %define i11      [spp + 2 * stride5]
+    %define i12      [src + 4 * stride3]
+    %define i13      [spp + 4 * stride3]
+    %define i14      [src + 2 * stride7]
+    %define i15      [spp + 2 * stride7]
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
+    pxor        zero, zero
+
+        ; load the first set into registers
+        movdqa       xmm0, i0
+        movdqa       xmm1, i1
+        movdqa       xmm2, i2
+        movdqa       xmm3, i3
+        movdqa       xmm4, i4
+        movdqa       xmm8, i5
+        movdqa       xmm9, i6   ; q2, will contain abs(p1-p0)
+        movdqa       xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+        movdqa       xmm1, i2
+        movdqa       xmm2, i3
+        movdqa       xmm3, i4
+        movdqa       xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm3
+        movdqa       i5, xmm8
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm4, i8
+        movdqa       xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm4
+        movdqa       i9, xmm8
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm3, i13
+        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm3, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm3
+        movdqa       i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
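+; Vertical counterpart: filters the three vertical edges (columns 4, 8
+; and 12) of the 16x16 luma block.  Column pixels are not contiguous in
+; memory, so the block is first transposed into a 256-byte stack buffer,
+; filtered as rows, then transposed back for the write-out.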
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push    rbp
+    mov     rbp, rsp
+    SAVE_XMM 15
+    push    r12
+    push    r13
+    mov     thresh, arg(4)
+%else
+    %define src      rdi
+    %define stride   rsi
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    %define s0       [src]
+    %define s1       [spp]
+    %define s2       [src + 2 * stride]
+    %define s3       [spp + 2 * stride]
+    %define s4       [src + 4 * stride]
+    %define s5       [spp + 4 * stride]
+    %define s6       [src + 2 * stride3]
+    %define s7       [spp + 2 * stride3]
+    %define s8       [src + 8 * stride]
+    %define s9       [spp + 8 * stride]
+    %define s10      [src + 2 * stride5]
+    %define s11      [spp + 2 * stride5]
+    %define s12      [src + 4 * stride3]
+    %define s13      [spp + 4 * stride3]
+    %define s14      [src + 2 * stride7]
+    %define s15      [spp + 2 * stride7]
+
+    %define i0       [rsp]
+    %define i1       [rsp + 16]
+    %define i2       [rsp + 32]
+    %define i3       [rsp + 48]
+    %define i4       [rsp + 64]
+    %define i5       [rsp + 80]
+    %define i6       [rsp + 96]
+    %define i7       [rsp + 112]
+    %define i8       [rsp + 128]
+    %define i9       [rsp + 144]
+    %define i10      [rsp + 160]
+    %define i11      [rsp + 176]
+    %define i12      [rsp + 192]
+    %define i13      [rsp + 208]
+    %define i14      [rsp + 224]
+    %define i15      [rsp + 240]
+
+    ALIGN_STACK 16, rax
+
+    ; reserve stack space
+    %define      temp_storage  0 ; size is 256 (16*16)
+    %define      stack_size 256
+    sub          rsp, stack_size
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
+
+        ; 8-f
+        movdqa      xmm0, s8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s9                ; 80 90
+        punpckhbw   xmm1, s9                ; 88 98
+
+        movdqa      xmm2, s10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s11 ; a0 b0
+        punpckhbw   xmm3, s11 ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s13 ; c0 d0
+        punpckhbw   xmm5, s13 ; c8 d8
+
+        movdqa      xmm6, s14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s15 ; e0 f0
+        punpckhbw   xmm7, s15 ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 15 registers ...
+        movdqa      i0, xmm0
+        movdqa      i1, xmm7
+        movdqa      i2, xmm4
+        movdqa      i3, xmm3
+        movdqa      i4, xmm1
+        movdqa      i5, xmm8
+        movdqa      i6, xmm2
+        movdqa      i7, xmm5
+
+        ; 0-7
+        movdqa      xmm0, s0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, s1 ; 00 10
+        punpckhbw   xmm1, s1 ; 08 18
+
+        movdqa      xmm2, s2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, s3 ; 20 30
+        punpckhbw   xmm3, s3 ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, s4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, s5 ; 40 50
+        punpckhbw   xmm5, s5 ; 48 58
+
+        movdqa      xmm6, s6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, s7   ; 60 70
+        punpckhbw   xmm7, s7   ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i0
+        punpckhqdq  xmm6, i0
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i1
+        punpckhqdq  xmm9, i1
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i2
+        punpckhqdq  xmm10, i2
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i3
+        punpckhqdq  xmm11, i3
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i4
+        punpckhqdq  xmm12, i4
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i5
+        punpckhqdq  xmm13, i5
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i6
+        punpckhqdq  xmm14, i6
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i7
+        punpckhqdq  xmm15, i7
+
+        movdqa      i0, xmm0
+        movdqa      i1, xmm6
+        movdqa      i2, xmm7
+        movdqa      i3, xmm9
+        movdqa      i4, xmm4
+        movdqa      i5, xmm10
+        movdqa      i6, xmm3
+        movdqa      i7, xmm11
+        movdqa      i8, xmm1
+        movdqa      i9, xmm12
+        movdqa      i10, xmm8
+        movdqa      i11, xmm13
+        movdqa      i12, xmm2
+        movdqa      i13, xmm14
+        movdqa      i14, xmm5
+        movdqa      i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+        movdqa      xmm12, xmm6
+        movdqa      xmm13, xmm7
+
+        pxor        zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+        movdqa       xmm1, i2
+        movdqa       xmm2, i3
+        movdqa       xmm8, i4
+        movdqa       xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+        movdqa       i2, xmm1
+        movdqa       i3, xmm2
+
+; second set
+        movdqa       i4, xmm8
+        movdqa       i5, xmm9
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm2, i8
+        movdqa       xmm4, i9
+        movdqa       xmm10, i10   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+        movdqa       xmm0, i6
+        movdqa       xmm1, i7
+        movdqa       xmm3, i8
+        movdqa       xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+        movdqa       i6, xmm0
+        movdqa       i7, xmm1
+
+; last set
+        movdqa       i8, xmm3
+        movdqa       i9, xmm4
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm2, i12
+        movdqa       xmm8, i13
+        movdqa       xmm9, i14   ; q2, will contain abs(p1-p0)
+        movdqa       xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+        movdqa       xmm0, i10
+        movdqa       xmm1, i11
+        movdqa       xmm4, i12
+        movdqa       xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+        movdqa       i10, xmm0
+        movdqa       i11, xmm1
+        movdqa       i12, xmm4
+        movdqa       i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+        ; 8-f
+        movdqa      xmm0, i8
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i9                ; 80 90
+        punpckhbw   xmm1, i9                ; 88 98
+
+        movdqa      xmm2, i10
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i11               ; a0 b0
+        punpckhbw   xmm3, i11               ; a8 b8
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i12
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i13               ; c0 d0
+        punpckhbw   xmm5, i13               ; c8 d8
+
+        movdqa      xmm6, i14
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i15               ; e0 f0
+        punpckhbw   xmm7, i15               ; e8 f8
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+        punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+        ; pull the third and fourth sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+        ; save the calculations. we only have 15 registers ...
+        movdqa      i8, xmm0
+        movdqa      i9, xmm7
+        movdqa      i10, xmm4
+        movdqa      i11, xmm3
+        movdqa      i12, xmm1
+        movdqa      i13, xmm8
+        movdqa      i14, xmm2
+        movdqa      i15, xmm5
+
+        ; 0-7
+        movdqa      xmm0, i0
+        movdqa      xmm1, xmm0
+        punpcklbw   xmm0, i1                ; 00 10
+        punpckhbw   xmm1, i1                ; 08 18
+
+        movdqa      xmm2, i2
+        movdqa      xmm3, xmm2
+        punpcklbw   xmm2, i3                ; 20 30
+        punpckhbw   xmm3, i3                ; 28 38
+
+        movdqa      xmm4, xmm0
+        punpcklwd   xmm0, xmm2              ; 00 10 20 30
+        punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+        movdqa      xmm2, xmm1
+        punpcklwd   xmm1, xmm3              ; 08 18 28 38
+        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+        ; using xmm[0124]
+        ; work on next 4 rows
+
+        movdqa      xmm3, i4
+        movdqa      xmm5, xmm3
+        punpcklbw   xmm3, i5                ; 40 50
+        punpckhbw   xmm5, i5                ; 48 58
+
+        movdqa      xmm6, i6
+        movdqa      xmm7, xmm6
+        punpcklbw   xmm6, i7                ; 60 70
+        punpckhbw   xmm7, i7                ; 68 78
+
+        movdqa      xmm8, xmm3
+        punpcklwd   xmm3, xmm6              ; 40 50 60 70
+        punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+        movdqa      xmm6, xmm5
+        punpcklwd   xmm5, xmm7              ; 48 58 68 78
+        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+        ; pull the first two sets together
+
+        movdqa      xmm7, xmm0
+        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+        movdqa      xmm3, xmm4
+        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+        movdqa      xmm8, xmm1
+        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+        movdqa      xmm5, xmm2
+        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+        ; final combination
+
+        movdqa      xmm6, xmm0
+        punpcklqdq  xmm0, i8
+        punpckhqdq  xmm6, i8
+
+        movdqa      xmm9, xmm7
+        punpcklqdq  xmm7, i9
+        punpckhqdq  xmm9, i9
+
+        movdqa      xmm10, xmm4
+        punpcklqdq  xmm4, i10
+        punpckhqdq  xmm10, i10
+
+        movdqa      xmm11, xmm3
+        punpcklqdq  xmm3, i11
+        punpckhqdq  xmm11, i11
+
+        movdqa      xmm12, xmm1
+        punpcklqdq  xmm1, i12
+        punpckhqdq  xmm12, i12
+
+        movdqa      xmm13, xmm8
+        punpcklqdq  xmm8, i13
+        punpckhqdq  xmm13, i13
+
+        movdqa      xmm14, xmm2
+        punpcklqdq  xmm2, i14
+        punpckhqdq  xmm14, i14
+
+        movdqa      xmm15, xmm5
+        punpcklqdq  xmm5, i15
+        punpckhqdq  xmm15, i15
+
+        movdqa      s0, xmm0
+        movdqa      s1, xmm6
+        movdqa      s2, xmm7
+        movdqa      s3, xmm9
+        movdqa      s4, xmm4
+        movdqa      s5, xmm10
+        movdqa      s6, xmm3
+        movdqa      s7, xmm11
+        movdqa      s8, xmm1
+        movdqa      s9, xmm12
+        movdqa      s10, xmm8
+        movdqa      s11, xmm13
+        movdqa      s12, xmm2
+        movdqa      s13, xmm14
+        movdqa      s14, xmm5
+        movdqa      s15, xmm15
+
+    ; free stack space
+    add          rsp, stack_size
+
+    ; un-ALIGN_STACK
+    pop          rsp
+
+%if LIBVPX_YASM_WIN64
+    pop    r13
+    pop    r12
+    RESTORE_XMM
+    pop    rbp
+%endif
+
+    ret
+
+SECTION_RODATA
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t7f:
+    times 16 db 0x7f
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t1f:
+    times 16 db 0x1f
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
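+; byte-constant tables for the filter macros above: t80 is the 0x80 bias for
+; unsigned<->signed lane conversion, tfe clears each byte's lsb ahead of a
+; word-wide halving shift, t4/t3 are the +4/+3 filter roundings, and
+; te0/t7f/t1f appear in the per-byte shift emulations earlier in this file,
+; masking off bits that a word-granularity shift would smear across lanes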

+ 1640 - 0
thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm

@@ -0,0 +1,1640 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
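+; scratch layout: two temporaries (_t0, _t1) followed by the eight pixel
+; rows p3..q3 -- ten 16-byte slots in all, hence lf_var_size = 10 * 16 = 160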
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
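+; A scalar sketch of the difference (ours, for illustration): the psubusb
+; style ORs one saturated "excess over limit" per pixel pair, while pmaxub
+; folds every absolute difference into one running maximum so the limit is
+; subtracted just once at the end:
+;
+;   unsigned char mask_excess(const unsigned char d[6], unsigned char lim) {
+;       unsigned char m = 0;                        /* running pmaxub      */
+;       for (int i = 0; i < 6; i++)
+;           if (d[i] > m) m = d[i];                 /* d[] = the abs diffs */
+;       return m > lim ? (unsigned char)(m - lim) : 0;  /* one psubusb    */
+;   }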
+
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+        movdqa      xmm2,                   [rdi+2*rax]       ; q3
+        movdqa      xmm1,                   [rsi+2*rax]       ; q2
+        movdqa      xmm4,                   [rsi+rax]         ; q1
+        movdqa      xmm5,                   [rsi]             ; q0
+        neg         rax                     ; negate pitch to deal with above border
+%else
+        movlps      xmm2,                   [rsi + rcx*2]     ; q3
+        movlps      xmm1,                   [rsi + rcx]       ; q2
+        movlps      xmm4,                   [rsi]             ; q1
+        movlps      xmm5,                   [rsi + rax]       ; q0
+
+        movhps      xmm2,                   [rdi + rcx*2]
+        movhps      xmm1,                   [rdi + rcx]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm5,                   [rdi + rax]
+
+        lea         rsi,                    [rsi + rax*4]
+        lea         rdi,                    [rdi + rax*4]
+
+        movdqa      [rsp+_q2],              xmm1              ; store q2
+        movdqa      [rsp+_q1],              xmm4              ; store q1
+%endif
+        movdqa      xmm7,                   [rdx]             ;limit
+
+        movdqa      xmm6,                   xmm1              ; q2
+        movdqa      xmm3,                   xmm4              ; q1
+
+        psubusb     xmm1,                   xmm2              ; q2-=q3
+        psubusb     xmm2,                   xmm6              ; q3-=q2
+
+        psubusb     xmm4,                   xmm6              ; q1-=q2
+        psubusb     xmm6,                   xmm3              ; q2-=q1
+
+        por         xmm4,                   xmm6              ; abs(q2-q1)
+        por         xmm1,                   xmm2              ; abs(q3-q2)
+
+        movdqa      xmm0,                   xmm5              ; q0
+        pmaxub      xmm1,                   xmm4
+
+        psubusb     xmm5,                   xmm3              ; q0-=q1
+        psubusb     xmm3,                   xmm0              ; q1-=q0
+
+        por         xmm5,                   xmm3              ; abs(q0-q1)
+        movdqa      [rsp+_t0],              xmm5              ; save to t0
+
+        pmaxub      xmm1,                   xmm5
+
+%if %1
+        movdqa      xmm2,                   [rsi+4*rax]       ; p3
+        movdqa      xmm4,                   [rdi+4*rax]       ; p2
+        movdqa      xmm6,                   [rsi+2*rax]       ; p1
+%else
+        movlps      xmm2,                   [rsi + rax]       ; p3
+        movlps      xmm4,                   [rsi]             ; p2
+        movlps      xmm6,                   [rsi + rcx]       ; p1
+
+        movhps      xmm2,                   [rdi + rax]
+        movhps      xmm4,                   [rdi]
+        movhps      xmm6,                   [rdi + rcx]
+
+        movdqa      [rsp+_p2],              xmm4              ; store p2
+        movdqa      [rsp+_p1],              xmm6              ; store p1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p2
+        movdqa      xmm3,                   xmm6              ; p1
+
+        psubusb     xmm4,                   xmm2              ; p2-=p3
+        psubusb     xmm2,                   xmm5              ; p3-=p2
+
+        psubusb     xmm3,                   xmm5              ; p1-=p2
+        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
+
+        psubusb     xmm5,                   xmm6              ; p2-=p1
+        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
+
+        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
+        movdqa      xmm2,                   xmm6              ; p1
+
+        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
+%if %1
+        movdqa      xmm4,                   [rsi+rax]         ; p0
+        movdqa      xmm3,                   [rdi]             ; q1
+%else
+        movlps      xmm4,                   [rsi + rcx*2]     ; p0
+        movhps      xmm4,                   [rdi + rcx*2]
+        movdqa      xmm3,                   [rsp+_q1]                ; q1
+%endif
+
+        movdqa      xmm5,                   xmm4              ; p0
+        psubusb     xmm4,                   xmm6              ; p0-=p1
+
+        psubusb     xmm6,                   xmm5              ; p1-=p0
+
+        por         xmm6,                   xmm4              ; abs(p1 - p0)
+        mov         rdx,                    arg(2)            ; get blimit
+
+        movdqa     [rsp+_t1],               xmm6              ; save to t1
+
+        movdqa      xmm4,                   xmm3              ; q1
+        pmaxub      xmm1,                   xmm6
+
+        psubusb     xmm3,                   xmm2              ; q1-=p1
+        psubusb     xmm2,                   xmm4              ; p1-=q1
+
+        psubusb     xmm1,                   xmm7
+        por         xmm2,                   xmm3              ; abs(p1-q1)
+
+        movdqa      xmm7,                   [rdx]             ; blimit
+        mov         rdx,                    arg(4)            ; get thresh (for hev)
+
+        movdqa      xmm3,                   xmm0              ; q0
+        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
+
+        movdqa      xmm6,                   xmm5              ; p0
+        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
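+        ; (SSE2 has no per-byte shift: the 0xfe mask above clears each
+        ; byte's lsb so the word-wide psrlw cannot leak a bit from the
+        ; high byte of a word into its low byte)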
+
+        psubusb     xmm5,                   xmm3              ; p0-=q0
+        psubusb     xmm3,                   xmm6              ; q0-=p0
+        por         xmm5,                   xmm3              ; abs(p0 - q0)
+
+        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
+
+        movdqa      xmm4,                   [rsp+_t0]                ; get abs(q1 - q0) (for hev)
+        movdqa      xmm3,                   [rsp+_t1]                ; get abs(p1 - p0) (for hev)
+
+        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm2,                   [rdx]             ; hev
+
+        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        psubusb     xmm4,                   xmm2              ; hev
+
+        psubusb     xmm3,                   xmm2              ; hev
+        por         xmm1,                   xmm5
+
+        pxor        xmm7,                   xmm7
+        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     xmm4,                   xmm5              ; hev
+        pcmpeqb     xmm3,                   xmm3              ; hev
+
+        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
+        pxor        xmm4,                   xmm3              ; hev
+%endmacro
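+; What the macro leaves behind, as a scalar sketch (cf. the C reference in
+; vp8/common/loopfilter_filters.c; this condensed rendering is ours):
+;
+;   mask = abs(p3-p2) <= limit && abs(p2-p1) <= limit &&
+;          abs(p1-p0) <= limit && abs(q1-q0) <= limit &&
+;          abs(q2-q1) <= limit && abs(q3-q2) <= limit &&
+;          abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit;      /* 0xff per lane */
+;   hev  = abs(p1-p0) > thresh || abs(q1-q0) > thresh;
+;
+; Note the hev compare (pcmpeqb xmm4, xmm5) reuses the blimit-excess bytes
+; as its "zero": in any lane where the mask passes they are zero, and in
+; lanes where it fails the filter value is masked off later anyway.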
+
+%macro B_FILTER 1
+        movdqa      xmm3,                   [GLOBAL(t80)]
+%if %1 == 0
+        movdqa      xmm2,                   [rsp+_p1]                ; p1
+        movdqa      xmm7,                   [rsp+_q1]                ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]       ; p1
+        movdqa      xmm7,                   [rdi]             ; q1
+%elif %1 == 2
+        movdqa      xmm2,                   [rsp+_p1]         ; p1
+        movdqa      xmm6,                   [rsp+_p0]         ; p0
+        movdqa      xmm0,                   [rsp+_q0]         ; q0
+        movdqa      xmm7,                   [rsp+_q1]         ; q1
+%endif
+
+        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
+        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7              ; p1 - q1
+        pxor        xmm6,                   xmm3              ; offset to convert to signed values
+
+        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
+        pxor        xmm0,                   xmm3              ; offset to convert to signed values
+
+        movdqa      xmm3,                   xmm0              ; q0
+        psubsb      xmm0,                   xmm6              ; q0 - p0
+        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        xmm1,                   xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1
+        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
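+        ; per-byte arithmetic >>3 is emulated below: each filter byte is
+        ; widened into the high half of a word by a punpck, then psraw by
+        ; 8 + 3 = 11 brings it back down sign-extended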
+        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
+        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
+
+        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
+        psraw       xmm5,                   11                ; sign extended shift right by 3
+
+        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
+        psraw       xmm2,                   11                ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+        psraw       xmm0,                   11                ; sign extended shift right by 3
+
+        psraw       xmm1,                   11                ; sign extended shift right by 3
+        movdqa      xmm5,                   xmm0              ; save results
+
+        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+        paddsb      xmm6,                   xmm2              ; p0+= p0 add
+
+        movdqa      xmm2,                   [GLOBAL(ones)]
+        paddsw      xmm5,                   xmm2
+        paddsw      xmm1,                   xmm2
+        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
+        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
+        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+        movdqa      xmm2,                   [GLOBAL(t80)]
+
+%if %1 == 0
+        movdqa      xmm1,                   [rsp+_p1]         ; p1
+        lea         rsi,                    [rsi + rcx*2]
+        lea         rdi,                    [rdi + rcx*2]
+%elif %1 == 1
+        movdqa      xmm1,                   [rsi+2*rax]       ; p1
+%elif %1 == 2
+        movdqa      xmm1,                   [rsp+_p1]         ; p1
+%endif
+
+        pandn       xmm4,                   xmm5              ; high edge variance additive
+        pxor        xmm6,                   xmm2              ; unoffset
+
+        pxor        xmm1,                   xmm2              ; reoffset
+        psubsb      xmm3,                   xmm0              ; q0-= q0 add
+
+        paddsb      xmm1,                   xmm4              ; p1+= p1 add
+        pxor        xmm3,                   xmm2              ; unoffset
+
+        pxor        xmm1,                   xmm2              ; unoffset
+        psubsb      xmm7,                   xmm4              ; q1-= q1 add
+
+        pxor        xmm7,                   xmm2              ; unoffset
+%if %1 == 0
+        movq        [rsi],                  xmm6              ; p0
+        movhps      [rdi],                  xmm6
+        movq        [rsi + rax],            xmm1              ; p1
+        movhps      [rdi + rax],            xmm1
+        movq        [rsi + rcx],            xmm3              ; q0
+        movhps      [rdi + rcx],            xmm3
+        movq        [rsi + rcx*2],          xmm7              ; q1
+        movhps      [rdi + rcx*2],          xmm7
+%elif %1 == 1
+        movdqa      [rsi+rax],              xmm6              ; write back
+        movdqa      [rsi+2*rax],            xmm1              ; write back
+        movdqa      [rsi],                  xmm3              ; write back
+        movdqa      [rdi],                  xmm7              ; write back
+%endif
+
+%endmacro
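+; Scalar sketch of the B_FILTER arithmetic above (the normal VP8 edge
+; filter; cf. vp8/common/loopfilter_filters.c -- this rendering is ours,
+; with lanes already converted to signed via the 0x80 bias):
+;
+;   a   = clamp127(ps1 - qs1) & hev;
+;   a   = clamp127(a + 3 * (qs0 - ps0)) & mask;
+;   qs0 = clamp127(qs0 - (clamp127(a + 4) >> 3));
+;   ps0 = clamp127(ps0 + (clamp127(a + 3) >> 3));
+;   u   = (((clamp127(a + 4) >> 3) + 1) >> 1) & ~hev;
+;   qs1 = clamp127(qs1 - u);  ps1 = clamp127(ps1 + u);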
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size
+
+        mov         rsi,                    arg(0)           ;src_ptr
+        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
+
+        mov         rdx,                    arg(3)           ;limit
+
+        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the result
+        B_FILTER 1
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    int            count
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax
+        neg         rax                     ; negate pitch to deal with above border
+
+        mov         rdx,                    arg(3)             ;limit
+
+        lea         rsi,                    [rsi + rcx]
+        lea         rdi,                    [rdi + rcx]
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the result
+        B_FILTER 0
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+%macro MB_FILTER_AND_WRITEBACK 1
+        movdqa      xmm3,                   [GLOBAL(t80)]
+%if %1 == 0
+        movdqa      xmm2,                   [rsp+_p1]              ; p1
+        movdqa      xmm7,                   [rsp+_q1]              ; q1
+%elif %1 == 1
+        movdqa      xmm2,                   [rsi+2*rax]     ; p1
+        movdqa      xmm7,                   [rdi]           ; q1
+
+        mov         rcx,                    rax
+        neg         rcx
+%elif %1 == 2
+        movdqa      xmm2,                   [rsp+_p1]       ; p1
+        movdqa      xmm6,                   [rsp+_p0]       ; p0
+        movdqa      xmm0,                   [rsp+_q0]       ; q0
+        movdqa      xmm7,                   [rsp+_q1]       ; q1
+%endif
+
+        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
+        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
+        pxor        xmm6,                   xmm3            ; offset to convert to signed values
+        pxor        xmm0,                   xmm3            ; offset to convert to signed values
+
+        psubsb      xmm2,                   xmm7            ; p1 - q1
+
+        movdqa      xmm3,                   xmm0            ; q0
+        psubsb      xmm0,                   xmm6            ; q0 - p0
+        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
+        pand        xmm1,                   xmm2            ; mask filter values we don't care about
+
+        movdqa      xmm2,                   xmm1            ; vp8_filter
+
+        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
+        pxor        xmm0,                   xmm0
+
+        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
+        pxor        xmm1,                   xmm1
+
+        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
+        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
+
+        movdqa      xmm5,                   xmm2
+
+        movdqa      xmm4,                   [GLOBAL(s9)]
+        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
+        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
+
+        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
+        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9
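+        ; (the filter bytes sit in the high byte of each word, i.e. F<<8;
+        ; assuming s9 holds 9<<8 per word, pmulhw gives
+        ; (F<<8 * 9<<8) >> 16 = 9*F exactly)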
+
+        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
+        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
+
+        psraw       xmm7,                   11              ; sign extended shift right by 3
+
+        psraw       xmm5,                   11              ; sign extended shift right by 3
+        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
+
+        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
+        psraw       xmm4,                   11              ; sign extended shift right by 3
+
+        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
+        psraw       xmm2,                   11              ; sign extended shift right by 3
+
+        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
+
+        paddsb      xmm6,                   xmm5            ; ps0 = ps0 + Filter2
+
+        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
+        movdqa      xmm7,                   xmm1
+
+        movdqa      xmm4,                   [GLOBAL(s63)]
+        movdqa      xmm5,                   xmm0
+        movdqa      xmm2,                   xmm5
+        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
+        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
+        movdqa      xmm4,                   xmm7
+
+        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
+
+        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
+        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
+
+        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
+        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
+        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
+
+        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
+        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
+        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
+
+        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
+        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
+        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
+
+        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+        movdqa      xmm7,                   [GLOBAL(t80)]
+
+%if %1 == 0
+        movdqa      xmm1,                   [rsp+_q1]       ; q1
+        movdqa      xmm4,                   [rsp+_p1]       ; p1
+        lea         rsi,                    [rsi+rcx*2]
+        lea         rdi,                    [rdi+rcx*2]
+
+%elif %1 == 1
+        movdqa      xmm1,                   [rdi]           ; q1
+        movdqa      xmm4,                   [rsi+rax*2]     ; p1
+%elif %1 == 2
+        movdqa      xmm4,                   [rsp+_p1]       ; p1
+        movdqa      xmm1,                   [rsp+_q1]       ; q1
+%endif
+
+        pxor        xmm1,                   xmm7
+        pxor        xmm4,                   xmm7
+
+        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
+        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
+        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
+        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+        movdqa      xmm2,                   [rdi+rax*4]     ; p2
+        movdqa      xmm5,                   [rdi+rcx]       ; q2
+%else
+        movdqa      xmm2,                   [rsp+_p2]       ; p2
+        movdqa      xmm5,                   [rsp+_q2]       ; q2
+%endif
+
+        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
+        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
+        pxor        xmm2,                   xmm7
+        pxor        xmm5,                   xmm7
+        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u)
+        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
+        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
+        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
+        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
+        pxor        xmm6,                   xmm7            ; *op0 = sp^0x80
+%if %1 == 0
+        movq        [rsi],                  xmm6            ; p0
+        movhps      [rdi],                  xmm6
+        movq        [rsi + rcx],            xmm3            ; q0
+        movhps      [rdi + rcx],            xmm3
+        lea         rdx,                    [rcx + rcx*2]
+        movq        [rsi+rcx*2],            xmm1            ; q1
+        movhps      [rdi+rcx*2],            xmm1
+
+        movq        [rsi + rax],            xmm4            ; p1
+        movhps      [rdi + rax],            xmm4
+
+        movq        [rsi+rax*2],            xmm2            ; p2
+        movhps      [rdi+rax*2],            xmm2
+
+        movq        [rsi+rdx],              xmm5            ; q2
+        movhps      [rdi+rdx],              xmm5
+%elif %1 == 1
+        movdqa      [rdi+rcx],              xmm5            ; q2
+        movdqa      [rdi],                  xmm1            ; q1
+        movdqa      [rsi],                  xmm3            ; q0
+        movdqa      [rsi+rax  ],            xmm6            ; p0
+        movdqa      [rsi+rax*2],            xmm4            ; p1
+        movdqa      [rdi+rax*4],            xmm2            ; p2
+%elif %1 == 2
+        movdqa      [rsp+_p1],              xmm4            ; p1
+        movdqa      [rsp+_p0],              xmm6            ; p0
+        movdqa      [rsp+_q0],              xmm3            ; q0
+        movdqa      [rsp+_q1],              xmm1            ; q1
+%endif
+
+%endmacro
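+; Scalar sketch of the macroblock-edge taps above (cf. vp8_mbfilter in
+; vp8/common/loopfilter_filters.c; this condensed form is ours, and every
+; add/subtract below is clamped to signed-char range):
+;
+;   F  = clamp127(clamp127(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
+;   F2 = F & hev;                       /* hev lanes: the +4/+3 filter */
+;   qs0 -= clamp127(F2 + 4) >> 3;  ps0 += clamp127(F2 + 3) >> 3;
+;   F &= ~hev;                          /* other lanes: 27/18/9 taps   */
+;   u = clamp127((27*F + 63) >> 7);  qs0 -= u;  ps0 += u;
+;   u = clamp127((18*F + 63) >> 7);  qs1 -= u;  ps1 += u;
+;   u = clamp127(( 9*F + 63) >> 7);  qs2 -= u;  ps2 += u;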
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size
+
+        mov         rsi,                    arg(0)            ;src_ptr
+        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
+        mov         rdx,                    arg(3)            ;limit
+
+        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 1
+        ; filter and write back the results
+        MB_FILTER_AND_WRITEBACK 1
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, lf_var_size
+
+        mov         rsi,                    arg(0)             ; u
+        mov         rdi,                    arg(5)             ; v
+        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
+        mov         rcx,                    rax
+        neg         rax                     ; negate pitch to deal with above border
+        mov         rdx,                    arg(3)             ;limit
+
+        lea         rsi,                    [rsi + rcx]
+        lea         rdi,                    [rdi + rcx]
+
+        ; calculate breakout conditions and high edge variance
+        LFH_FILTER_AND_HEV_MASK 0
+        ; filter and write back the results
+        MB_FILTER_AND_WRITEBACK 0
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+%macro TRANSPOSE_16X8 2
+        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+%else
+        mov         rsi,                arg(5)          ; v_ptr
+%endif
+
+        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+        lea         rsi,                [rsi - 4]
+%endif
+
+        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2
+
+        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+        movdqa      xmm6,               xmm1            ;
+        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        movdqa      xmm0,               xmm5
+        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+        movdqa      [rsp+_q3],          xmm7            ; save 7
+        movdqa      [rsp+_q2],          xmm6            ; save 6
+%endif
+        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        movdqa      [rsp+_p1],          xmm2            ; save 2
+
+        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        movdqa      [rsp+_p0],          xmm3            ; save 3
+
+        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+        movdqa      [rsp+_q0],          xmm4            ; save 4
+        movdqa      [rsp+_q1],          xmm5            ; save 5
+        movdqa      xmm1,               [rsp+_t0]
+
+        movdqa      xmm2,               xmm1            ;
+        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+        movdqa      [rsp+_p2],          xmm1
+        movdqa      [rsp+_p3],          xmm2
+%endif
+
+%endmacro
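+; (TRANSPOSE_16X8 turns 16 rows of 8 pixels into eight 16-byte pixel
+; columns p3..q3; p1..q1 are always spilled to the stack slots, and with
+; %2 == 0 the outer columns p3/p2/q2/q3 are spilled as well for the
+; macroblock filter)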
+
+%macro LFV_FILTER_MASK_HEV_MASK 0
+        movdqa      xmm0,               xmm6            ; q2
+        psubusb     xmm0,               xmm7            ; q2-q3
+
+        psubusb     xmm7,               xmm6            ; q3-q2
+        movdqa      xmm4,               xmm5            ; q1
+
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
+
+        movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1
+
+        por         xmm6,               xmm4            ; abs (q2-q1)
+        psubusb     xmm0,               xmm2            ; p2 - p3;
+
+        psubusb     xmm2,               xmm1            ; p3 - p2;
+        por         xmm0,               xmm2            ; abs(p2-p3)
+
+        movdqa      xmm5,               [rsp+_p1]       ; p1
+        pmaxub      xmm0,               xmm7
+
+        movdqa      xmm2,               xmm5            ; p1
+        psubusb     xmm5,               xmm1            ; p1-p2
+        psubusb     xmm1,               xmm2            ; p2-p1
+
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
+        por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
+
+        pmaxub      xmm0,               xmm1
+        movdqa      xmm1,               xmm2            ; p1
+
+        psubusb     xmm2,               xmm3            ; p1-p0
+
+        por         xmm2,               xmm7            ; abs(p1-p0)
+
+        pmaxub      xmm0,               xmm2
+
+        movdqa      xmm5,               [rsp+_q0]       ; q0
+        movdqa      xmm7,               [rsp+_q1]       ; q1
+
+        mov         rdx,                arg(3)          ; limit
+
+        movdqa      xmm6,               xmm5            ; q0
+        movdqa      xmm4,               xmm7            ; q1
+
+        psubusb     xmm5,               xmm7            ; q0-q1
+        psubusb     xmm7,               xmm6            ; q1-q0
+
+        por         xmm7,               xmm5            ; abs(q1-q0)
+
+        pmaxub      xmm0,               xmm7
+
+        psubusb     xmm0,               [rdx]           ; limit
+
+        mov         rdx,                arg(2)          ; blimit
+        movdqa      xmm5,               xmm4            ; q1
+
+        psubusb     xmm5,               xmm1            ; q1-=p1
+        psubusb     xmm1,               xmm4            ; p1-=q1
+
+        por         xmm5,               xmm1            ; abs(p1-q1)
+        movdqa      xmm1,               xmm3            ; p0
+
+        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
+        psubusb     xmm1,               xmm6            ; p0-q0
+
+        movdqa      xmm4,               [rdx]           ; blimit
+        mov         rdx,                arg(4)          ; get thresh
+
+        psrlw       xmm5,               1               ; abs(p1-q1)/2
+        psubusb     xmm6,               xmm3            ; q0-p0
+
+        por         xmm1,               xmm6            ; abs(q0-p0)
+        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
+        movdqa      xmm3,               [rdx]
+
+        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm2,               xmm3            ; abs(q1 - q0) > thresh
+
+        psubusb     xmm7,               xmm3            ; abs(p1 - p0)> thresh
+
+        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         xmm2,               xmm7            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        por         xmm1,               xmm0            ; mask
+        pcmpeqb     xmm2,               xmm0
+
+        pxor        xmm0,               xmm0
+        pcmpeqb     xmm4,               xmm4
+
+        pcmpeqb     xmm1,               xmm0
+        pxor        xmm4,               xmm2
+%endmacro
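+; (same mask/hev computation as LFH_FILTER_AND_HEV_MASK above, but fed
+; from the transposed columns that TRANSPOSE_16X8 left in registers and
+; stack slots instead of straight from the frame)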
+
+%macro BV_TRANSPOSE 0
+        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+        movd        [rsi+2],            %1
+        movd        [rsi+4*rax+2],      %2
+        psrldq      %1,                 4
+        psrldq      %2,                 4
+        movd        [rdi+2],            %1
+        movd        [rdi+4*rax+2],      %2
+        psrldq      %1,                 4
+        psrldq      %2,                 4
+        movd        [rsi+2*rax+2],      %1
+        movd        [rsi+2*rcx+2],      %2
+        psrldq      %1,                 4
+        psrldq      %2,                 4
+        movd        [rdi+2*rax+2],      %1
+        movd        [rdi+2*rcx+2],      %2
+%endmacro
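+; (BV_WRITEBACK scatters four filtered columns back as 4-byte stores at
+; x+2 of each row: the normal filter only changes p1..q1, so the outer
+; pixels never need rewriting)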
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, lf_var_size
+
+        mov         rsi,        arg(0)                  ; src_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax*2+rax]
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 1, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only work on q1, q0, p0, p1
+        BV_TRANSPOSE
+        ; store 16-line result
+
+        lea         rdx,        [rax]
+        neg         rdx
+
+        BV_WRITEBACK xmm1, xmm5
+
+        lea         rsi,        [rsi+rdx*8]
+        lea         rdi,        [rdi+rdx*8]
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub             rsp, lf_var_size
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
+
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        lea         rcx,        [rax+2*rax]
+
+        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+        TRANSPOSE_16X8 0, 1
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        B_FILTER 2
+
+        ; transpose and write back - only work on q1, q0, p0, p1
+        BV_TRANSPOSE
+
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+
+        ; store 16-line result
+        BV_WRITEBACK xmm1, xmm5
+
+        mov         rsi,        arg(0)                  ; u_ptr
+        lea         rsi,        [rsi - 4]
+        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
+        BV_WRITEBACK xmm2, xmm6
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+%macro MBV_TRANSPOSE 0
+        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+%macro MBV_WRITEBACK_1 0
+        movq        [rsi],              xmm0
+        movhps      [rdi],              xmm0
+
+        movq        [rsi+2*rax],        xmm6
+        movhps      [rdi+2*rax],        xmm6
+
+        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+        movq        [rsi+4*rax],        xmm0
+        movhps      [rdi+4*rax],        xmm0
+
+        movq        [rsi+2*rcx],        xmm3
+        movhps      [rdi+2*rcx],        xmm3
+
+        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+        movdqa      xmm0,               xmm7
+        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+        movq        [rsi],              xmm1
+        movhps      [rdi],              xmm1
+
+        movq        [rsi+2*rax],        xmm5
+        movhps      [rdi+2*rax],        xmm5
+
+        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+        movq        [rsi+4*rax],        xmm1
+        movhps      [rdi+4*rax],        xmm1
+
+        movq        [rsi+2*rcx],        xmm4
+        movhps      [rdi+2*rcx],        xmm4
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const unsigned char *blimit,
+;    const unsigned char *limit,
+;    const unsigned char *thresh
+;)
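+;
+; A vertical edge is filtered by transposing the 16x8 pixel block so the
+; edge becomes horizontal (TRANSPOSE_16X8), applying the same macroblock
+; filter math as the horizontal kernel (MB_FILTER_AND_WRITEBACK), and
+; transposing the result back (MBV_TRANSPOSE / MBV_WRITEBACK_*) for the
+; stores.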
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, lf_var_size
+
+        mov         rsi,                arg(0)              ; src_ptr
+        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
+
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
+        lea         rcx,                [rax*2+rax]
+
+        ; Transpose
+        TRANSPOSE_16X8 1, 0
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        neg         rax
+        ; start work on filters
+        MB_FILTER_AND_WRITEBACK 2
+
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+
+        ; transpose and write back
+        MBV_TRANSPOSE
+
+        neg         rax
+
+        MBV_WRITEBACK_1
+
+
+        lea         rsi,                [rsi+rax*8]
+        lea         rdi,                [rdi+rax*8]
+        MBV_WRITEBACK_2
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+;    unsigned char *u,
+;    int            src_pixel_step,
+;    const unsigned char *blimit,
+;    const unsigned char *limit,
+;    const unsigned char *thresh,
+;    unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, lf_var_size
+
+        mov         rsi,                arg(0)              ; u_ptr
+        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
+
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
+        lea         rcx,                [rax+2*rax]
+
+        ; Transpose
+        TRANSPOSE_16X8 0, 0
+
+        ; calculate filter mask and high edge variance
+        LFV_FILTER_MASK_HEV_MASK
+
+        ; start work on filters
+        MB_FILTER_AND_WRITEBACK 2
+
+        ; transpose and write back
+        MBV_TRANSPOSE
+
+        mov         rsi,                arg(0)             ;u_ptr
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]
+        MBV_WRITEBACK_1
+        mov         rsi,                arg(5)             ;v_ptr
+        lea         rsi,                [rsi - 4]
+        lea         rdi,                [rsi + rax]
+        MBV_WRITEBACK_2
+
+    add rsp, lf_var_size
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const unsigned char *blimit
+;)
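+;
+; Illustrative C sketch of the simple filter computed below (it mirrors the
+; scalar code in vp8/common/loopfilter_filters.c; clamp8() stands in for
+; signed-char saturation and is not a real helper in this file):
+;
+;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= *blimit) ? 0xff : 0x00;
+;   /* p1, p0, q0, q1 converted to signed by xor 0x80, then: */
+;   a  = clamp8(p1 - q1 + 3 * (q0 - p0)) & mask;
+;   F1 = clamp8(a + 4) >> 3;          /* applied to q0 */
+;   F2 = clamp8(a + 3) >> 3;          /* applied to p0 */
+;   q0 = clamp8(q0 - F1);
+;   p0 = clamp8(p0 + F2);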
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rcx, arg(0)             ;src_ptr
+        movsxd      rax, dword ptr arg(1)   ;src_pixel_step
+        movdqa      xmm6, [GLOBAL(tfe)]
+        lea         rdx, [rcx + rax]
+        neg         rax
+
+        ; calculate mask
+        movdqa      xmm0, [rdx]             ; q1
+        mov         rdx, arg(2)             ;blimit
+        movdqa      xmm1, [rcx+2*rax]       ; p1
+
+        movdqa      xmm2, xmm1
+        movdqa      xmm3, xmm0
+
+        psubusb     xmm0, xmm1              ; q1-=p1
+        psubusb     xmm1, xmm3              ; p1-=q1
+        por         xmm1, xmm0              ; abs(p1-q1)
+        pand        xmm1, xmm6              ; set lsb of each byte to zero
+        psrlw       xmm1, 1                 ; abs(p1-q1)/2
+
+        movdqa      xmm7, XMMWORD PTR [rdx]
+
+        movdqa      xmm5, [rcx+rax]         ; p0
+        movdqa      xmm4, [rcx]             ; q0
+        movdqa      xmm0, xmm4              ; q0
+        movdqa      xmm6, xmm5              ; p0
+        psubusb     xmm5, xmm4              ; p0-=q0
+        psubusb     xmm4, xmm6              ; q0-=p0
+        por         xmm5, xmm4              ; abs(p0 - q0)
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
+        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
+        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7, xmm7
+        pcmpeqb     xmm5, xmm7
+
+
+        ; start work on filters
+        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
+        pxor        xmm3, xmm4     ; q1 offset to convert to signed values
+        psubsb      xmm2, xmm3              ; p1 - q1
+
+        pxor        xmm6, xmm4     ; offset to convert to signed values
+        pxor        xmm0, xmm4     ; offset to convert to signed values
+        movdqa      xmm3, xmm0              ; q0
+        psubsb      xmm0, xmm6              ; q0 - p0
+        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5, xmm2              ; mask filter values we don't care about
+
+        movdqa      xmm0, xmm5
+        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2 = filter + 3 (p0 side)
+        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1 = filter + 4 (q0 side)
+
+        movdqa      xmm1, [GLOBAL(te0)]
+        movdqa      xmm2, [GLOBAL(t1f)]
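+        ; SSE2 has no arithmetic right-shift for bytes, so ">> 3" on signed
+        ; bytes is emulated below: pcmpgtb captures each byte's sign, psrlw
+        ; shifts 16-bit lanes (leaking each high byte's low bits into the
+        ; neighbouring byte's top 3 bits), t1f (0x1f) clears those 3 bits,
+        ; and te0 (0xe0) re-inserts the saved sign bits.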
+
+;        pxor        xmm7, xmm7              ; not needed - xmm7 is already zero here
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add
+
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm1              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm6, xmm5              ; p0+= p0 add
+
+        pxor        xmm3, xmm4     ; unoffset
+        movdqa      [rcx], xmm3             ; write back
+
+        pxor        xmm6, xmm4     ; unoffset
+        movdqa      [rcx+rax], xmm6         ; write back
+
+    ; begin epilog
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const unsigned char *blimit
+;)
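+;
+; Same filter math as the horizontal variant, applied across a vertical
+; edge: 4 pixels are gathered from each of 16 rows with movd/punpck into
+; the p1/p0/q0/q1 vectors, filtered, and transposed back for the movd
+; stores at the end.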
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+    push        rbp         ; save old base pointer value.
+    mov         rbp, rsp    ; set new base pointer value.
+    SHADOW_ARGS_TO_STACK 3
+    SAVE_XMM 7
+    GET_GOT     rbx         ; save callee-saved reg
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
+    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step
+
+        lea         rsi,        [rsi - 2 ]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
+        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
+        movd        xmm2,       [rdi]                   ; 13 12 11 10
+        movd        xmm3,       [rcx]                   ; 53 52 51 50
+        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
+
+        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
+        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
+        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
+        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
+        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
+        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
+
+        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+        movdqa      xmm1,       xmm0
+        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        movdqa      xmm2,       xmm0
+        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+        lea         rsi,        [rsi + rax*8]
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        xmm4,       [rsi]                   ; 83 82 81 80
+        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
+        movd        xmm6,       [rdi]                   ; 93 92 91 90
+        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
+        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
+        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
+
+        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
+        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
+        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
+        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
+
+        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movdqa      xmm7,       xmm4
+        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+        movdqa      xmm6,       xmm4
+        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+        movdqa      xmm1,       xmm0
+        movdqa      xmm3,       xmm2
+
+        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+        mov         rdx,        arg(2)                          ;blimit
+
+        ; calculate mask
+        movdqa      xmm6,       xmm0                            ; p1
+        movdqa      xmm7,       xmm3                            ; q1
+        psubusb     xmm7,       xmm0                            ; q1-=p1
+        psubusb     xmm6,       xmm3                            ; p1-=q1
+        por         xmm6,       xmm7                            ; abs(p1-q1)
+        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       xmm6,       1                               ; abs(p1-q1)/2
+
+        movdqa      xmm7, [rdx]
+
+        movdqa      xmm5,       xmm1                            ; p0
+        movdqa      xmm4,       xmm2                            ; q0
+        psubusb     xmm5,       xmm2                            ; p0-=q0
+        psubusb     xmm4,       xmm1                            ; q0-=p0
+        por         xmm5,       xmm4                            ; abs(p0 - q0)
+        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
+        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        movdqa      xmm4, [GLOBAL(t80)]
+
+        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        xmm7,        xmm7
+        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
+
+        ; start work on filters
+        movdqa        t0,        xmm0
+        movdqa        t1,        xmm3
+
+        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
+        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
+        psubsb      xmm0,        xmm3                           ; p1 - q1
+
+        pxor        xmm1,        xmm4                  ; offset to convert to signed values
+        pxor        xmm2,        xmm4                  ; offset to convert to signed values
+
+        movdqa      xmm3,        xmm2                           ; q0 (sign offset applied)
+        psubsb      xmm2,        xmm1                           ; q0 - p0
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
+        pand        xmm5,        xmm0                           ; mask filter values we don't care about
+
+        movdqa      xmm0, xmm5
+        paddsb      xmm5,        [GLOBAL(t3)]                  ; Filter2 = filter + 3 (p0 side)
+        paddsb      xmm0,        [GLOBAL(t4)]                  ; Filter1 = filter + 4 (q0 side)
+
+        movdqa  xmm6, [GLOBAL(te0)]
+        movdqa  xmm2, [GLOBAL(t1f)]
+
+;        pxor        xmm7, xmm7              ; not needed - xmm7 is already zero here
+        pcmpgtb     xmm7, xmm0              ;save sign
+        pand        xmm7, xmm6              ;preserve the upper 3 bits
+        psrlw       xmm0, 3
+        pand        xmm0, xmm2              ;clear out upper 3 bits
+        por         xmm0, xmm7              ;add sign
+        psubsb      xmm3, xmm0              ; q0-= q0sz add
+
+        pxor        xmm7, xmm7
+        pcmpgtb     xmm7, xmm5              ;save sign
+        pand        xmm7, xmm6              ;preserve the upper 3 bits
+        psrlw       xmm5, 3
+        pand        xmm5, xmm2              ;clear out upper 3 bits
+        por         xmm5, xmm7              ;add sign
+        paddsb      xmm1, xmm5              ; p0+= p0 add
+
+        pxor        xmm3,        xmm4                  ; unoffset   q0
+        pxor        xmm1,        xmm4                  ; unoffset   p0
+
+        movdqa      xmm0,        t0                             ; p1
+        movdqa      xmm4,        t1                             ; q1
+
+        ; write out order: xmm0 xmm2 xmm1 xmm3
+        lea         rdx,        [rsi + rax*4]
+
+        ; transpose back to write out
+        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+        movdqa      xmm6,       xmm0
+        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+        movdqa      xmm5,       xmm3
+        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+        movdqa      xmm2,       xmm0
+        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+        movdqa      xmm3,       xmm6
+        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+        movd        [rsi],      xmm6                               ; write the second 8-line result
+        movd        [rdx],      xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rdi],      xmm6
+        movd        [rcx],      xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rsi + rax*2], xmm6
+        movd        [rdx + rax*2], xmm3
+        psrldq      xmm6,       4
+        psrldq      xmm3,       4
+        movd        [rdi + rax*2], xmm6
+        movd        [rcx + rax*2], xmm3
+
+        neg         rax
+        lea         rsi,        [rsi + rax*8]
+        neg         rax
+        lea         rdi,        [rsi + rax]
+        lea         rdx,        [rsi + rax*4]
+        lea         rcx,        [rdx + rax]
+
+        movd        [rsi],      xmm0                                ; write the first 8-line result
+        movd        [rdx],      xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rdi],      xmm0
+        movd        [rcx],      xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rsi + rax*2], xmm0
+        movd        [rdx + rax*2], xmm2
+        psrldq      xmm0,       4
+        psrldq      xmm2,       4
+        movd        [rdi + rax*2], xmm0
+        movd        [rcx + rax*2], xmm2
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1s:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
+align 16
+ones:
+    times 8 dw 0x0001
+align 16
+s9:
+    times 8 dw 0x0900
+align 16
+s63:
+    times 8 dw 0x003f
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t1f:
+    times 16 db 0x1f

+ 198 - 0
thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c

@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_nc(sym) \
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh)
+
+#define prototype_simple_loopfilter(sym) \
+    void sym(unsigned char *y, int ystride, const unsigned char *blimit)
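+
+/* For example, prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx)
+ * expands to:
+ *
+ *   void vp8_loop_filter_horizontal_edge_mmx(
+ *       unsigned char *src, int pitch, const unsigned char *blimit,
+ *       const unsigned char *limit, const unsigned char *thresh, int count);
+ *
+ * The _nc variant (presumably "no count") omits the trailing count argument.
+ */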
+
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
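+
+/* The _uv kernels filter both chroma planes in one pass: the U plane goes
+ * through the usual src argument and the V plane through the extra sixth
+ * pointer, with 8 pixels from each plane packed into a single 16-byte
+ * vector (see vp8_mbloop_filter_vertical_edge_uv_sse2 above). */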
+
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+    if (v_ptr)
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+    if (u_ptr)
+        vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+    vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+    if (u_ptr)
+        vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+    vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+    if (u_ptr)
+        vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif

+ 274 - 0
thirdparty/libvpx/vp8/common/x86/recon_mmx.asm

@@ -0,0 +1,274 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy_mem8x8_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
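+; Illustrative C equivalent (dst_stride/src_stride as in the prototype):
+;
+;   for (int r = 0; r < 8; r++)
+;       memcpy(dst + r * dst_stride, src + r * src_stride, 8);
+;
+; the MMX version below streams the rows through mm0..mm5, three at a time.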
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        add         rsi,        rax
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx*2],    mm2
+
+
+        lea         rdi,        [rdi+rcx*2]
+        movq        mm3,        [rsi]
+
+        add         rdi,        rcx
+        movq        mm4,        [rsi+rax]
+
+        movq        mm5,        [rsi+rax*2]
+        movq        [rdi],      mm3
+
+        lea         rsi,        [rsi+rax*2]
+        movq        [rdi+rcx],  mm4
+
+        movq        [rdi+rcx*2],    mm5
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm0,        [rsi+rax]
+        movq        mm1,        [rsi+rax*2]
+
+        movq        [rdi+rcx],  mm0
+        movq        [rdi+rcx*2],mm1
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_copy_mem8x4_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movq        mm0,        [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movq        mm1,        [rsi+rax]
+        movq        mm2,        [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movq        [rdi],      mm0
+        movq        [rdi+rcx],      mm1
+
+        movq        [rdi+rcx*2],    mm2
+        lea         rdi,        [rdi+rcx*2]
+
+        movq        mm3,        [rsi+rax]
+        movq        [rdi+rcx],      mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_copy_mem16x16_mmx(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+
+        mov         rdi,        arg(2) ;dst;
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        mm1,            [rsi+rax]
+        movq        mm4,            [rsi+rax+8]
+
+        movq        mm2,            [rsi+rax*2]
+        movq        mm5,            [rsi+rax*2+8]
+
+        lea         rsi,            [rsi+rax*2]
+        add         rsi,            rax
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+        movq        [rdi+rcx],      mm1
+        movq        [rdi+rcx+8],    mm4
+
+        movq        [rdi+rcx*2],    mm2
+        movq        [rdi+rcx*2+8],  mm5
+
+        lea         rdi,            [rdi+rcx*2]
+        add         rdi,            rcx
+
+        movq        mm0,            [rsi]
+        movq        mm3,            [rsi+8];
+
+        movq        [rdi],          mm0
+        movq        [rdi+8],        mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+ 116 - 0
thirdparty/libvpx/vp8/common/x86/recon_sse2.asm

@@ -0,0 +1,116 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_copy_mem16x16_sse2(
+;    unsigned char *src,
+;    int src_stride,
+;    unsigned char *dst,
+;    int dst_stride
+;    )
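+; Copies 16 rows of 16 bytes using unaligned loads (movdqu) and aligned
+; stores (movdqa): the source may sit at any offset, while the destination
+; is assumed to be 16-byte aligned (as the predictor buffers are).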
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src;
+        movdqu      xmm0,       [rsi]
+
+        movsxd      rax,        dword ptr arg(1) ;src_stride;
+        mov         rdi,        arg(2) ;dst;
+
+        movdqu      xmm1,       [rsi+rax]
+        movdqu      xmm2,       [rsi+rax*2]
+
+        movsxd      rcx,        dword ptr arg(3) ;dst_stride
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm3,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm4,       [rsi+rax]
+
+        movdqu      xmm5,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],  xmm3
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm4
+        movdqa      [rdi+rcx*2],xmm5
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm0,       [rsi]
+
+        add         rdi,        rcx
+        movdqu      xmm1,       [rsi+rax]
+
+        movdqu      xmm2,       [rsi+rax*2]
+        lea         rsi,        [rsi+rax*2]
+
+        movdqa      [rdi],      xmm0
+        add         rsi,        rax
+
+        movdqa      [rdi+rcx],  xmm1
+
+        movdqa      [rdi+rcx*2],    xmm2
+        movdqu      xmm3,       [rsi]
+
+        movdqu      xmm4,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        add         rdi,        rcx
+        movdqu      xmm5,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],  xmm3
+
+        add         rsi,        rax
+        movdqa      [rdi+rcx],  xmm4
+
+        movdqa      [rdi+rcx*2],xmm5
+        movdqu      xmm0,       [rsi]
+
+        lea         rdi,        [rdi+rcx*2]
+        movdqu      xmm1,       [rsi+rax]
+
+        add         rdi,        rcx
+        movdqu      xmm2,       [rsi+rax*2]
+
+        lea         rsi,        [rsi+rax*2]
+        movdqa      [rdi],      xmm0
+
+        movdqa      [rdi+rcx],  xmm1
+        movdqa      [rdi+rcx*2],xmm2
+
+        movdqu      xmm3,       [rsi+rax]
+        lea         rdi,        [rdi+rcx*2]
+
+        movdqa      [rdi+rcx],  xmm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

+ 702 - 0
thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm

@@ -0,0 +1,702 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT  7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+;    unsigned char   *src_ptr,
+;    unsigned short  *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           * vp8_filter
+;)
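+;
+; First pass of the 6-tap subpel filter. With taps f[0..5] scaled to sum to
+; vp8_filter_weight (128), each output pixel is roughly:
+;
+;   sum    = f[0]*p[-2] + f[1]*p[-1] + f[2]*p[0]
+;          + f[3]*p[1]  + f[4]*p[2]  + f[5]*p[3]
+;   out[i] = clamp8u((sum + rd) >> VP8_FILTER_SHIFT)
+;
+; where rd is the rounding constant (64 for a shift of 7) and clamp8u
+; denotes unsigned-byte saturation (packuswb); the result is widened back
+; to 16 bits for the vertical pass.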
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,    arg(6) ;vp8_filter
+
+        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
+        movq        mm2,    [rdx + 32]         ;
+        movq        mm6,    [rdx + 48]        ;
+        movq        mm7,    [rdx + 64]        ;
+
+        mov         rdi,    arg(1) ;output_ptr
+        mov         rsi,    arg(0) ;src_ptr
+        movsxd      rcx,    dword ptr arg(4) ;output_height
+        movsxd      rax,    dword ptr arg(5) ;output_width (used as destination pitch)
+        pxor        mm0,    mm0              ; mm0 = 00000000
+
+.nextrow:
+        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
+        movq        mm4,    mm3              ; mm4 = p-2..p5
+        psrlq       mm3,    8                ; mm3 = p-1..p5
+        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
+        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        punpckhbw   mm4,    mm0              ; mm4 = p2..p5
+        pmullw      mm4,    mm7              ; mm4 *= kernel 4 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        movq        mm4,    mm5              ; mm4 = p-2..p5;
+        psrlq       mm5,    16               ; mm5 = p0..p5;
+        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
+        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        movq        mm5,    mm4              ; mm5 = p-2..p5
+        psrlq       mm4,    24               ; mm4 = p1..p5
+        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
+        pmullw      mm4,    mm6              ; mm4 *= kernel 3 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        ; do outer positive taps
+        movd        mm4,    [rsi+3]
+        punpcklbw   mm4,    mm0              ; mm4 = p3..p6
+        pmullw      mm4,    [rdx+80]         ; mm4 *= kernel 5 modifiers
+        paddsw      mm3,    mm4              ; mm3 += mm4
+
+        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
+        pmullw      mm5,    [rdx]            ; mm5 *= kernel 0 modifiers
+        paddsw      mm3,    mm5              ; mm3 += mm5
+
+        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
+        psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3,    mm0              ; pack and unpack to saturate
+        punpcklbw   mm3,    mm0              ;
+
+        movq        [rdi],  mm3              ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
+        add         rdi,    rax;
+%else
+        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
+        add         rdi,    rax;
+
+        add         rsi,    r8               ; next line
+%endif
+
+        dec         rcx                      ; decrement count
+        jnz         .nextrow                 ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+;   short *src_ptr,
+;   unsigned char *output_ptr,
+;    int output_pitch,
+;   unsigned int pixels_per_line,
+;   unsigned int pixel_step,
+;   unsigned int output_height,
+;   unsigned int output_width,
+;   short * vp8_filter
+;)
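+;
+; Second pass: the same 6-tap kernel applied down the columns of the 16-bit
+; intermediate produced by the horizontal pass, then saturated back to
+; bytes:
+;
+;   sum = f[0]*row[-2] + f[1]*row[-1] + f[2]*row[0]
+;       + f[3]*row[1]  + f[4]*row[2]  + f[5]*row[3]
+;   dst = clamp8u((sum + rd) >> VP8_FILTER_SHIFT)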
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        movq      mm5, [GLOBAL(rd)]
+        push        rbx
+        mov         rbx, arg(7) ;vp8_filter
+        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
+        movq      mm2, [rbx + 32]         ;
+        movq      mm6, [rbx + 48]        ;
+        movq      mm7, [rbx + 64]        ;
+
+        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
+        mov         rdi, arg(1) ;output_ptr
+        mov         rsi, arg(0) ;src_ptr
+        sub         rsi, rdx
+        sub         rsi, rdx
+        movsxd      rcx, DWORD PTR arg(5) ;output_height
+        movsxd      rax, DWORD PTR arg(2) ;output_pitch
+        pxor        mm0, mm0              ; mm0 = 00000000
+
+
+.nextrow_cv:
+        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
+        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
+
+
+        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
+        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
+        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
+        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
+        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
+        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
+        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
+        paddsw      mm3, mm4              ; mm3 += mm4
+
+
+        paddsw      mm3, mm5               ; mm3 += round value
+        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
+        packuswb    mm3, mm0              ; pack and saturate
+
+        movd        [rdi],mm3             ; store the results in the destination
+        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
+        ; recon block should be in cache this shouldn't cost much.  It's obviously
+        ; avoidable.
+        lea         rdi,  [rdi+rax] ;
+        dec         rcx                   ; decrement count
+        jnz         .nextrow_cv           ; next row
+
+        pop         rbx
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict8x8_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;   unsigned char *dst_ptr,
+;    int dst_pitch
+;)
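+;
+; Two-tap bilinear prediction. HFilter/VFilter are selected from
+; vp8_bilinear_filters_x86_8 by xoffset/yoffset (each 32-byte entry holds
+; two 8-word vectors whose weights sum to 128). Per pixel, roughly:
+;
+;   h[r][c]   = (HF[0]*src[r][c] + HF[1]*src[r][c+1] + 64) >> 7
+;   dst[r][c] = (VF[0]*h[r][c]   + VF[1]*h[r+1][c]   + 64) >> 7
+;
+; mm7 carries the previous horizontally filtered row between iterations.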
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        shl         rax,        5 ; offset * 32
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+
+        shl         rax,        5 ; offset*32
+        add         rax,        rcx ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]          ;
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP8_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x8:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP8_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP8_FILTER_SHIFT        ;
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8                  ;dst_pitch
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x8
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict8x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        shl         rax,        5
+
+        mov         rsi,        arg(0) ;src_ptr              ;
+        add         rax,        rcx
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ;
+        psraw       mm4,        VP8_FILTER_SHIFT        ;
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_8x4:
+        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movq        mm4,        mm3                 ; make a copy of current line
+
+        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   mm4,        mm0                 ;
+
+        pmullw      mm3,        mm1                 ;
+        pmullw      mm4,        mm1                 ;
+
+        movq        mm5,        [rsi+1]             ;
+        movq        mm6,        mm5                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0                 ;
+
+        pmullw      mm5,        mm2                 ;
+        pmullw      mm6,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+        movq        mm5,        mm7                 ;
+        movq        mm6,        mm7                 ;
+
+        punpcklbw   mm5,        mm0                 ;
+        punpckhbw   mm6,        mm0
+
+        pmullw      mm5,        [rax]               ;
+        pmullw      mm6,        [rax]               ;
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 /= 128
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm4                 ;
+
+
+        pmullw      mm3,        [rax+16]            ;
+        pmullw      mm4,        [rax+16]            ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm4,        mm6                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
+        psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 /= 128
+
+        packuswb    mm3,        mm4
+
+        movq        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_8x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void bilinear_predict4x4_mmx
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        mov         rdi,        arg(4) ;dst_ptr           ;
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        shl         rax,        5
+
+        add         rax,        rcx ; HFilter
+        mov         rsi,        arg(0) ;src_ptr              ;
+
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+        movq        mm1,        [rax]               ;
+
+        movq        mm2,        [rax+16]            ;
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        pxor        mm0,        mm0                 ;
+        shl         rax,        5
+
+        add         rax,        rcx
+        lea         rcx,        [rdi+rdx*4]          ;
+
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
+
+        ; get the first horizontal line done       ;
+        movd        mm3,        [rsi]               ; 00 01 02 03
+        punpcklbw   mm3,        mm0                 ; zero-extend bytes 00-03 to words
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        movq        mm7,        mm3                 ;
+        packuswb    mm7,        mm0                 ;
+
+        add         rsi,        rdx                 ; next line
+.next_row_4x4:
+        movd        mm3,        [rsi]               ; 00 01 02 03
+        punpcklbw   mm3,        mm0                 ; zero-extend bytes 00-03 to words
+
+        pmullw      mm3,        mm1                 ;
+        movd        mm5,        [rsi+1]             ;
+
+        punpcklbw   mm5,        mm0                 ;
+        pmullw      mm5,        mm2                 ;
+
+        paddw       mm3,        mm5                 ;
+
+        movq        mm5,        mm7                 ;
+        punpcklbw   mm5,        mm0                 ;
+
+        pmullw      mm5,        [rax]               ;
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+        movq        mm7,        mm3                 ;
+
+        packuswb    mm7,        mm0                 ;
+
+        pmullw      mm3,        [rax+16]            ;
+        paddw       mm3,        mm5                 ;
+
+
+        paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
+        psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 /= 128
+
+        packuswb    mm3,        mm0
+        movd        [rdi],      mm3                 ; store the results in the destination
+
+%if ABI_IS_32BIT
+        add         rsi,        rdx                 ; next line
+        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
+%else
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
+        add         rsi,        rdx                 ; next line
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx                 ;
+        jne         .next_row_4x4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 4 dw 0x40
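+; (rd is the rounding constant VP8_FILTER_WEIGHT/2 = 64; it is added before
+; the arithmetic shift right by VP8_FILTER_SHIFT = 7 to round to nearest.)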
+
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 128
+    times 8 dw 0
+    times 8 dw 0
+    times 8 dw 0
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 123
+    times 8 dw 12
+    times 8 dw -1
+    times 8 dw 0
+
+    times 8 dw 2
+    times 8 dw -11
+    times 8 dw 108
+    times 8 dw 36
+    times 8 dw -8
+    times 8 dw 1
+
+    times 8 dw 0
+    times 8 dw -9
+    times 8 dw 93
+    times 8 dw 50
+    times 8 dw -6
+    times 8 dw 0
+
+    times 8 dw 3
+    times 8 dw -16
+    times 8 dw 77
+    times 8 dw 77
+    times 8 dw -16
+    times 8 dw 3
+
+    times 8 dw 0
+    times 8 dw -6
+    times 8 dw 50
+    times 8 dw 93
+    times 8 dw -9
+    times 8 dw 0
+
+    times 8 dw 1
+    times 8 dw -8
+    times 8 dw 36
+    times 8 dw 108
+    times 8 dw -11
+    times 8 dw 2
+
+    times 8 dw 0
+    times 8 dw -1
+    times 8 dw 12
+    times 8 dw 123
+    times 8 dw -6
+    times 8 dw 0
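+
+; Each block of six "times 8 dw" rows above is one of the 8 VP8 subpel
+; filters: 6 taps, each replicated 8 times (16 bytes) so a whole register can
+; be multiplied by a single tap with pmullw. Every filter's taps sum to 128
+; (VP8_FILTER_WEIGHT), so the rounded >> 7 preserves overall gain, and
+; filter k is the mirror image of filter 8-k.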
+
+

+ 1372 - 0
thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm

@@ -0,0 +1,1372 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is
+; an even number. This function handles 8 pixels in the horizontal direction,
+; calculating one row per iteration to take advantage of 128-bit operations.
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp8_filter
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp8_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    pixel_step,
+;    unsigned int    output_height,
+;    unsigned int    output_width,
+;    short           *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_h6 applies a 6-tap filter horizontally to the input pixels.
+; The input pixel array has output_height rows. This routine assumes that output_height
+; is an even number. This function handles 16 pixels in the horizontal direction,
+; calculating one row per iteration to take advantage of 128-bit operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(6) ;vp8_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;output_width
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi],         xmm4
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+        punpcklbw   xmm4,       xmm0
+
+        movdqa      XMMWORD Ptr [rdi+16],      xmm4
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(5) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+;    short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6-tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
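+;
+; Conceptually, one output row (a sketch; src_ptr here is the 16-bit output
+; of the horizontal pass, and the asm starts reading two rows above the
+; output row):
+;
+;   for (int x = 0; x < 8; x++) {
+;       int sum = 64;
+;       for (int t = 0; t < 6; t++)
+;           sum += src_ptr[(t - 2) * pixels_per_line + x] * vp8_filter[t * 8];
+;       sum >>= 7;
+;       output_ptr[x] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+;   }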
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp8_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+        movdqa      xmm1,       XMMWORD PTR [rsi]
+        pmullw      xmm1,       [rax]
+
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
+        pmullw      xmm3,       [rax + 32]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
+        pmullw      xmm5,       [rax + 64]
+
+        add         rsi,        rdx
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
+
+        pmullw      xmm4,       [rax + 48]
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
+
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int pixels_per_line,
+;    unsigned int pixel_step,
+;    unsigned int output_height,
+;    unsigned int output_width,
+;    const short    *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6-tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rax,        arg(7) ;vp8_filter
+        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
+
+        mov         rdi,        arg(1) ;output_ptr
+        mov         rsi,        arg(0) ;src_ptr
+
+        sub         rsi,        rdx
+        sub         rsi,        rdx
+
+        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding the 6 taps is 2 5 3 1 4 6. Read in the data in that order.
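+; (The fixed order presumably matters because paddsw saturates: summing the
+; taps in one consistent order keeps intermediate-overflow behavior identical
+; across the different block-size implementations.)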
+        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
+        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
+        pmullw      xmm1,       [rax + 16]
+        pmullw      xmm2,       [rax + 16]
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm3,       [rax + 64]
+        pmullw      xmm4,       [rax + 64]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm5,       [rax + 32]
+        pmullw      xmm6,       [rax + 32]
+
+        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
+        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
+        pmullw      xmm7,       [rax]
+        pmullw      xmm0,       [rax]
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm0
+
+        add         rsi,        rdx
+
+        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
+        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
+        pmullw      xmm3,       [rax + 48]
+        pmullw      xmm4,       [rax + 48]
+
+        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
+        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
+        pmullw      xmm5,       [rax + 80]
+        pmullw      xmm6,       [rax + 80]
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        paddsw      xmm1,       xmm3
+        paddsw      xmm2,       xmm4
+        paddsw      xmm1,       xmm5
+        paddsw      xmm2,       xmm6
+
+        paddsw      xmm1,       xmm7
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm1,       7
+        psraw       xmm2,       7
+
+        packuswb    xmm1,       xmm2              ; pack and saturate
+        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp8_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0
+
+        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
+        lea         rsi,        [rsi + rax]
+
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    int dst_pitch,
+;    unsigned int    output_height,
+;    const short    *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rdx,        arg(5) ;vp8_filter
+        mov         rsi,        arg(0) ;src_ptr
+
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ;dst_pitch
+%endif
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+        movq        xmm3,       MMWORD PTR [rsi - 2]
+        movq        xmm1,       MMWORD PTR [rsi + 6]
+
+        movq        xmm2,       MMWORD PTR [rsi +14]
+        pslldq      xmm2,       8
+
+        por         xmm2,       xmm1
+        prefetcht2  [rsi+rax-2]
+
+        pslldq      xmm1,       8
+        por         xmm1,       xmm3
+
+        movdqa      xmm4,       xmm1
+        movdqa      xmm5,       xmm1
+
+        movdqa      xmm6,       xmm1
+        movdqa      xmm7,       xmm1
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm1
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; lower 8 bytes
+
+        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
+
+        movdqa      xmm3,       xmm2
+        movdqa      xmm4,       xmm2
+
+        movdqa      xmm5,       xmm2
+        movdqa      xmm6,       xmm2
+
+        movdqa      xmm7,       xmm2
+
+        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
+        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
+
+        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
+
+        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
+
+        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
+
+        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
+
+        paddsw      xmm4,       xmm7
+        paddsw      xmm4,       xmm5
+
+        paddsw      xmm4,       xmm3
+        paddsw      xmm4,       xmm6
+
+        paddsw      xmm4,       xmm2
+        paddsw      xmm4,       [GLOBAL(rd)]
+
+        psraw       xmm4,       7
+
+        packuswb    xmm4,       xmm0                        ; higher 8 bytes
+
+        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        dec         rcx
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    int dst_pitch,
+;    unsigned int output_height,
+;    const short    *vp8_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(2) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(4) ;output_height
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        mov         rax,        arg(5) ;vp8_filter
+
+        pxor        xmm0,       xmm0                        ; clear xmm0
+
+        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+        movq        xmm1,       MMWORD PTR [rsi]
+        movq        xmm2,       MMWORD PTR [rsi + rdx]
+        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]
+        add         rsi,        rdx
+        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]
+        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]
+
+        punpcklbw   xmm1,       xmm0
+        pmullw      xmm1,       [rax]
+
+        punpcklbw   xmm2,       xmm0
+        pmullw      xmm2,       [rax + 16]
+
+        punpcklbw   xmm3,       xmm0
+        pmullw      xmm3,       [rax + 32]
+
+        punpcklbw   xmm5,       xmm0
+        pmullw      xmm5,       [rax + 64]
+
+        punpcklbw   xmm4,       xmm0
+        pmullw      xmm4,       [rax + 48]
+
+        punpcklbw   xmm6,       xmm0
+        pmullw      xmm6,       [rax + 80]
+
+        paddsw      xmm2,       xmm5
+        paddsw      xmm2,       xmm3
+
+        paddsw      xmm2,       xmm1
+        paddsw      xmm2,       xmm4
+
+        paddsw      xmm2,       xmm6
+        paddsw      xmm2,       xmm7
+
+        psraw       xmm2,       7
+        packuswb    xmm2,       xmm0              ; pack and saturate
+
+        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx         ; decrement count
+        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned short *output_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned int    output_height,
+;    unsigned int    output_width
+;)
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi,        arg(0) ;src_ptr
+        mov         rdi,        arg(1) ;output_ptr
+
+        movsxd      rcx,        dword ptr arg(3) ;output_height
+        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
+
+        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+        movq        xmm1,       MMWORD PTR [rsi]            ; bytes 00-07
+        movq        xmm3,       MMWORD PTR [rsi+8]          ; bytes 08-15
+
+        punpcklbw   xmm3,       xmm0                        ; zero-extend bytes 08-15 to words
+        punpcklbw   xmm1,       xmm0                        ; zero-extend bytes 00-07 to words
+
+        movdqa      XMMWORD Ptr [rdi],         xmm1
+        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
+
+        lea         rsi,        [rsi + rax]
+%if ABI_IS_32BIT
+        add         rdi,        DWORD Ptr arg(4) ;[output_width]
+%else
+        add         rdi,        r8
+%endif
+        dec         rcx
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
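+    ;
+    ; Control flow below, as a C-like sketch:
+    ;
+    ;   if (xoffset == 0) goto b16x16_sp_only;  /* 2nd-pass (vertical) only   */
+    ;   load HFilter;
+    ;   if (yoffset == 0) goto b16x16_fp_only;  /* 1st-pass (horizontal) only */
+    ;   load VFilter;                           /* full two-pass loop; the    */
+    ;                                           /* previous filtered row is   */
+    ;                                           /* carried in xmm7            */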
+
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+
+        cmp         rax,        0      ;skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+
+        cmp         rax,        0      ;skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5) ;dst_pitch
+%endif
+        ; get the first horizontal line done
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        add         rsi,        rdx                 ; next line
+.next_row:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       [rax]
+        pmullw      xmm6,       [rax]
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4
+
+        pmullw      xmm3,       [rax+16]
+        pmullw      xmm4,       [rax+16]
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rdx                 ; next line
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5) ;dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        add         rsi,        rax                 ; next line
+.next_row_spo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+        movdqa      xmm5,       xmm7
+        movdqa      xmm6,       xmm7
+
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        movdqa      xmm7,       xmm3
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm5,       xmm1
+        pmullw      xmm6,       xmm1
+        pmullw      xmm3,       xmm2
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ;dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_spo
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
+        pxor        xmm0,       xmm0
+
+.next_row_fpo:
+        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+
+        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
+        punpckhbw   xmm4,       xmm0
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm1
+
+        movdqu      xmm5,       [rsi+1]
+        movdqa      xmm6,       xmm5
+
+        punpcklbw   xmm5,       xmm0
+        punpckhbw   xmm6,       xmm0
+
+        pmullw      xmm5,       xmm2
+        pmullw      xmm6,       xmm2
+
+        paddw       xmm3,       xmm5
+        paddw       xmm4,       xmm6
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm4
+        movdqa      [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsi,        rax                 ; next line
+        add         rdi,        rdx                 ; dst_pitch
+        cmp         rdi,        rcx
+        jne         .next_row_fpo
+
+.done:
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read in 9 rows of unaligned data and put them on the stack so the filter
+    ;loop below can use aligned loads. This gives a big performance boost.
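+    ;(8 output rows of the 2-tap vertical filter need 9 source rows: each
+    ; output row blends one source row with the row below it.)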
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2) ;xoffset
+        shl         rax,        5
+        add         rax,        rcx    ;HFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
+
+        movdqa      xmm1,       [rax]
+        movdqa      xmm2,       [rax+16]
+
+        movsxd      rax,        dword ptr arg(3) ;yoffset
+        shl         rax,        5
+        add         rax,        rcx    ;VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm5,       [rax]
+        movdqa      xmm6,       [rax+16]
+
+        pxor        xmm0,       xmm0
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       XMMWORD PTR [rsp]
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3
+        add         rsp,        16                 ; next line
+.next_row8x8:
+        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm4,       xmm3                 ; make a copy of current line
+        psrldq      xmm4,       1
+
+        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
+        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
+
+        pmullw      xmm3,       xmm1
+        pmullw      xmm4,       xmm2
+
+        paddw       xmm3,       xmm4
+        pmullw      xmm7,       xmm5
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        movdqa      xmm4,       xmm3
+
+        pmullw      xmm3,       xmm6
+        paddw       xmm3,       xmm7
+
+        movdqa      xmm7,       xmm4
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
+
+        packuswb    xmm3,       xmm0
+        movq        [rdi],      xmm3                 ; store the results in the destination
+
+        add         rsp,        16                 ; next line
+        add         rdi,        rdx
+
+        cmp         rdi,        rcx
+        jne         .next_row8x8
+
+    ;add rsp, 144
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd:
+    times 8 dw 0x40

+ 1508 - 0
thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm

@@ -0,0 +1,1508 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is
+; an even number. This function handles 8 pixels in the horizontal direction,
+; calculating one row per iteration to take advantage of 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8.
+;
+;*************************************************************************************/
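+;
+; The SSSE3 variants pair the taps so pmaddubsw multiplies two bytes and adds
+; the products in one step. Per output pixel x (a sketch; the k names follow
+; the register-load comments in the code below):
+;
+;   sum  = src[x-2]*k0 + src[x+3]*k5;   /* pmaddubsw with k0_k5 */
+;   sum += src[x  ]*k2 + src[x+2]*k4;   /* pmaddubsw with k2_k4 */
+;   sum += src[x-1]*k1 + src[x+1]*k3;   /* pmaddubsw with k1_k3 */
+;   dst[x] = clamp((sum + 64) >> 7);    /* clamp to 0..255      */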
+;void vp8_filter_block1d8_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4
+
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    mov         rdi, arg(2)             ;output_ptr
+
+    cmp         esi, DWORD PTR [rax]
+    je          vp8_filter_block1d8_h4_ssse3
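+    ; (the dword at [rax] holds the first k0/k5 byte pairs of the selected
+    ;  filter row; an all-zero value means the outer taps are unused, so the
+    ;  cheaper 4-tap variant is taken)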
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+    jnz         .filter_block1d8_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+vp8_filter_block1d8_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
+    lea         rsi,    [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
+    movq        MMWORD PTR [rdi], xmm0
+
+    jnz         .filter_block1d8_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_filter_block1d16_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    mov         rdi, arg(2)                     ;output_ptr
+
+    mov         rsi, arg(0)                     ;src_ptr
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
+
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+    movq        xmm3,   MMWORD PTR [rsi +  6]
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [GLOBAL(rd)]
+
+    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
+    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0
+
+    lea         rsi,    [rsi + rax]
+    paddsw      xmm3,   xmm1
+
+    paddsw      xmm3,   xmm2
+
+    paddsw      xmm3,   [GLOBAL(rd)]
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3
+
+    movdqa      XMMWORD PTR [rdi], xmm0
+
+    lea         rdi,    [rdi + rdx]
+    dec         rcx
+    jnz         .filter_block1d16_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp8_filter_block1d4_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm1, xmm0
+    pshufb      xmm0, [GLOBAL(shuf1b)]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf2b)]
+    pmaddubsw   xmm0, xmm4
+    pshufb      xmm2, [GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm7
+    pxor        xmm1, xmm1
+    paddsw      xmm0, xmm2
+    psraw       xmm0, 7
+    packuswb    xmm0, xmm0
+
+    movd        DWORD PTR [rdi], xmm0
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp8_filter_block1d4_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+    mov         rsi, arg(0)             ;src_ptr
+    mov         rdi, arg(2)             ;output_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+    pmaddubsw   xmm1, xmm5
+
+;--
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi,    [rsi + rax]
+;--
+    paddsw      xmm1, xmm7
+    paddsw      xmm1, xmm2
+    psraw       xmm1, 7
+    packuswb    xmm1, xmm1
+
+    movd        DWORD PTR [rdi], xmm1
+
+    add         rdi, rdx
+    dec         rcx
+    jnz         .filter_block1d4_h4_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp8_filter_block1d16_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2          ;store the results
+
+    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, [GLOBAL(rd)]
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d16_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp8_filter_block1d16_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
+%endif
+    mov         rax, rsi
+    movsxd      rcx, DWORD PTR arg(4)   ;output_height
+    add         rax, rdx
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
+    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
+
+    paddsw      xmm2, [GLOBAL(rd)]
+    paddsw      xmm2, xmm3
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    punpcklbw   xmm5, xmm4                  ;B D
+    punpcklbw   xmm1, xmm0                  ;C E
+
+    pmaddubsw   xmm1, xmm6
+    pmaddubsw   xmm5, xmm7
+
+    movdqa      xmm4, [GLOBAL(rd)]
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm5, xmm1
+    paddsw      xmm5, xmm4
+    psraw       xmm5, 7
+    packuswb    xmm5, xmm5
+
+    punpcklqdq  xmm2, xmm5
+
+    movdqa       XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;out_pitch
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d16_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp8_filter_block1d8_v4_ssse3
+
+    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+    movq        xmm1, MMWORD PTR [rsi]                  ;A
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
+    movdqa      xmm4, [GLOBAL(rd)]
+
+    pmaddubsw   xmm3, xmm6
+    punpcklbw   xmm1, xmm0                  ;A F
+    pmaddubsw   xmm2, xmm7
+    pmaddubsw   xmm1, xmm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm1
+    paddsw      xmm2, xmm4
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d8_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp8_filter_block1d8_v4_ssse3:
+    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
+    movdqa      xmm5, [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
+    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
+    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
+    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   xmm2, xmm4                  ;B D
+    punpcklbw   xmm3, xmm0                  ;C E
+
+    pmaddubsw   xmm3, xmm6
+    pmaddubsw   xmm2, xmm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      xmm2, xmm3
+    paddsw      xmm2, xmm5
+    psraw       xmm2, 7
+    packuswb    xmm2, xmm2
+
+    movq        MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d8_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    unsigned int   vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4      ;
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+
+    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
+    mov         rdi, arg(2)             ;output_ptr
+%if ABI_IS_32BIT=0
+    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
+%endif
+    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
+
+    cmp         esi, DWORD PTR [rax]
+    je          .vp8_filter_block1d4_v4_ssse3
+
+    movq        mm5, MMWORD PTR [rax]         ;k0_k5
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+    movd        mm1, DWORD PTR [rsi]                  ;A
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
+
+    movq        mm4, [GLOBAL(rd)]
+
+    pmaddubsw   mm3, mm6
+    punpcklbw   mm1, mm0                  ;A F
+    pmaddubsw   mm2, mm7
+    pmaddubsw   mm1, mm5
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm1
+    paddsw      mm2, mm4
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d4_v6_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+.vp8_filter_block1d4_v4_ssse3:
+    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
+    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
+    movq        mm5, MMWORD PTR [GLOBAL(rd)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    mov         rax, rsi
+    add         rax, rdx
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+    movd        mm2, DWORD PTR [rsi + rdx]            ;B
+    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
+    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
+    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
+
+    punpcklbw   mm2, mm4                  ;B D
+    punpcklbw   mm3, mm0                  ;C E
+
+    pmaddubsw   mm3, mm6
+    pmaddubsw   mm2, mm7
+    add         rsi,  rdx
+    add         rax,  rdx
+;--
+;--
+    paddsw      mm2, mm3
+    paddsw      mm2, mm5
+    psraw       mm2, 7
+    packuswb    mm2, mm2
+
+    movd        DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
+%else
+    add         rdi,        r8
+%endif
+    dec         rcx
+    jnz         .vp8_filter_block1d4_v4_ssse3_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_bilinear_predict16x16_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b16x16_sp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b16x16_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
+
+        movdqa      xmm2,       [rax]
+
+%if ABI_IS_32BIT=0
+        movsxd      r8,         dword ptr arg(5)    ; dst_pitch
+%endif
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm3,       xmm1                ; filtered pixels 00-07 (as words)
+
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        pmaddubsw   xmm4,       xmm1                ; filtered pixels 08-15 (as words)
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+        lea         rsi,        [rsi + rdx]         ; next line
+
+        pmaddubsw   xmm6,       xmm1
+
+        punpcklbw   xmm4,       xmm5
+        pmaddubsw   xmm4,       xmm1
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
+
+        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
+        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
+
+        packuswb    xmm6,       xmm4
+        movdqa      xmm5,       xmm7
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm2
+
+        punpckhbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm2
+
+        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
+        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm5,       xmm7
+        movdqa      xmm7,       xmm6
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+%if ABI_IS_32BIT
+        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
+%else
+        add         rdi,        r8
+%endif
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done
+
+.b16x16_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        mov         rsi,        arg(0)              ; src_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm1,       [rax]               ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+        ; get the first horizontal line done
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0
+
+        lea         rsi,        [rsi + rax]         ; next line
+.next_row_sp:
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1
+
+        punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5
+
+        pmaddubsw   xmm4,       xmm1
+        movq        xmm7,       [rsi + rax]         ; load row + 2
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
+
+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
+        paddw       xmm4,       [GLOBAL(rd)]
+
+        pmaddubsw   xmm5,       xmm1
+        paddw       xmm2,       [GLOBAL(rd)]
+
+        psraw       xmm4,       VP8_FILTER_SHIFT
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        packuswb    xmm4,       xmm2
+        paddw       xmm3,       [GLOBAL(rd)]
+
+        movdqa      [rdi],      xmm4                ; store row 0
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        psraw       xmm3,       VP8_FILTER_SHIFT
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
+
+        movdqa      [rdi + rdx],xmm3                ; store row 1
+        lea         rsi,        [rsi + 2*rax]
+
+        movdqa      xmm2,       xmm6
+        lea         rdi,        [rdi + 2*rdx]
+
+        cmp         rdi,        rcx
+        jne         .next_row_sp
+
+        jmp         .done
+
+.b16x16_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+        lea         rcx,        [rcx+rdx*8]
+        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
+
+.next_row_fp:
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
+
+        punpcklbw   xmm2,       xmm4
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
+
+        pmaddubsw   xmm2,       xmm1
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4
+
+        pmaddubsw   xmm3,       xmm1
+        movq        xmm5,       [rsi]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        movq        xmm7,       [rsi+1]
+
+        movq        xmm6,       [rsi+8]
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        pmaddubsw   xmm5,       xmm1
+
+        psraw       xmm3,       VP8_FILTER_SHIFT
+        punpcklbw   xmm6,       xmm7
+
+        packuswb    xmm2,       xmm3
+        pmaddubsw   xmm6,       xmm1
+
+        movdqa      [rdi],      xmm2                ; store the results in the destination
+        paddw       xmm5,       [GLOBAL(rd)]
+
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm6,       VP8_FILTER_SHIFT
+
+        packuswb    xmm5,       xmm6
+        lea         rsi,        [rsi + rax]         ; next line
+
+        movdqa      [rdi],      xmm5                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]         ; dst_pitch
+
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+.done:
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_bilinear_predict8x8_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    int   src_pixels_per_line,
+;    int  xoffset,
+;    int  yoffset,
+;    unsigned char *dst_ptr,
+;    int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 144                         ; reserve 144 bytes
+
+        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read 9 lines of unaligned data and store them on the (aligned) stack.
+    ;This gives a big performance boost.
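+    ;(9 rows x 16 bytes each = the 144 bytes reserved above)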
+        movdqu      xmm0,       [rsi]
+        lea         rax,        [rdx + rdx*2]
+        movdqu      xmm1,       [rsi+rdx]
+        movdqu      xmm2,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm3,       [rsi]
+        movdqu      xmm4,       [rsi+rdx]
+        movdqu      xmm5,       [rsi+rdx*2]
+        add         rsi,        rax
+        movdqu      xmm6,       [rsi]
+        movdqu      xmm7,       [rsi+rdx]
+
+        movdqa      XMMWORD PTR [rsp],            xmm0
+
+        movdqu      xmm0,       [rsi+rdx*2]
+
+        movdqa      XMMWORD PTR [rsp+16],         xmm1
+        movdqa      XMMWORD PTR [rsp+32],         xmm2
+        movdqa      XMMWORD PTR [rsp+48],         xmm3
+        movdqa      XMMWORD PTR [rsp+64],         xmm4
+        movdqa      XMMWORD PTR [rsp+80],         xmm5
+        movdqa      XMMWORD PTR [rsp+96],         xmm6
+        movdqa      XMMWORD PTR [rsp+112],        xmm7
+        movdqa      XMMWORD PTR [rsp+128],        xmm0
+
+        movsxd      rax,        dword ptr arg(2)    ; xoffset
+        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
+        je          .b8x8_sp_only
+
+        shl         rax,        4
+        add         rax,        rcx                 ; HFilter
+
+        mov         rdi,        arg(4)              ; dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]
+
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
+        je          .b8x8_fp_only
+
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        lea         rcx,        [rdi+rdx*8]
+
+        movdqa      xmm1,       [rax]
+
+        ; get the first horizontal line done
+        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movdqa      xmm5,       xmm3                ; make a copy of the current line
+
+        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+        lea         rsp,        [rsp + 16]          ; next line
+
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        pmaddubsw   xmm3,       xmm0                ; filtered pixels 00-07 (as words)
+
+        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
+        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
+
+        movdqa      xmm7,       xmm3
+        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 duplicated in both halves
+
+.next_row:
+        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        lea         rsp,        [rsp + 16]          ; next line
+
+        movdqa      xmm5,       xmm6
+
+        psrldq      xmm5,       1
+
+        punpcklbw   xmm6,       xmm5
+        pmaddubsw   xmm6,       xmm0
+
+        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
+        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
+
+        packuswb    xmm6,       xmm6
+
+        punpcklbw   xmm7,       xmm6
+        pmaddubsw   xmm7,       xmm1
+
+        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
+        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
+
+        packuswb    xmm7,       xmm7
+
+        movq        [rdi],      xmm7                ; store the results in the destination
+        lea         rdi,        [rdi + rdx]
+
+        movdqa      xmm7,       xmm6
+
+        cmp         rdi,        rcx
+        jne         .next_row
+
+        jmp         .done8x8
+
+.b8x8_sp_only:
+        movsxd      rax,        dword ptr arg(3)    ; yoffset
+        shl         rax,        4
+        lea         rax,        [rax + rcx]         ; VFilter
+
+        mov         rdi,        arg(4) ;dst_ptr
+        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
+
+        movdqa      xmm0,       [rax]               ; VFilter
+
+        movq        xmm1,       XMMWORD PTR [rsp]
+        movq        xmm2,       XMMWORD PTR [rsp+16]
+
+        movq        xmm3,       XMMWORD PTR [rsp+32]
+        punpcklbw   xmm1,       xmm2
+
+        movq        xmm4,       XMMWORD PTR [rsp+48]
+        punpcklbw   xmm2,       xmm3
+
+        movq        xmm5,       XMMWORD PTR [rsp+64]
+        punpcklbw   xmm3,       xmm4
+
+        movq        xmm6,       XMMWORD PTR [rsp+80]
+        punpcklbw   xmm4,       xmm5
+
+        movq        xmm7,       XMMWORD PTR [rsp+96]
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm1,       xmm0
+        pmaddubsw   xmm2,       xmm0
+
+        pmaddubsw   xmm3,       xmm0
+        pmaddubsw   xmm4,       xmm0
+
+        pmaddubsw   xmm5,       xmm0
+        punpcklbw   xmm6,       xmm7
+
+        pmaddubsw   xmm6,       xmm0
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        paddw       xmm2,       [GLOBAL(rd)]
+        psraw       xmm1,       VP8_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm2,       VP8_FILTER_SHIFT
+
+        paddw       xmm4,       [GLOBAL(rd)]
+        psraw       xmm3,       VP8_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm4,       VP8_FILTER_SHIFT
+
+        paddw       xmm6,       [GLOBAL(rd)]
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        psraw       xmm6,       VP8_FILTER_SHIFT
+        packuswb    xmm1,       xmm1
+
+        packuswb    xmm2,       xmm2
+        movq        [rdi],      xmm1
+
+        packuswb    xmm3,       xmm3
+        movq        [rdi+rdx],  xmm2
+
+        packuswb    xmm4,       xmm4
+        movq        xmm1,       XMMWORD PTR [rsp+112]
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        xmm2,       XMMWORD PTR [rsp+128]
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm3
+
+        packuswb    xmm6,       xmm6
+        movq        [rdi+rdx],  xmm4
+
+        lea         rdi,        [rdi + 2*rdx]
+        punpcklbw   xmm7,       xmm1
+
+        movq        [rdi],      xmm5
+        pmaddubsw   xmm7,       xmm0
+
+        movq        [rdi+rdx],  xmm6
+        punpcklbw   xmm1,       xmm2
+
+        pmaddubsw   xmm1,       xmm0
+        paddw       xmm7,       [GLOBAL(rd)]
+
+        psraw       xmm7,       VP8_FILTER_SHIFT
+        paddw       xmm1,       [GLOBAL(rd)]
+
+        psraw       xmm1,       VP8_FILTER_SHIFT
+        packuswb    xmm7,       xmm7
+
+        packuswb    xmm1,       xmm1
+        lea         rdi,        [rdi + 2*rdx]
+
+        movq        [rdi],      xmm7
+
+        movq        [rdi+rdx],  xmm1
+        lea         rsp,        [rsp + 144]
+
+        jmp         .done8x8
+
+.b8x8_fp_only:
+        lea         rcx,        [rdi+rdx*8]
+
+.next_row_fp:
+        movdqa      xmm1,       XMMWORD PTR [rsp]
+        movdqa      xmm3,       XMMWORD PTR [rsp+16]
+
+        movdqa      xmm2,       xmm1
+        movdqa      xmm5,       XMMWORD PTR [rsp+32]
+
+        psrldq      xmm2,       1
+        movdqa      xmm7,       XMMWORD PTR [rsp+48]
+
+        movdqa      xmm4,       xmm3
+        psrldq      xmm4,       1
+
+        movdqa      xmm6,       xmm5
+        psrldq      xmm6,       1
+
+        punpcklbw   xmm1,       xmm2
+        pmaddubsw   xmm1,       xmm0
+
+        punpcklbw   xmm3,       xmm4
+        pmaddubsw   xmm3,       xmm0
+
+        punpcklbw   xmm5,       xmm6
+        pmaddubsw   xmm5,       xmm0
+
+        movdqa      xmm2,       xmm7
+        psrldq      xmm2,       1
+
+        punpcklbw   xmm7,       xmm2
+        pmaddubsw   xmm7,       xmm0
+
+        paddw       xmm1,       [GLOBAL(rd)]
+        psraw       xmm1,       VP8_FILTER_SHIFT
+
+        paddw       xmm3,       [GLOBAL(rd)]
+        psraw       xmm3,       VP8_FILTER_SHIFT
+
+        paddw       xmm5,       [GLOBAL(rd)]
+        psraw       xmm5,       VP8_FILTER_SHIFT
+
+        paddw       xmm7,       [GLOBAL(rd)]
+        psraw       xmm7,       VP8_FILTER_SHIFT
+
+        packuswb    xmm1,       xmm1
+        packuswb    xmm3,       xmm3
+
+        packuswb    xmm5,       xmm5
+        movq        [rdi],      xmm1
+
+        packuswb    xmm7,       xmm7
+        movq        [rdi+rdx],  xmm3
+
+        lea         rdi,        [rdi + 2*rdx]
+        movq        [rdi],      xmm5
+
+        lea         rsp,        [rsp + 4*16]
+        movq        [rdi+rdx],  xmm7
+
+        lea         rdi,        [rdi + 2*rdx]
+        cmp         rdi,        rcx
+
+        jne         .next_row_fp
+
+        lea         rsp,        [rsp + 16]
+
+.done8x8:
+    ;add rsp, 144
+    pop         rsp
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+    times 8 dw 0x40
+
+align 16
+k0_k5:
+    times 8 db 0, 0             ;placeholder
+    times 8 db 0, 0
+    times 8 db 2, 1
+    times 8 db 0, 0
+    times 8 db 3, 3
+    times 8 db 0, 0
+    times 8 db 1, 2
+    times 8 db 0, 0
+k1_k3:
+    times 8 db  0,    0         ;placeholder
+    times 8 db  -6,  12
+    times 8 db -11,  36
+    times 8 db  -9,  50
+    times 8 db -16,  77
+    times 8 db  -6,  93
+    times 8 db  -8, 108
+    times 8 db  -1, 123
+k2_k4:
+    times 8 db 128,    0        ;placeholder
+    times 8 db 123,   -1
+    times 8 db 108,   -8
+    times 8 db  93,   -6
+    times 8 db  77,  -16
+    times 8 db  50,   -9
+    times 8 db  36,  -11
+    times 8 db  12,   -6
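+; the three tables above are each 8 rows of 16 bytes, so for filter index i
+; the loads use [k0_k5 + 16*i], [k0_k5 + 16*i + 128] (k1_k3) and
+; [k0_k5 + 16*i + 256] (k2_k4)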
+align 16
+vp8_bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 112, 16
+    times 8 db 96,  32
+    times 8 db 80,  48
+    times 8 db 64,  64
+    times 8 db 48,  80
+    times 8 db 32,  96
+    times 8 db 16,  112
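+; each row above repeats the byte pair (128 - 16*i, 16*i) -- the two bilinear
+; weights for subpel offset i; after punpcklbw interleaves adjacent pixels,
+; pmaddubsw yields a*(128-f) + b*f per lane, which rd then rounds before the
+; shift by VP8_FILTER_SHIFT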
+

+ 625 - 0
thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c

@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];
+
+extern void vp8_filter_block1d_h6_mmx
+(
+    unsigned char   *src_ptr,
+    unsigned short  *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short      *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+    unsigned short *src_ptr,
+    unsigned char  *output_ptr,
+    int             output_pitch,
+    unsigned int    pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    pixel_step,
+    unsigned int    output_height,
+    unsigned int    output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+    unsigned short *src_ptr,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2
+(
+    unsigned short *src_ptr,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short    *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned int    output_height,
+    unsigned int    output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    int dst_pitch,
+    unsigned int    output_height,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    int dst_pitch,
+    unsigned int    output_height,
+    const short    *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2
+(
+    unsigned char *src_ptr,
+    unsigned int   src_pixels_per_line,
+    unsigned char *output_ptr,
+    int dst_pitch,
+    unsigned int   output_height,
+    const short   *vp8_filter
+);
+
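+/* All of the predictors below follow the same two-pass scheme: an optional
+ * horizontal 6-tap pass into an intermediate buffer (the source pointer is
+ * rewound by two rows and the pass produces height+5 rows, covering the two
+ * filter taps above and the three below the block), then an optional
+ * vertical 6-tap pass into the destination.  A hedged sketch of the
+ * dispatch, using hypothetical helper names:
+ *
+ *     if (xoffset) h_pass(src - 2*stride, tmp, height + 5, hfilter);
+ *     if (yoffset) v_pass(xoffset ? tmp : src - 2*stride, dst, vfilter);
+ *
+ * The SSSE3 variants additionally fall back to a plain block copy when both
+ * offsets are zero.
+ */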
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, FData2[16*16]);  /* Temp data buffer used in filtering */
+    const short *HFilter, *VFilter;
+    HFilter = vp8_six_tap_mmx[xoffset];
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
+    VFilter = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
+
+}
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data buffer used in filtering */
+
+    const short *HFilter, *VFilter;
+
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,  FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);    /* Temp data buffer used in filtering */
+
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, 8, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);    /* Temp data buffer used in filtering */
+
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_six_tap_mmx[xoffset];
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),    FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
+    vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,  FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
+
+    VFilter = vp8_six_tap_mmx[yoffset];
+    vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, 8, VFilter);
+    vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    vp8_bilinear_predict8x8_mmx(src_ptr,   src_pixels_per_line, xoffset, yoffset, dst_ptr,   dst_pitch);
+    vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
+    vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,   src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8,   dst_pitch);
+    vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);    /* Temp data buffer used in filtering */
+
+    const short *HFilter, *VFilter;
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 21, 32, HFilter);
+            VFilter = vp8_six_tap_mmx[yoffset];
+            vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
+        }
+        else
+        {
+            /* First-pass only */
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
+        }
+    }
+    else
+    {
+        /* Second-pass only */
+        VFilter = vp8_six_tap_mmx[yoffset];
+        vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 21, 32);
+        vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr,   dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
+    }
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
+    const short *HFilter, *VFilter;
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 13, 16, HFilter);
+            VFilter = vp8_six_tap_mmx[yoffset];
+            vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
+        }
+        else
+        {
+            /* First-pass only */
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
+        }
+    }
+    else
+    {
+        /* Second-pass only */
+        VFilter = vp8_six_tap_mmx[yoffset];
+        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
+    }
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
+    const short *HFilter, *VFilter;
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,   src_pixels_per_line, 1, 9, 16, HFilter);
+            VFilter = vp8_six_tap_mmx[yoffset];
+            vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr,   dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
+        }
+        else
+        {
+            /* First-pass only */
+            HFilter = vp8_six_tap_mmx[xoffset];
+            vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
+        }
+    }
+    else
+    {
+        /* Second-pass only */
+        VFilter = vp8_six_tap_mmx[yoffset];
+        vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
+    }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
+extern void vp8_filter_block1d8_h6_ssse3
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    unsigned int    output_pitch,
+    unsigned int    output_height,
+    unsigned int    vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_h6_ssse3
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    unsigned int    output_pitch,
+    unsigned int    output_height,
+    unsigned int    vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_v6_ssse3
+(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index
+);
+
+extern void vp8_filter_block1d8_v6_ssse3
+(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_h6_ssse3
+(
+    unsigned char  *src_ptr,
+    unsigned int    src_pixels_per_line,
+    unsigned char  *output_ptr,
+    unsigned int    output_pitch,
+    unsigned int    output_height,
+    unsigned int    vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_v6_ssse3
+(
+    unsigned char *src_ptr,
+    unsigned int   src_pitch,
+    unsigned char *output_ptr,
+    unsigned int   out_pitch,
+    unsigned int   output_height,
+    unsigned int   vp8_filter_index
+);
+
+void vp8_sixtap_predict16x16_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+
+)
+{
+    DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line, FData2,
+                                          16, 21, xoffset);
+            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
+                                          16, yoffset);
+        }
+        else
+        {
+            /* First-pass only */
+            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, xoffset);
+        }
+    }
+    else
+    {
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, yoffset);
+        }
+        else
+        {
+            /* The SSSE3 second-pass-only function couldn't handle the
+             * (xoffset==0 && yoffset==0) case correctly, so the block is
+             * copied here to guarantee the six-tap path handles every offset. */
+            vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
+    }
+}
+
+void vp8_sixtap_predict8x8_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 13, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         8, yoffset);
+        }
+        else
+        {
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, xoffset);
+        }
+    }
+    else
+    {
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, yoffset);
+        }
+        else
+        {
+            /* The SSSE3 second-pass-only function couldn't handle the
+             * (xoffset==0 && yoffset==0) case correctly, so the block is
+             * copied here to guarantee the six-tap path handles every offset. */
+            vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
+    }
+}
+
+
+void vp8_sixtap_predict8x4_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    DECLARE_ALIGNED(16, unsigned char, FData2[256]);
+
+    if (xoffset)
+    {
+        if (yoffset)
+        {
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 9, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         4, yoffset);
+        }
+        else
+        {
+            /* First-pass only */
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, xoffset);
+        }
+    }
+    else
+    {
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, yoffset);
+        }
+        else
+        {
+            /* The SSSE3 second-pass-only function couldn't handle the
+             * (xoffset==0 && yoffset==0) case correctly, so the block is
+             * copied here to guarantee the six-tap path handles every offset. */
+            vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
+    }
+}
+
+void vp8_sixtap_predict4x4_ssse3
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+  DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
+
+  if (xoffset)
+  {
+      if (yoffset)
+      {
+          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       FData2, 4, 9, xoffset);
+          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+                                       4, yoffset);
+      }
+      else
+      {
+          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, xoffset);
+      }
+  }
+  else
+  {
+      if (yoffset)
+      {
+          vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, yoffset);
+      }
+      else
+      {
+          /* The SSSE3 second-pass-only function couldn't handle the
+           * (xoffset==0 && yoffset==0) case correctly, so the block is
+           * copied here to guarantee the six-tap path handles every offset. */
+          int r;
+
+          for (r = 0; r < 4; r++)
+          {
+            dst_ptr[0]  = src_ptr[0];
+            dst_ptr[1]  = src_ptr[1];
+            dst_ptr[2]  = src_ptr[2];
+            dst_ptr[3]  = src_ptr[3];
+            dst_ptr     += dst_pitch;
+            src_ptr     += src_pixels_per_line;
+          }
+      }
+  }
+}
+
+#endif
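
A note on the dispatch above: every predictor splits into the same four
cases (both offsets, first pass only, second pass only, plain copy), and the
two-pass intermediate always carries five extra rows because the six taps
reach two pixels before and three after the centre; hence the 21/13/9 row
counts passed to the horizontal filters. A minimal scalar sketch of the same
scheme (illustrative only, not libvpx code; the tap values are those of
vp8_sub_pel_filters in vp8/common/filter.c, whose reference C likewise
rounds and clamps after each pass):

/* Six-tap subpel filter taps, one row per eighth-pel offset. */
static const int vp8_taps_sketch[8][6] =
{
    { 0,   0, 128,   0,   0, 0 },  /* offset 0: pass degenerates to a copy */
    { 0,  -6, 123,  12,  -1, 0 },
    { 2, -11, 108,  36,  -8, 1 },
    { 0,  -9,  93,  50,  -6, 0 },
    { 3, -16,  77,  77, -16, 3 },
    { 0,  -6,  50,  93,  -9, 0 },
    { 1,  -8,  36, 108, -11, 2 },
    { 0,  -1,  12, 123,  -6, 0 }
};

/* One 6-tap application centred on p (two taps before, three after),
 * with (1 << 6) rounding before the >> 7, then a clamp to 0..255. */
static unsigned char tap6_sketch(const unsigned char *p, int stride,
                                 const int *t)
{
    int i, sum = 64;
    for (i = 0; i < 6; i++)
        sum += p[(i - 2) * stride] * t[i];
    sum >>= 7;
    return (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

static void sixtap_predict_sketch(const unsigned char *src, int src_pitch,
                                  int xoffset, int yoffset,
                                  unsigned char *dst, int dst_pitch,
                                  int w, int h)
{
    unsigned char tmp[21 * 16];  /* (h + 5) rows of w; enough up to 16x16 */
    int r, c;

    if (xoffset && yoffset)
    {
        /* First pass: filter h + 5 rows starting two rows above the block,
         * so the vertical pass below has its full tap context. */
        for (r = 0; r < h + 5; r++)
            for (c = 0; c < w; c++)
                tmp[r * w + c] = tap6_sketch(src + (r - 2) * src_pitch + c,
                                             1, vp8_taps_sketch[xoffset]);
        /* Second pass: vertical taps walk the intermediate rows. */
        for (r = 0; r < h; r++)
            for (c = 0; c < w; c++)
                dst[r * dst_pitch + c] =
                    tap6_sketch(tmp + (r + 2) * w + c, w,
                                vp8_taps_sketch[yoffset]);
    }
    else if (xoffset)            /* first pass only */
    {
        for (r = 0; r < h; r++)
            for (c = 0; c < w; c++)
                dst[r * dst_pitch + c] =
                    tap6_sketch(src + r * src_pitch + c, 1,
                                vp8_taps_sketch[xoffset]);
    }
    else if (yoffset)            /* second pass only */
    {
        for (r = 0; r < h; r++)
            for (c = 0; c < w; c++)
                dst[r * dst_pitch + c] =
                    tap6_sketch(src + r * src_pitch + c, src_pitch,
                                vp8_taps_sketch[yoffset]);
    }
    else                         /* both offsets zero: plain copy */
    {
        for (r = 0; r < h; r++)
            for (c = 0; c < w; c++)
                dst[r * dst_pitch + c] = src[r * src_pitch + c];
    }
}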

+ 1753 - 0
thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm

@@ -0,0 +1,1753 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int  count
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 32                         ; reserve 32 bytes
+    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_h:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7              ;
+
+
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, [rsi]            ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5
+
+        pcmpeqb     mm5,        mm5
+        pxor        mm4,        mm5
+
+
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand        mm1, mm2                  ; mask filter values we don't care about
+        movq        mm2, mm1
+        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+        pxor        mm0, mm0             ;
+        pxor        mm5, mm5
+        punpcklbw   mm0, mm2            ;
+        punpckhbw   mm5, mm2            ;
+        psraw       mm0, 11             ;
+        psraw       mm5, 11
+        packsswb    mm0, mm5
+        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0, mm0              ; 0
+        movq        mm5, mm1              ; abcdefgh
+        punpcklbw   mm0, mm1              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        pxor        mm1, mm1              ; 0
+        punpckhbw   mm1, mm5              ; a0b0c0d0
+        psraw       mm1, 11               ; sign extended shift right by 3
+        movq        mm5, mm0              ; save results
+
+        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5, [GLOBAL(ones)]
+        paddsw      mm1, [GLOBAL(ones)]
+        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
+        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
+        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+        pandn       mm4, mm5              ; high edge variance additive
+
+        paddsb      mm6, mm2              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        movq        mm6, [rsi+2*rax]      ; p1
+        pxor        mm6, [GLOBAL(t80)]    ; reoffset
+        paddsb      mm6, mm4              ; p1+= p1 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+2*rax], mm6      ; write back
+
+        psubsb      mm3, mm0              ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+        psubsb      mm7, mm4              ; q1-= q1 add
+        pxor        mm7, [GLOBAL(t80)]    ; unoffset
+        movq        [rdi], mm7            ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .next8_h
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
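The routine above is a straight-line MMX translation of VP8's normal (inner)
loop filter. As a scalar cross-check, the per-column math it performs
corresponds to the sketch below, modelled on vp8_filter_mask/vp8_filter in
vp8/common/loopfilter_filters.c (illustrative only; the saturation that
psubusb/psubsb/paddsb give in hardware is emulated with explicit clamps):

#define ABS_D(a, b) ((a) > (b) ? (a) - (b) : (b) - (a))

static signed char sclamp(int v)
{
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* 1 where the edge should be filtered, 0 where any activity test fails;
 * this is what the chain of psubusb/por/pcmpeqb builds into mm1. */
static int filter_mask_sketch(unsigned char limit, unsigned char blimit,
                              unsigned char p3, unsigned char p2,
                              unsigned char p1, unsigned char p0,
                              unsigned char q0, unsigned char q1,
                              unsigned char q2, unsigned char q3)
{
    return ABS_D(p3, p2) <= limit && ABS_D(p2, p1) <= limit &&
           ABS_D(p1, p0) <= limit && ABS_D(q1, q0) <= limit &&
           ABS_D(q2, q1) <= limit && ABS_D(q3, q2) <= limit &&
           ABS_D(p0, q0) * 2 + ABS_D(p1, q1) / 2 <= blimit;
}

/* The 4-tap filter: xor 0x80 maps 0..255 onto the signed range, exactly
 * like the pxor t80 steps; +4/+3 then >> 3 rounds the two sides apart;
 * the outer taps get (Filter1 + 1) >> 1 only where hev is off, which is
 * the paddsw ones / psraw 1 / pandn sequence above. */
static void filter4_sketch(int mask, int hev, unsigned char *op1,
                           unsigned char *op0, unsigned char *oq0,
                           unsigned char *oq1)
{
    signed char ps1 = (signed char)(*op1 ^ 0x80);
    signed char ps0 = (signed char)(*op0 ^ 0x80);
    signed char qs0 = (signed char)(*oq0 ^ 0x80);
    signed char qs1 = (signed char)(*oq1 ^ 0x80);
    signed char f, f1, f2;

    f = sclamp(ps1 - qs1);
    f = hev ? f : 0;                       /* hvm(p1 - q1) */
    f = sclamp(f + 3 * (qs0 - ps0));
    f = mask ? f : 0;                      /* kill the filter off-mask */

    f1 = (signed char)(sclamp(f + 4) >> 3);    /* applied to q0 */
    f2 = (signed char)(sclamp(f + 3) >> 3);    /* applied to p0 */
    *oq0 = (unsigned char)(sclamp(qs0 - f1) ^ 0x80);
    *op0 = (unsigned char)(sclamp(ps0 + f2) ^ 0x80);

    f = (signed char)((f1 + 1) >> 1);      /* outer-tap adjustment */
    f = hev ? 0 : f;                       /* only where hev is off */
    *oq1 = (unsigned char)(sclamp(qs1 - f) ^ 0x80);
    *op1 = (unsigned char)(sclamp(ps1 + f) ^ 0x80);
}

The accompanying high-edge-variance flag is
hev = ABS_D(p1, p0) > thresh || ABS_D(q1, q0) > thresh, built above from the
absolute differences saved in t0/t1.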
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 64      ; reserve 64 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_v:
+        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
+        add         rdi,        rax
+
+
+        ;transpose
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
+
+        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
+        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+        psubusb     mm5,        mm7                         ; q2-q3
+
+        psubusb     mm7,        mm6                         ; q3-q2
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+
+        psubusb     mm3,        mm6                         ; q1-q2
+        psubusb     mm6,        mm5                         ; q2-q1
+
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+        lea         rdx,        srct
+
+        movq        [rdx+24],   mm5                         ; save q1
+        movq        [rdx+16],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+8],    mm3                         ; save p0
+
+        movq        [rdx],      mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4
+
+        psubusb     mm0,        mm4
+        psubusb     mm1,        mm4
+
+        psubusb     mm6,        mm4
+        por         mm7,        mm6
+
+        por         mm0,        mm1
+        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+16]                    ; mm5=q0
+        movq        mm7,        [rdx+24]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2,        [rdx]           ; p1
+        movq        mm7,        [rdx+24]        ; q1
+
+        movq        mm6,        [rdx+8]         ; p0
+        movq        mm0,        [rdx+16]        ; q0
+
+        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
+        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
+
+        psubsb      mm2,        mm7             ; p1 - q1
+        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
+
+        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
+        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
+
+        movq        mm3,        mm0             ; q0
+        psubsb      mm0,        mm6             ; q0 - p0
+
+        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
+        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
+        pand       mm1,        mm2              ; mask filter values we don't care about
+
+        movq        mm2,        mm1
+        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+        pxor        mm0,        mm0          ;
+
+        pxor        mm5,        mm5
+        punpcklbw   mm0,        mm2         ;
+
+        punpckhbw   mm5,        mm2         ;
+        psraw       mm0,        11              ;
+
+        psraw       mm5,        11
+        packsswb    mm0,        mm5
+
+        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+        pxor        mm0,        mm0           ; 0
+        movq        mm5,        mm1           ; abcdefgh
+
+        punpcklbw   mm0,        mm1           ; e0f0g0h0
+        psraw       mm0,        11                ; sign extended shift right by 3
+
+        pxor        mm1,        mm1           ; 0
+        punpckhbw   mm1,        mm5           ; a0b0c0d0
+
+        psraw       mm1,        11                ; sign extended shift right by 3
+        movq        mm5,        mm0              ; save results
+
+        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+        paddsw      mm5,        [GLOBAL(ones)]
+
+        paddsw      mm1,        [GLOBAL(ones)]
+        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
+
+        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
+        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+        pandn       mm4,        mm5             ; high edge variance additive
+
+        paddsb      mm6,        mm2             ; p0+= p0 add
+        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
+
+        ; mm6=p0                               ;
+        movq        mm1,        [rdx]           ; p1
+        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
+
+        paddsb      mm1,        mm4                 ; p1+= p1 add
+        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
+        ; mm6 = p0 mm1 = p1
+
+        psubsb      mm3,        mm0                 ; q0-= q0 add
+        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
+
+        ; mm3 = q0
+        psubsb      mm7,        mm4                 ; q1-= q1 add
+        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
+        ; mm7 = q1
+
+        ; transpose and write back
+        ; mm1 =    72 62 52 42 32 22 12 02
+        ; mm6 =    73 63 53 43 33 23 13 03
+        ; mm3 =    74 64 54 44 34 24 14 04
+        ; mm7 =    75 65 55 45 35 25 15 05
+
+        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
+        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
+
+        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
+        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
+
+        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
+        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
+
+        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
+        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
+
+        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
+        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
+
+        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
+        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
+
+
+        ; mm2 = 15 14 13 12 05 04 03 02
+        ; mm6 = 35 34 33 32 25 24 23 22
+        ; mm5 = 55 54 53 52 45 44 43 42
+        ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+        movd        [rsi+rax*4+2], mm2
+        psrlq       mm2,        32
+
+        movd        [rdi+rax*4+2], mm2
+        movd        [rsi+rax*2+2], mm6
+
+        psrlq       mm6,        32
+        movd        [rsi+rax+2],mm6
+
+        movd        [rsi+2],    mm1
+        psrlq       mm1,        32
+
+        movd        [rdi+2],    mm1
+        neg         rax
+
+        movd        [rdi+rax+2],mm5
+        psrlq       mm5,        32
+
+        movd        [rdi+rax*2+2], mm5
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+        jnz         .next8_v
+
+    add rsp, 64
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
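Most of the vertical-edge routine above is data movement rather than
filtering: the punpckl/hbw, punpckl/hwd and punpckl/hdq ladder transposes an
8x8 tile that straddles the edge so the columns p3..q3 land in registers,
and the reverse shuffle writes the four modified columns back. In scalar
terms the vertical filter is the horizontal one rotated 90 degrees; reusing
filter_mask_sketch()/filter4_sketch() from the sketch above (illustrative
only):

static void loop_filter_vertical_edge_sketch(unsigned char *s, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             int count)
{
    int i;

    /* s points at q0 of the first row, i.e. just right of the edge;
     * p3..q3 lie left/right of it instead of above/below. */
    for (i = 0; i < count * 8; i++, s += pitch)
    {
        int mask = filter_mask_sketch(limit, blimit,
                                      s[-4], s[-3], s[-2], s[-1],
                                      s[0], s[1], s[2], s[3]);
        int hev = ABS_D(s[-2], s[-1]) > thresh ||
                  ABS_D(s[1], s[0]) > thresh;
        filter4_sketch(mask, hev, s - 2, s - 1, s, s + 1);
    }
}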
+
+;void vp8_mbloop_filter_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_mbh:
+        mov         rdx, arg(3) ;limit
+        movq        mm7, [rdx]
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+
+        ; calculate breakout conditions
+        movq        mm2, [rdi+2*rax]      ; q3
+
+        movq        mm1, [rsi+2*rax]      ; q2
+        movq        mm6, mm1              ; q2
+        psubusb     mm1, mm2              ; q2-=q3
+        psubusb     mm2, mm6              ; q3-=q2
+        por         mm1, mm2              ; abs(q3-q2)
+        psubusb     mm1, mm7
+
+
+        ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+        movq        mm4, [rsi+rax]        ; q1
+        movq        mm3, mm4              ; q1
+        psubusb     mm4, mm6              ; q1-=q2
+        psubusb     mm6, mm3              ; q2-=q1
+        por         mm4, mm6              ; abs(q2-q1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        ; mm1 = mask,      mm3=q1, mm7 = limit
+
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        psubusb     mm4, mm3              ; q0-=q1
+        psubusb     mm3, mm0              ; q1-=q0
+        por         mm4, mm3              ; abs(q0-q1)
+        movq        t0, mm4               ; save to t0
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        neg         rax                   ; negate pitch to deal with above border
+
+        movq        mm2, [rsi+4*rax]      ; p3
+        movq        mm4, [rdi+4*rax]      ; p2
+        movq        mm5, mm4              ; p2
+        psubusb     mm4, mm2              ; p2-=p3
+        psubusb     mm2, mm5              ; p3-=p2
+        por         mm4, mm2              ; abs(p3 - p2)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        movq        mm4, [rsi+2*rax]      ; p1
+        movq        mm3, mm4              ; p1
+        psubusb     mm4, mm5              ; p1-=p2
+        psubusb     mm5, mm3              ; p2-=p1
+        por         mm4, mm5              ; abs(p2 - p1)
+        psubusb     mm4, mm7
+        por        mm1, mm4
+
+        movq        mm2, mm3              ; p1
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)
+
+        movq        mm4, [rsi+rax]        ; p0
+        movq        mm5, mm4              ; p0
+        psubusb     mm4, mm3              ; p0-=p1
+        psubusb     mm3, mm5              ; p1-=p0
+        por         mm4, mm3              ; abs(p1 - p0)
+        movq        t1, mm4               ; save to t1
+        psubusb     mm4, mm7
+        por        mm1, mm4
+        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm5 = p0
+        movq        mm3, [rdi]            ; q1
+        movq        mm4, mm3              ; q1
+        psubusb     mm3, mm2              ; q1-=p1
+        psubusb     mm2, mm4              ; p1-=q1
+        por         mm2, mm3              ; abs(p1-q1)
+        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm2, 1                ; abs(p1-q1)/2
+
+        movq        mm6, mm5              ; p0
+        movq        mm3, mm0              ; q0
+        psubusb     mm5, mm3              ; p0-=q0
+        psubusb     mm3, mm6              ; q0-=p0
+        por         mm5, mm3              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2) ;blimit           ; get blimit
+        movq        mm7, [rdx]            ; blimit
+
+        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,    mm5
+        pxor        mm5,    mm5
+        pcmpeqb     mm1,    mm5           ; mask mm1
+
+        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm6 = p0,
+
+        ; calculate high edge variance
+        mov         rdx, arg(4) ;thresh           ; get thresh
+        movq        mm7, [rdx]            ;
+        movq        mm4, t0               ; get abs (q1 - q0)
+        psubusb     mm4, mm7
+        movq        mm3, t1               ; get abs (p1 - p0)
+        psubusb     mm3, mm7
+        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+        pcmpeqb     mm4,        mm5
+
+        pcmpeqb     mm5,        mm5
+        pxor        mm4,        mm5
+
+
+
+        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+        ; mm6 = p0, mm4=hev
+        ; start work on filters
+        movq        mm2, [rsi+2*rax]      ; p1
+        movq        mm7, [rdi]            ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+        pand        mm1, mm2              ; mask filter values we don't care about
+
+
+        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+        movq        mm2, mm1              ; vp8_filter
+        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+        movq        mm5,        mm2       ;
+        paddsb      mm5,        [GLOBAL(t3)];
+
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm5              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm5              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter2 >>=3;
+
+        movq        mm5, mm0              ; Filter2
+
+        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm2              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm2              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter1 >>=3;
+
+        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+        psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
+        paddsb      mm6, mm5              ; ps0 = ps0 + Filter2
+
+        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+        ; vp8_filter &= ~hev;
+        ; Filter2 = vp8_filter;
+        pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+        ; mm3=qs0, mm4=filter2, mm6=ps0
+
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+        ; s = vp8_signed_char_clamp(qs0 - u);
+        ; *oq0 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps0 + u);
+        ; *op0 = s^0x80;
+        pxor        mm0, mm0
+
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s27)]
+        pmulhw      mm2, [GLOBAL(s27)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        psubsb      mm3, mm1
+        paddsb      mm6, mm1
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+        movq        [rsi+rax], mm6
+        movq        [rsi],     mm3
+
+        ; roughly 2/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+        ; s = vp8_signed_char_clamp(qs1 - u);
+        ; *oq1 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps1 + u);
+        ; *op1 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s18)]
+        pmulhw      mm2, [GLOBAL(s18)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm3, [rdi]
+        movq        mm6, [rsi+rax*2]       ; p1
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdi], mm3
+        movq        [rsi+rax*2], mm6
+
+        ; roughly 1/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+        ; s = vp8_signed_char_clamp(qs2 - u);
+        ; *oq2 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps2 + u);
+        ; *op2 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s9)]
+        pmulhw      mm2, [GLOBAL(s9)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+
+        movq        mm6, [rdi+rax*4]
+        neg         rax
+        movq        mm3, [rdi+rax  ]
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdi+rax  ], mm3
+        neg         rax
+        movq        [rdi+rax*4], mm6
+
+;EARLY_BREAK_OUT:
+        neg         rax
+        add         rsi,8
+        dec         rcx
+        jnz         .next8_mbh
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
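The macroblock-edge filter shares the mask and hev tests of the inner filter
but, where hev is off, tapers the correction across three pixels on each
side of the edge with weights of roughly 3/7, 2/7 and 1/7. A scalar sketch
modelled on vp8_mbfilter in vp8/common/loopfilter_filters.c, reusing
sclamp() from the earlier sketch (illustrative only; the s27/s18/s9
constants are assumed to hold the weights shifted left by 8, so that pmulhw,
which keeps the high 16 bits of the product, yields f * w directly):

static void mbfilter_sketch(int mask, int hev,
                            unsigned char *op2, unsigned char *op1,
                            unsigned char *op0, unsigned char *oq0,
                            unsigned char *oq1, unsigned char *oq2)
{
    signed char ps2 = (signed char)(*op2 ^ 0x80);
    signed char ps1 = (signed char)(*op1 ^ 0x80);
    signed char ps0 = (signed char)(*op0 ^ 0x80);
    signed char qs0 = (signed char)(*oq0 ^ 0x80);
    signed char qs1 = (signed char)(*oq1 ^ 0x80);
    signed char qs2 = (signed char)(*oq2 ^ 0x80);
    signed char f, f1, f2, u;

    f = sclamp(ps1 - qs1);
    f = sclamp(f + 3 * (qs0 - ps0));
    f = mask ? f : 0;

    f2 = hev ? f : 0;                      /* Filter2 = vp8_filter & hev */
    f1 = (signed char)(sclamp(f2 + 4) >> 3);
    f2 = (signed char)(sclamp(f2 + 3) >> 3);
    qs0 = sclamp(qs0 - f1);
    ps0 = sclamp(ps0 + f2);

    f = hev ? 0 : f;                       /* vp8_filter &= ~hev (pandn) */

    /* u = clamp((63 + f * w) >> 7) for w = 27, 18, 9: the three taper
     * steps the pmulhw/paddw/psraw blocks above compute in 16-bit lanes. */
    u = sclamp((63 + f * 27) >> 7);
    *oq0 = (unsigned char)(sclamp(qs0 - u) ^ 0x80);
    *op0 = (unsigned char)(sclamp(ps0 + u) ^ 0x80);

    u = sclamp((63 + f * 18) >> 7);
    *oq1 = (unsigned char)(sclamp(qs1 - u) ^ 0x80);
    *op1 = (unsigned char)(sclamp(ps1 + u) ^ 0x80);

    u = sclamp((63 + f * 9) >> 7);
    *oq2 = (unsigned char)(sclamp(qs2 - u) ^ 0x80);
    *op2 = (unsigned char)(sclamp(ps2 + u) ^ 0x80);
}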
+
+;void vp8_mbloop_filter_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit,
+;    const char *limit,
+;    const char *thresh,
+;    int count
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 96      ; reserve 96 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];
+
+        mov         rsi,        arg(0) ;src_ptr
+        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
+
+        lea         rsi,        [rsi + rax*4 - 4]
+
+        movsxd      rcx,        dword ptr arg(5) ;count
+.next8_mbv:
+        lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
+
+        ;transpose
+        movq        mm0,        [rdi+2*rax]                 ; 77 76 75 74 73 72 71 70
+        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
+
+        movq        mm7,        mm6                         ; 67 66 65 64 63 62 61 60
+        punpckhbw   mm7,        mm0                         ; 77 67 76 66 75 65 74 64
+
+        punpcklbw   mm6,        mm0                         ; 73 63 72 62 71 61 70 60
+        movq        mm0,        [rsi+rax]                   ; 57 56 55 54 53 52 51 50
+
+        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
+        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5,        mm0                         ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4,        mm0                         ; 53 43 52 42 51 41 50 40
+
+        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
+        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+
+        movq        mm7,        [rsi+rax]                   ; 37 36 35 34 33 32 31 30
+        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
+
+        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6,        mm7                         ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1,        mm7                         ; 33 23 32 22 31 21 30 20
+
+        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
+        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
+
+        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
+        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
+
+        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
+        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
+
+        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
+        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
+
+        lea         rdx,        srct
+        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
+
+        movq        [rdx+56],   mm7
+        psubusb     mm5,        mm7                         ; q2-q3
+
+
+        movq        [rdx+48],   mm6
+        psubusb     mm7,        mm6                         ; q3-q2
+
+        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
+        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
+
+        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
+        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 14 04 = q0
+
+        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
+        psubusb     mm3,        mm6                         ; q1-q2
+
+        psubusb     mm6,        mm5                         ; q2-q1
+        por         mm6,        mm3                         ; mm6=abs(q2-q1)
+
+        movq        [rdx+40],   mm5                         ; save q1
+        movq        [rdx+32],   mm0                         ; save q0
+
+        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
+
+        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
+        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
+
+        movq        [rdx],      mm0                         ; save p3
+        movq        [rdx+8],    mm1                         ; save p2
+
+        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
+        psubusb     mm2,        mm0                         ; p2-p3
+
+        psubusb     mm0,        mm1                         ; p3-p2
+        por         mm0,        mm2                         ; mm0=abs(p3-p2)
+
+        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+24],   mm3                         ; save p0
+
+        movq        [rdx+16],   mm2                         ; save p1
+        movq        mm5,        mm2                         ; mm5 = p1
+
+        psubusb     mm2,        mm1                         ; p1-p2
+        psubusb     mm1,        mm5                         ; p2-p1
+
+        por         mm1,        mm2                         ; mm1=abs(p2-p1)
+        mov         rdx,        arg(3) ;limit
+
+        movq        mm4,        [rdx]                       ; mm4 = limit
+        psubusb     mm7,        mm4                         ; abs(q3-q2) > limit
+
+        psubusb     mm0,        mm4                         ; abs(p3-p2) > limit
+        psubusb     mm1,        mm4                         ; abs(p2-p1) > limit
+
+        psubusb     mm6,        mm4                         ; abs(q2-q1) > limit
+        por         mm7,        mm6                         ; or
+
+        por         mm0,        mm1                         ;
+        por         mm0,        mm7                         ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1,        mm5                         ; p1
+
+        movq        mm7,        mm3                         ; mm3=mm7=p0
+        psubusb     mm7,        mm5                         ; p0 - p1
+
+        psubusb     mm5,        mm3                         ; p1 - p0
+        por         mm5,        mm7                         ; abs(p1-p0)
+
+        movq        t0,         mm5                         ; save abs(p1-p0)
+        lea         rdx,        srct
+
+        psubusb     mm5,        mm4                         ; mm5 = abs(p1-p0) > limit
+        por         mm0,        mm5                         ; mm0=mask
+
+        movq        mm5,        [rdx+32]                    ; mm5=q0
+        movq        mm7,        [rdx+40]                    ; mm7=q1
+
+        movq        mm6,        mm5                         ; mm6=q0
+        movq        mm2,        mm7                         ; q1
+        psubusb     mm5,        mm7                         ; q0-q1
+
+        psubusb     mm7,        mm6                         ; q1-q0
+        por         mm7,        mm5                         ; abs(q1-q0)
+
+        movq        t1,         mm7                         ; save abs(q1-q0)
+        psubusb     mm7,        mm4                         ; mm7=abs(q1-q0)> limit
+
+        por         mm0,        mm7                         ; mask
+
+        movq        mm5,        mm2                         ; q1
+        psubusb     mm5,        mm1                         ; q1-=p1
+        psubusb     mm1,        mm2                         ; p1-=q1
+        por         mm5,        mm1                         ; abs(p1-q1)
+        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
+        psrlw       mm5,        1                           ; abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit                      ;
+
+        movq        mm4,        [rdx]                       ;blimit
+        movq        mm1,        mm3                         ; mm1=mm3=p0
+
+        movq        mm7,        mm6                         ; mm7=mm6=q0
+        psubusb     mm1,        mm7                         ; p0-q0
+
+        psubusb     mm7,        mm3                         ; q0-p0
+        por         mm1,        mm7                         ; abs(q0-p0)
+        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
+        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        por         mm1,        mm0;                        ; mask
+
+        pxor        mm0,        mm0
+        pcmpeqb     mm1,        mm0
+
+        ; calculate high edge variance
+        mov         rdx,        arg(4) ;thresh            ; get thresh
+        movq        mm7,        [rdx]
+        ;
+        movq        mm4,        t0              ; get abs (q1 - q0)
+        psubusb     mm4,        mm7             ; abs(q1 - q0) > thresh
+
+        movq        mm3,        t1              ; get abs (p1 - p0)
+        psubusb     mm3,        mm7             ; abs(p1 - p0)> thresh
+
+        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4,        mm0
+
+        pcmpeqb     mm0,        mm0
+        pxor        mm4,        mm0
+
+
+
+
+        ; start work on filters
+        lea         rdx,        srct
+
+        movq        mm2, [rdx+16]         ; p1
+        movq        mm7, [rdx+40]         ; q1
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        movq        mm6, [rdx+24]         ; p0
+        movq        mm0, [rdx+32]         ; q0
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 2 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0              ; 3 * (q0 - p0) + (p1 - q1)
+        pand       mm1, mm2           ; mask filter values we don't care about
+
+        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+        movq        mm2, mm1              ; vp8_filter
+        pand        mm2, mm4;             ; Filter2 = vp8_filter & hev
+
+        movq        mm5,        mm2       ;
+        paddsb      mm5,        [GLOBAL(t3)];
+
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm5              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm5              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter2 >>=3;
+
+        movq        mm5, mm0              ; Filter2
+
+        paddsb      mm2, [GLOBAL(t4)]     ; vp8_signed_char_clamp(Filter2 + 4)
+        pxor        mm0, mm0              ; 0
+        pxor        mm7, mm7              ; 0
+
+        punpcklbw   mm0, mm2              ; e0f0g0h0
+        psraw       mm0, 11               ; sign extended shift right by 3
+        punpckhbw   mm7, mm2              ; a0b0c0d0
+        psraw       mm7, 11               ; sign extended shift right by 3
+        packsswb    mm0, mm7              ; Filter1 >>=3;
+
+        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+        psubsb      mm3, mm0              ; qs0 = qs0 - Filter1
+        paddsb      mm6, mm5              ; ps0 = ps0 + Filter2
+
+        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+        ; vp8_filter &= ~hev;
+        ; Filter2 = vp8_filter;
+        pandn       mm4, mm1              ; vp8_filter&=~hev
+
+
+        ; mm3=qs0, mm4=filter2, mm6=ps0
+
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+        ; s = vp8_signed_char_clamp(qs0 - u);
+        ; *oq0 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps0 + u);
+        ; *op0 = s^0x80;
+        pxor        mm0, mm0
+
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s27)]
+        pmulhw      mm2, [GLOBAL(s27)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        psubsb      mm3, mm1
+        paddsb      mm6, mm1
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+        movq        [rdx+24], mm6
+        movq        [rdx+32], mm3
+
+        ; roughly 2/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+        ; s = vp8_signed_char_clamp(qs1 - u);
+        ; *oq1 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps1 + u);
+        ; *op1 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s18)]
+        pmulhw      mm2, [GLOBAL(s18)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm3, [rdx + 40]
+        movq        mm6, [rdx + 16]       ; p1
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1              ; ps1 + u
+        psubsb      mm3, mm1              ; qs1 - u
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdx + 40], mm3
+        movq        [rdx + 16], mm6
+
+        ; roughly 1/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+        ; s = vp8_signed_char_clamp(qs2 - u);
+        ; *oq2 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps2 + u);
+        ; *op2 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s9)]
+        pmulhw      mm2, [GLOBAL(s9)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm6, [rdx+8]          ; p2
+        movq        mm3, [rdx+48]         ; q2
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1              ; ps2 + u
+        psubsb      mm3, mm1              ; qs2 - u
+
+        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
+        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 16 06
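+        ; filtered p2/q2 remain in mm6/mm3; p1, p0, q0, q1 were written
+        ; back to the scratch rows at rdx+16 .. rdx+40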
+
+        ; transpose and write back
+        movq        mm0,    [rdx]               ; mm0 = 70 60 50 40 30 20 10 00
+        movq        mm1,    mm0                 ; mm1 = 70 60 50 40 30 20 10 00
+
+        punpcklbw   mm0,    mm6                 ; mm0 = 31 30 21 20 11 10 01 00
+        punpckhbw   mm1,    mm6                 ; mm1 = 71 70 61 60 51 50 41 40
+
+        movq        mm2,    [rdx+16]            ; mm2 = 72 62 52 42 32 22 12 02
+        movq        mm6,    mm2                 ; mm6 = 72 62 52 42 32 22 12 02
+
+        punpcklbw   mm2,    [rdx+24]            ; mm2 = 33 32 23 22 13 12 03 02
+        punpckhbw   mm6,    [rdx+24]            ; mm6 = 73 72 63 62 53 52 43 42
+
+        movq        mm5,    mm0                 ; mm5 = 31 30 21 20 11 10 01 00
+        punpcklwd   mm0,    mm2                 ; mm0 = 13 12 11 10 03 02 01 00
+
+        punpckhwd   mm5,    mm2                 ; mm5 = 33 32 31 30 23 22 21 20
+        movq        mm4,    mm1                 ; mm4 = 71 70 61 60 51 50 41 40
+
+        punpcklwd   mm1,    mm6                 ; mm1 = 53 52 51 50 43 42 41 40
+        punpckhwd   mm4,    mm6                 ; mm4 = 73 72 71 70 63 62 61 60
+
+        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
+        punpcklbw   mm2,    [rdx+40]            ; mm2 = 35 34 25 24 15 14 05 04
+
+        movq        mm6,    mm3                 ; mm6 = 76 66 56 46 36 26 16 06
+        punpcklbw   mm6,    [rdx+56]            ; mm6 = 37 36 27 26 17 16 07 06
+
+        movq        mm7,    mm2                 ; mm7 = 35 34 25 24 15 14 05 04
+        punpcklwd   mm2,    mm6                 ; mm2 = 17 16 15 14 07 06 05 04
+
+        punpckhwd   mm7,    mm6                 ; mm7 = 37 36 35 34 27 26 25 24
+        movq        mm6,    mm0                 ; mm6 = 13 12 11 10 03 02 01 00
+
+        punpckldq   mm0,    mm2                 ; mm0 = 07 06 05 04 03 02 01 00
+        punpckhdq   mm6,    mm2                 ; mm6 = 17 16 15 14 13 12 11 10
+
+        movq        [rsi+rax*4], mm0            ; write out
+        movq        [rdi+rax*4], mm6            ; write out
+
+        movq        mm0,    mm5                 ; mm0 = 33 32 31 30 23 22 21 20
+        punpckldq   mm0,    mm7                 ; mm0 = 27 26 25 24 23 22 21 20
+
+        punpckhdq   mm5,    mm7                 ; mm5 = 37 36 35 34 33 32 31 30
+        movq        [rsi+rax*2], mm0            ; write out
+
+        movq        [rdi+rax*2], mm5            ; write out
+        movq        mm2,    [rdx+32]            ; mm2 = 74 64 54 44 34 24 14 04
+
+        punpckhbw   mm2,    [rdx+40]            ; mm2 = 75 74 65 64 55 54 45 44
+        punpckhbw   mm3,    [rdx+56]            ; mm3 = 77 76 67 66 57 56 47 46
+
+        movq        mm5,    mm2                 ; mm5 = 75 74 65 64 55 54 45 44
+        punpcklwd   mm2,    mm3                 ; mm2 = 57 56 55 54 47 46 45 44
+
+        punpckhwd   mm5,    mm3                 ; mm5 = 77 76 75 74 67 66 65 64
+        movq        mm0,    mm1                 ; mm0 = 53 52 51 50 43 42 41 40
+
+        movq        mm3,    mm4                 ; mm3 = 73 72 71 70 63 62 61 60
+        punpckldq   mm0,    mm2                 ; mm0 = 47 46 45 44 43 42 41 40
+
+        punpckhdq   mm1,    mm2                 ; mm1 = 57 56 55 54 53 52 51 50
+        movq        [rsi],  mm0                 ; write out
+
+        movq        [rdi],  mm1                 ; write out
+        neg         rax
+
+        punpckldq   mm3,    mm5                 ; mm3 = 67 66 65 64 63 62 61 60
+        punpckhdq   mm4,    mm5                 ; mm4 = 77 76 75 74 73 72 71 70
+
+        movq        [rsi+rax*2], mm3
+        movq        [rdi+rax*2], mm4
+
+        lea         rsi,        [rsi+rax*8]
+        dec         rcx
+
+        jnz         .next8_mbv
+
+    add rsp, 96
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
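+; For reference, a scalar sketch of the three wide-filter taps applied
+; above. clamp8() stands in for vp8_signed_char_clamp(); u27/u18/u9 are
+; illustrative names, not symbols from this file:
+;
+;    signed char f = vp8_filter & ~hev;              /* non-hev lanes only */
+;    signed char u27 = clamp8((63 + f * 27) >> 7);   /* ~3/7 of the step */
+;    *oq0 = clamp8(qs0 - u27) ^ 0x80;
+;    *op0 = clamp8(ps0 + u27) ^ 0x80;
+;    signed char u18 = clamp8((63 + f * 18) >> 7);   /* ~2/7 of the step */
+;    *oq1 = clamp8(qs1 - u18) ^ 0x80;
+;    *op1 = clamp8(ps1 + u18) ^ 0x80;
+;    signed char u9  = clamp8((63 + f * 9) >> 7);    /* ~1/7 of the step */
+;    *oq2 = clamp8(qs2 - u9) ^ 0x80;
+;    *op2 = clamp8(ps2 + u9) ^ 0x80;
+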
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+        mov         rcx, 2                ; count
+.nexts8_h:
+        mov         rdx, arg(2) ;blimit
+        movq        mm3, [rdx]            ;
+
+        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movq        mm1, [rsi+2*rax]      ; p1
+        movq        mm0, [rdi]            ; q1
+        movq        mm2, mm1
+        movq        mm7, mm0
+        movq        mm4, mm0
+        psubusb     mm0, mm1              ; q1-=p1
+        psubusb     mm1, mm4              ; p1-=q1
+        por         mm1, mm0              ; abs(p1-q1)
+        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm1, 1                ; abs(p1-q1)/2
+
+        movq        mm5, [rsi+rax]        ; p0
+        movq        mm4, [rsi]            ; q0
+        movq        mm0, mm4              ; q0
+        movq        mm6, mm5              ; p0
+        psubusb     mm5, mm4              ; p0-=q0
+        psubusb     mm4, mm6              ; q0-=p0
+        por         mm5, mm4              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm5, mm3              ; abs(p0 - q0)*2 + abs(p1-q1)/2 - blimit, saturating
+        pxor        mm3, mm3
+        pcmpeqb     mm5, mm3              ; mask: 0xFF where sum <= blimit (filter this pixel)
+
+        ; start work on filters
+        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+        psubsb      mm2, mm7              ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm0              ; q0
+        psubsb      mm0, mm6              ; q0 - p0
+        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
+        pand        mm5, mm2              ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; sign-extended shift right by 3
+        psrlw       mm0, 8                ; move result back to the low bytes
+        movq        mm1, mm5              ; get a copy of filters
+        psraw       mm1, 11               ; arithmetic shift right 11
+        psllw       mm1, 8                ; shift left 8 to put it back
+
+        por         mm0, mm1              ; put the two together to get result
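+        ; MMX has no per-byte arithmetic shift, so the >> 3 is emulated in
+        ; word lanes: low bytes via psllw 8 / psraw 3 / psrlw 8, high bytes
+        ; via psraw 11 / psllw 8, then recombined with por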
+
+        psubsb      mm3, mm0              ; q0 -= Filter1
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi], mm3            ; write back
+
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; sign-extended shift right by 3
+        psrlw       mm0, 8                ; move result back to the low bytes
+        psraw       mm5, 11               ; arithmetic shift right 11
+        psllw       mm5, 8                ; shift left 8 to put it back
+        por         mm0, mm5              ; put the two together to get result
+
+
+        paddsb      mm6, mm0              ; p0 += Filter2
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+        movq        [rsi+rax], mm6        ; write back
+
+        add         rsi,8
+        neg         rax
+        dec         rcx
+        jnz         .nexts8_h
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
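+; For reference, a scalar sketch of the simple filter applied above.
+; clamp8() stands in for vp8_signed_char_clamp(); names illustrative:
+;
+;    mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0;
+;    vp8_filter = clamp8(ps1 - qs1);
+;    vp8_filter = clamp8(vp8_filter + 3 * (qs0 - ps0)) & mask;
+;    Filter1 = clamp8(vp8_filter + 4) >> 3;
+;    *oq0 = clamp8(qs0 - Filter1) ^ 0x80;
+;    Filter2 = clamp8(vp8_filter + 3) >> 3;
+;    *op0 = clamp8(ps0 + Filter2) ^ 0x80;
+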
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub          rsp, 32      ; reserve 32 bytes
+    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
+    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0) ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+        lea         rsi, [rsi + rax*4 - 2]  ; 4 rows down, 2 cols left of the edge
+        mov         rcx, 2                                      ; count
+.nexts8_v:
+
+        lea         rdi,        [rsi + rax];
+        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
+
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
+        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
+
+        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
+        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
+
+        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
+        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
+
+        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
+        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
+
+        neg         rax
+
+        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
+        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
+
+        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
+        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
+
+        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
+        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
+
+        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
+        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
+
+        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
+        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
+
+        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
+        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
+
+        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
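+
+        ; 8x4 tile now transposed: each register holds one column of the
+        ; edge (p1, p0, q0, q1) across all eight rows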
+
+
+        ; calculate mask
+        movq        mm6,        mm0                             ; p1
+        movq        mm7,        mm3                             ; q1
+        psubusb     mm7,        mm6                             ; q1-=p1
+        psubusb     mm6,        mm3                             ; p1-=q1
+        por         mm6,        mm7                             ; abs(p1-q1)
+        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
+        psrlw       mm6,        1                               ; abs(p1-q1)/2
+
+        movq        mm5,        mm1                             ; p0
+        movq        mm4,        mm2                             ; q0
+
+        psubusb     mm5,        mm2                             ; p0-=q0
+        psubusb     mm4,        mm1                             ; q0-=p0
+
+        por         mm5,        mm4                             ; abs(p0 - q0)
+        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
+        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx,        arg(2) ;blimit
+        movq        mm7,        [rdx]
+
+        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
+        pxor        mm7,        mm7
+        pcmpeqb     mm5,        mm7                             ; mm5 = mask
+
+        ; start work on filters
+        movq        t0,         mm0
+        movq        t1,         mm3
+
+        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
+        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
+
+        psubsb      mm0,        mm3                             ; p1 - q1
+        movq        mm6,        mm1                             ; p0
+
+        movq        mm7,        mm2                             ; q0
+        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
+
+        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
+        movq        mm3,        mm7                             ; copy of signed q0
+
+        psubsb      mm7,        mm6                             ; q0 - p0
+        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        mm5,        mm0                             ; mask filter values we don't care about
+
+        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0,        mm5                             ; get a copy of filters
+        psllw       mm0,        8                               ; shift left 8
+        psraw       mm0,        3                               ; sign-extended shift right by 3
+        psrlw       mm0,        8                               ; move result back to the low bytes
+
+        movq        mm7,        mm5                             ; get a copy of filters
+        psraw       mm7,        11                              ; arithmetic shift right 11
+        psllw       mm7,        8                               ; shift left 8 to put it back
+
+        por         mm0,        mm7                             ; put the two together to get result
+
+        psubsb      mm3,        mm0                             ; q0 -= Filter1
+        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
+
+        movq        mm0, mm5                                    ; get a copy of filters
+        psllw       mm0, 8                                      ; shift left 8
+        psraw       mm0, 3                                      ; sign-extended shift right by 3
+        psrlw       mm0, 8                                      ; move result back to the low bytes
+
+        psraw       mm5, 11                                     ; arithmetic shift right 11
+        psllw       mm5, 8                                      ; shift left 8 to put it back
+        por         mm0, mm5                                    ; put the two together to get result
+
+        paddsb      mm6, mm0                                    ; p0 += Filter2
+        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
+
+
+        movq        mm0,        t0
+        movq        mm4,        t1
+
+        ; mm0 = 70 60 50 40 30 20 10 00
+        ; mm6 = 71 61 51 41 31 21 11 01
+        ; mm3 = 72 62 52 42 32 22 12 02
+        ; mm4 = 73 63 53 43 33 23 13 03
+        ; transpose back to write out
+
+        movq        mm1,        mm0                         ;
+        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
+
+        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
+        movq        mm2,        mm3                         ;
+
+        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
+        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
+
+        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
+        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
+
+        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
+        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
+
+        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
+        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
+
+        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
+        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
+
+        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
+        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
+
+        psrlq       mm6,        32                          ; 33 32 31 30
+        movd        [rsi],      mm1                         ; write 43 42 41 40
+
+        movd        [rsi + rax], mm6                        ; write 33 32 31 30
+        neg         rax
+
+        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
+        psrlq       mm1,        32                          ; 53 52 51 50
+
+        movd        [rdi],      mm1                         ; write out 53 52 51 50
+        psrlq       mm5,        32                          ; 73 72 71 70
+
+        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
+
+        lea         rsi,        [rsi+rax*8]                 ; next 8
+
+        dec         rcx
+        jnz         .nexts8_v
+
+    add rsp, 32
+    pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
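+; The vertical variant reuses the same per-pixel filter math as the
+; horizontal one; the extra work is pure data movement: eight 4-pixel
+; rows straddling the edge are gathered with movd, rotated into
+; p1/p0/q0/q1 registers through the punpcklbw/punpcklwd/punpckldq
+; ladder (conceptually out[x][y] = in[y][x]), filtered, then rotated
+; back and scattered with movd/psrlq.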
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+;                  int y_stride,
+;                  loop_filter_info *lfi)
+;{
+;    /* historical reference only: these calls predate the current
+;       three-argument vp8_loop_filter_simple_vertical_edge_mmx above */
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe
+align 16
+t80:
+    times 8 db 0x80
+align 16
+t1s:
+    times 8 db 0x01
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
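+; note: s27/s18/s9 hold the tap constants pre-shifted into the high byte
+; (27<<8, 18<<8, 9<<8). pmulhw returns (a * b) >> 16, and the filter
+; values sit in the high bytes of their word lanes (f << 8), so
+; ((f << 8) * (k << 8)) >> 16 == f * k exactly.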
+align 16
+s27:
+    times 4 dw 0x1b00
+align 16
+s18:
+    times 4 dw 0x1200
+align 16
+s9:
+    times 4 dw 0x0900
+align 16
+s63:
+    times 4 dw 0x003f
