Browse Source

Merge pull request #101348 from akien-mga/libwebp-1.5.0

libwebp: Update to 1.5.0
Thaddeus Crews 3 months ago
parent
commit
ea5548f7d5
89 changed files with 3136 additions and 2148 deletions
  1. 1 1
      thirdparty/README.md
  2. 3 0
      thirdparty/libwebp/AUTHORS
  3. 5 4
      thirdparty/libwebp/sharpyuv/sharpyuv.c
  4. 14 5
      thirdparty/libwebp/sharpyuv/sharpyuv.h
  5. 13 9
      thirdparty/libwebp/sharpyuv/sharpyuv_csp.c
  6. 5 0
      thirdparty/libwebp/sharpyuv/sharpyuv_csp.h
  7. 2 1
      thirdparty/libwebp/src/dec/tree_dec.c
  8. 1 1
      thirdparty/libwebp/src/dec/vp8i_dec.h
  9. 8 5
      thirdparty/libwebp/src/dec/vp8l_dec.c
  10. 0 4
      thirdparty/libwebp/src/dec/vp8li_dec.h
  11. 1 1
      thirdparty/libwebp/src/demux/demux.c
  12. 2 2
      thirdparty/libwebp/src/dsp/cost.c
  13. 2 2
      thirdparty/libwebp/src/dsp/cost_mips32.c
  14. 2 2
      thirdparty/libwebp/src/dsp/cost_neon.c
  15. 2 2
      thirdparty/libwebp/src/dsp/cost_sse2.c
  16. 27 20
      thirdparty/libwebp/src/dsp/dec.c
  17. 12 10
      thirdparty/libwebp/src/dsp/dec_mips32.c
  18. 16 12
      thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
  19. 18 11
      thirdparty/libwebp/src/dsp/dec_msa.c
  20. 45 35
      thirdparty/libwebp/src/dsp/dec_neon.c
  21. 12 8
      thirdparty/libwebp/src/dsp/dec_sse2.c
  22. 94 59
      thirdparty/libwebp/src/dsp/dsp.h
  23. 90 51
      thirdparty/libwebp/src/dsp/enc.c
  24. 24 16
      thirdparty/libwebp/src/dsp/enc_mips32.c
  25. 58 43
      thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
  26. 79 46
      thirdparty/libwebp/src/dsp/enc_msa.c
  27. 317 30
      thirdparty/libwebp/src/dsp/enc_neon.c
  28. 125 77
      thirdparty/libwebp/src/dsp/enc_sse2.c
  29. 12 9
      thirdparty/libwebp/src/dsp/enc_sse41.c
  30. 56 88
      thirdparty/libwebp/src/dsp/filters.c
  31. 53 85
      thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
  32. 16 11
      thirdparty/libwebp/src/dsp/filters_msa.c
  33. 49 74
      thirdparty/libwebp/src/dsp/filters_neon.c
  34. 48 69
      thirdparty/libwebp/src/dsp/filters_sse2.c
  35. 18 18
      thirdparty/libwebp/src/dsp/lossless.c
  36. 57 46
      thirdparty/libwebp/src/dsp/lossless.h
  37. 36 15
      thirdparty/libwebp/src/dsp/lossless_common.h
  38. 207 265
      thirdparty/libwebp/src/dsp/lossless_enc.c
  39. 38 33
      thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
  40. 10 15
      thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
  41. 2 2
      thirdparty/libwebp/src/dsp/lossless_enc_msa.c
  42. 3 2
      thirdparty/libwebp/src/dsp/lossless_enc_neon.c
  43. 96 52
      thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
  44. 10 7
      thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
  45. 23 23
      thirdparty/libwebp/src/dsp/lossless_neon.c
  46. 21 17
      thirdparty/libwebp/src/dsp/lossless_sse2.c
  47. 4 4
      thirdparty/libwebp/src/dsp/lossless_sse41.c
  48. 6 5
      thirdparty/libwebp/src/dsp/rescaler.c
  49. 4 4
      thirdparty/libwebp/src/dsp/rescaler_mips32.c
  50. 14 13
      thirdparty/libwebp/src/dsp/rescaler_msa.c
  51. 2 2
      thirdparty/libwebp/src/dsp/rescaler_neon.c
  52. 8 10
      thirdparty/libwebp/src/dsp/rescaler_sse2.c
  53. 24 12
      thirdparty/libwebp/src/dsp/upsampling.c
  54. 12 6
      thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
  55. 36 19
      thirdparty/libwebp/src/dsp/upsampling_msa.c
  56. 58 53
      thirdparty/libwebp/src/dsp/upsampling_neon.c
  57. 19 10
      thirdparty/libwebp/src/dsp/upsampling_sse2.c
  58. 20 11
      thirdparty/libwebp/src/dsp/upsampling_sse41.c
  59. 30 18
      thirdparty/libwebp/src/dsp/yuv.c
  60. 41 23
      thirdparty/libwebp/src/dsp/yuv.h
  61. 4 3
      thirdparty/libwebp/src/dsp/yuv_mips32.c
  62. 4 3
      thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
  63. 12 6
      thirdparty/libwebp/src/dsp/yuv_neon.c
  64. 76 54
      thirdparty/libwebp/src/dsp/yuv_sse2.c
  65. 38 27
      thirdparty/libwebp/src/dsp/yuv_sse41.c
  66. 1 0
      thirdparty/libwebp/src/enc/alpha_enc.c
  67. 51 47
      thirdparty/libwebp/src/enc/backward_references_cost_enc.c
  68. 5 9
      thirdparty/libwebp/src/enc/backward_references_enc.c
  69. 0 4
      thirdparty/libwebp/src/enc/config_enc.c
  70. 1 1
      thirdparty/libwebp/src/enc/cost_enc.c
  71. 0 1
      thirdparty/libwebp/src/enc/cost_enc.h
  72. 209 178
      thirdparty/libwebp/src/enc/histogram_enc.c
  73. 7 10
      thirdparty/libwebp/src/enc/histogram_enc.h
  74. 12 1
      thirdparty/libwebp/src/enc/iterator_enc.c
  75. 475 156
      thirdparty/libwebp/src/enc/predictor_enc.c
  76. 7 5
      thirdparty/libwebp/src/enc/quant_enc.c
  77. 6 7
      thirdparty/libwebp/src/enc/vp8i_enc.h
  78. 162 134
      thirdparty/libwebp/src/enc/vp8l_enc.c
  79. 14 8
      thirdparty/libwebp/src/enc/vp8li_enc.h
  80. 2 1
      thirdparty/libwebp/src/mux/anim_encode.c
  81. 1 1
      thirdparty/libwebp/src/mux/muxi.h
  82. 6 4
      thirdparty/libwebp/src/mux/muxread.c
  83. 2 1
      thirdparty/libwebp/src/utils/bit_reader_utils.c
  84. 2 0
      thirdparty/libwebp/src/utils/bit_reader_utils.h
  85. 13 2
      thirdparty/libwebp/src/utils/palette.c
  86. 2 0
      thirdparty/libwebp/src/utils/palette.h
  87. 5 4
      thirdparty/libwebp/src/webp/encode.h
  88. 6 1
      thirdparty/libwebp/src/webp/format_constants.h
  89. 2 2
      thirdparty/libwebp/src/webp/types.h

+ 1 - 1
thirdparty/README.md

@@ -581,7 +581,7 @@ Files extracted from upstream source:
 ## libwebp
 ## libwebp
 
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.4.0 (845d5476a866141ba35ac133f856fa62f0b7445f, 2024)
+- Version: 1.5.0 (a4d7a715337ded4451fec90ff8ce79728e04126c, 2024)
 - License: BSD-3-Clause
 - License: BSD-3-Clause
 
 
 Files extracted from upstream source:
 Files extracted from upstream source:

+ 3 - 0
thirdparty/libwebp/AUTHORS

@@ -11,11 +11,13 @@ Contributors:
 - Christopher Degawa (ccom at randomderp dot com)
 - Christopher Degawa (ccom at randomderp dot com)
 - Clement Courbet (courbet at google dot com)
 - Clement Courbet (courbet at google dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
+- Frank (1433351828 at qq dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
 - Hui Su (huisu at google dot com)
 - H. Vetinari (h dot vetinari at gmx dot com)
 - H. Vetinari (h dot vetinari at gmx dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
+- Istvan Stefan (Istvan dot Stefan at arm dot com)
 - James Zern (jzern at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
 - Jehan (jehan at girinstud dot io)
@@ -62,6 +64,7 @@ Contributors:
 - Vincent Rabaud (vrabaud at google dot com)
 - Vincent Rabaud (vrabaud at google dot com)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
 - Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
 - Wan-Teh Chang (wtc at google dot com)
 - Wan-Teh Chang (wtc at google dot com)
+- wrv (wrv at utexas dot edu)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yang Zhang (yang dot zhang at arm dot com)
 - Yannis Guyon (yguyon at google dot com)
 - Yannis Guyon (yguyon at google dot com)
 - Zhi An Ng (zhin at chromium dot org)
 - Zhi An Ng (zhin at chromium dot org)

+ 5 - 4
thirdparty/libwebp/sharpyuv/sharpyuv.c

@@ -565,10 +565,11 @@ int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
   scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
   scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
   scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
   scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
 
 
-  return DoSharpArgbToYuv(r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride,
-                          rgb_bit_depth, y_ptr, y_stride, u_ptr, u_stride,
-                          v_ptr, v_stride, yuv_bit_depth, width, height,
-                          &scaled_matrix, transfer_type);
+  return DoSharpArgbToYuv(
+      (const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
+      rgb_step, rgb_stride, rgb_bit_depth, (uint8_t*)y_ptr, y_stride,
+      (uint8_t*)u_ptr, u_stride, (uint8_t*)v_ptr, v_stride, yuv_bit_depth,
+      width, height, &scaled_matrix, transfer_type);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------

+ 14 - 5
thirdparty/libwebp/sharpyuv/sharpyuv.h

@@ -52,7 +52,7 @@ extern "C" {
 // SharpYUV API version following the convention from semver.org
 // SharpYUV API version following the convention from semver.org
 #define SHARPYUV_VERSION_MAJOR 0
 #define SHARPYUV_VERSION_MAJOR 0
 #define SHARPYUV_VERSION_MINOR 4
 #define SHARPYUV_VERSION_MINOR 4
-#define SHARPYUV_VERSION_PATCH 0
+#define SHARPYUV_VERSION_PATCH 1
 // Version as a uint32_t. The major number is the high 8 bits.
 // Version as a uint32_t. The major number is the high 8 bits.
 // The minor number is the middle 8 bits. The patch number is the low 16 bits.
 // The minor number is the middle 8 bits. The patch number is the low 16 bits.
 #define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
 #define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
@@ -66,10 +66,17 @@ extern "C" {
 SHARPYUV_EXTERN int SharpYuvGetVersion(void);
 SHARPYUV_EXTERN int SharpYuvGetVersion(void);
 
 
 // RGB to YUV conversion matrix, in 16 bit fixed point.
 // RGB to YUV conversion matrix, in 16 bit fixed point.
-// y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
-// u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
-// v = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
-// Then y, u and v values are divided by 1<<16 and rounded.
+// y_ = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
+// u_ = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
+// v_ = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
+// Then the values are divided by 1<<16 and rounded.
+// y = (y_ + (1 << 15)) >> 16
+// u = (u_ + (1 << 15)) >> 16
+// v = (v_ + (1 << 15)) >> 16
+//
+// Typically, the offset values rgb_to_y[3], rgb_to_u[3] and rgb_to_v[3] depend
+// on the input's bit depth, e.g., rgb_to_u[3] = 1 << (rgb_bit_depth - 1 + 16).
+// See also sharpyuv_csp.h to get a predefined matrix or generate a matrix.
 typedef struct {
 typedef struct {
   int rgb_to_y[4];
   int rgb_to_y[4];
   int rgb_to_u[4];
   int rgb_to_u[4];
@@ -127,6 +134,8 @@ typedef enum SharpYuvTransferFunctionType {
 //     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
 //     adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
 //     should be multiples of 2.
 //     should be multiples of 2.
 // width, height: width and height of the image in pixels
 // width, height: width and height of the image in pixels
+// yuv_matrix: RGB to YUV conversion matrix. The matrix values typically
+//     depend on the input's rgb_bit_depth.
 // This function calls SharpYuvConvertWithOptions with a default transfer
 // This function calls SharpYuvConvertWithOptions with a default transfer
 // function of kSharpYuvTransferFunctionSrgb.
 // function of kSharpYuvTransferFunctionSrgb.
 SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
 SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,

+ 13 - 9
thirdparty/libwebp/sharpyuv/sharpyuv_csp.c

@@ -22,16 +22,16 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
   const float kr = yuv_color_space->kr;
   const float kr = yuv_color_space->kr;
   const float kb = yuv_color_space->kb;
   const float kb = yuv_color_space->kb;
   const float kg = 1.0f - kr - kb;
   const float kg = 1.0f - kr - kb;
-  const float cr = 0.5f / (1.0f - kb);
-  const float cb = 0.5f / (1.0f - kr);
+  const float cb = 0.5f / (1.0f - kb);
+  const float cr = 0.5f / (1.0f - kr);
 
 
   const int shift = yuv_color_space->bit_depth - 8;
   const int shift = yuv_color_space->bit_depth - 8;
 
 
   const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
   const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
   float scale_y = 1.0f;
   float scale_y = 1.0f;
   float add_y = 0.0f;
   float add_y = 0.0f;
-  float scale_u = cr;
-  float scale_v = cb;
+  float scale_u = cb;
+  float scale_v = cr;
   float add_uv = (float)(128 << shift);
   float add_uv = (float)(128 << shift);
   assert(yuv_color_space->bit_depth >= 8);
   assert(yuv_color_space->bit_depth >= 8);
 
 
@@ -59,31 +59,35 @@ void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
 }
 }
 
 
 // Matrices are in YUV_FIX fixed point precision.
 // Matrices are in YUV_FIX fixed point precision.
-// WebP's matrix, similar but not identical to kRec601LimitedMatrix.
+// WebP's matrix, similar but not identical to kRec601LimitedMatrix
+// Derived using the following formulas:
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
 static const SharpYuvConversionMatrix kWebpMatrix = {
 static const SharpYuvConversionMatrix kWebpMatrix = {
   {16839, 33059, 6420, 16 << 16},
   {16839, 33059, 6420, 16 << 16},
   {-9719, -19081, 28800, 128 << 16},
   {-9719, -19081, 28800, 128 << 16},
   {28800, -24116, -4684, 128 << 16},
   {28800, -24116, -4684, 128 << 16},
 };
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
 static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
   {16829, 33039, 6416, 16 << 16},
   {16829, 33039, 6416, 16 << 16},
   {-9714, -19071, 28784, 128 << 16},
   {-9714, -19071, 28784, 128 << 16},
   {28784, -24103, -4681, 128 << 16},
   {28784, -24103, -4681, 128 << 16},
 };
 };
-// Kr=0.2990f Kb=0.1140f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec601FullMatrix = {
 static const SharpYuvConversionMatrix kRec601FullMatrix = {
   {19595, 38470, 7471, 0},
   {19595, 38470, 7471, 0},
   {-11058, -21710, 32768, 128 << 16},
   {-11058, -21710, 32768, 128 << 16},
   {32768, -27439, -5329, 128 << 16},
   {32768, -27439, -5329, 128 << 16},
 };
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeLimited
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
 static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
 static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
   {11966, 40254, 4064, 16 << 16},
   {11966, 40254, 4064, 16 << 16},
   {-6596, -22189, 28784, 128 << 16},
   {-6596, -22189, 28784, 128 << 16},
   {28784, -26145, -2639, 128 << 16},
   {28784, -26145, -2639, 128 << 16},
 };
 };
-// Kr=0.2126f Kb=0.0722f bits=8 range=kSharpYuvRangeFull
+// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
 static const SharpYuvConversionMatrix kRec709FullMatrix = {
 static const SharpYuvConversionMatrix kRec709FullMatrix = {
   {13933, 46871, 4732, 0},
   {13933, 46871, 4732, 0},
   {-7509, -25259, 32768, 128 << 16},
   {-7509, -25259, 32768, 128 << 16},

+ 5 - 0
thirdparty/libwebp/sharpyuv/sharpyuv_csp.h

@@ -41,10 +41,15 @@ SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
 
 
 // Enums for precomputed conversion matrices.
 // Enums for precomputed conversion matrices.
 typedef enum {
 typedef enum {
+  // WebP's matrix, similar but not identical to kSharpYuvMatrixRec601Limited
   kSharpYuvMatrixWebp = 0,
   kSharpYuvMatrixWebp = 0,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
   kSharpYuvMatrixRec601Limited,
   kSharpYuvMatrixRec601Limited,
+  // Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
   kSharpYuvMatrixRec601Full,
   kSharpYuvMatrixRec601Full,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
   kSharpYuvMatrixRec709Limited,
   kSharpYuvMatrixRec709Limited,
+  // Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
   kSharpYuvMatrixRec709Full,
   kSharpYuvMatrixRec709Full,
   kSharpYuvMatrixNum
   kSharpYuvMatrixNum
 } SharpYuvMatrixType;
 } SharpYuvMatrixType;

+ 2 - 1
thirdparty/libwebp/src/dec/tree_dec.c

@@ -16,7 +16,8 @@
 #include "src/utils/bit_reader_inl_utils.h"
 #include "src/utils/bit_reader_inl_utils.h"
 
 
 #if !defined(USE_GENERIC_TREE)
 #if !defined(USE_GENERIC_TREE)
-#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64
+#if !defined(__arm__) && !defined(_M_ARM) && !WEBP_AARCH64 && \
+    !defined(__wasm__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
 #define USE_GENERIC_TREE 1   // ALTERNATE_CODE
 #define USE_GENERIC_TREE 1   // ALTERNATE_CODE
 #else
 #else

+ 1 - 1
thirdparty/libwebp/src/dec/vp8i_dec.h

@@ -32,7 +32,7 @@ extern "C" {
 
 
 // version numbers
 // version numbers
 #define DEC_MAJ_VERSION 1
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 4
+#define DEC_MIN_VERSION 5
 #define DEC_REV_VERSION 0
 #define DEC_REV_VERSION 0
 
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).

+ 8 - 5
thirdparty/libwebp/src/dec/vp8l_dec.c

@@ -20,10 +20,9 @@
 #include "src/dsp/dsp.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/lossless_common.h"
-#include "src/dsp/yuv.h"
-#include "src/utils/endian_inl_utils.h"
 #include "src/utils/huffman_utils.h"
 #include "src/utils/huffman_utils.h"
 #include "src/utils/utils.h"
 #include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 
 #define NUM_ARGB_CACHE_ROWS          16
 #define NUM_ARGB_CACHE_ROWS          16
 
 
@@ -381,7 +380,8 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 
 
   if (allow_recursion && VP8LReadBits(br, 1)) {
   if (allow_recursion && VP8LReadBits(br, 1)) {
     // use meta Huffman codes.
     // use meta Huffman codes.
-    const int huffman_precision = VP8LReadBits(br, 3) + 2;
+    const int huffman_precision =
+        MIN_HUFFMAN_BITS + VP8LReadBits(br, NUM_HUFFMAN_BITS);
     const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
     const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
     const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
     const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
     const int huffman_pixs = huffman_xsize * huffman_ysize;
     const int huffman_pixs = huffman_xsize * huffman_ysize;
@@ -1351,7 +1351,8 @@ static int ReadTransform(int* const xsize, int const* ysize,
   switch (type) {
   switch (type) {
     case PREDICTOR_TRANSFORM:
     case PREDICTOR_TRANSFORM:
     case CROSS_COLOR_TRANSFORM:
     case CROSS_COLOR_TRANSFORM:
-      transform->bits_ = VP8LReadBits(br, 3) + 2;
+      transform->bits_ =
+          MIN_TRANSFORM_BITS + VP8LReadBits(br, NUM_TRANSFORM_BITS);
       ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
       ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
                                                transform->bits_),
                                                transform->bits_),
                              VP8LSubSampleSize(transform->ysize_,
                              VP8LSubSampleSize(transform->ysize_,
@@ -1416,7 +1417,9 @@ VP8LDecoder* VP8LNew(void) {
   return dec;
   return dec;
 }
 }
 
 
-void VP8LClear(VP8LDecoder* const dec) {
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+static void VP8LClear(VP8LDecoder* const dec) {
   int i;
   int i;
   if (dec == NULL) return;
   if (dec == NULL) return;
   ClearMetadata(&dec->hdr_);
   ClearMetadata(&dec->hdr_);

+ 0 - 4
thirdparty/libwebp/src/dec/vp8li_dec.h

@@ -121,10 +121,6 @@ WEBP_NODISCARD int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
 // this function. Returns false in case of error, with updated dec->status_.
 // this function. Returns false in case of error, with updated dec->status_.
 WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);
 WEBP_NODISCARD int VP8LDecodeImage(VP8LDecoder* const dec);
 
 
-// Resets the decoder in its initial state, reclaiming memory.
-// Preserves the dec->status_ value.
-void VP8LClear(VP8LDecoder* const dec);
-
 // Clears and deallocate a lossless decoder instance.
 // Clears and deallocate a lossless decoder instance.
 void VP8LDelete(VP8LDecoder* const dec);
 void VP8LDelete(VP8LDecoder* const dec);
 
 

+ 1 - 1
thirdparty/libwebp/src/demux/demux.c

@@ -24,7 +24,7 @@
 #include "src/webp/format_constants.h"
 #include "src/webp/format_constants.h"
 
 
 #define DMUX_MAJ_VERSION 1
 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 4
+#define DMUX_MIN_VERSION 5
 #define DMUX_REV_VERSION 0
 #define DMUX_REV_VERSION 0
 
 
 typedef struct {
 typedef struct {

+ 2 - 2
thirdparty/libwebp/src/dsp/cost.c

@@ -354,8 +354,8 @@ static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
   return cost;
   return cost;
 }
 }
 
 
-static void SetResidualCoeffs_C(const int16_t* const coeffs,
-                                VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* WEBP_RESTRICT const coeffs,
+                                VP8Residual* WEBP_RESTRICT const res) {
   int n;
   int n;
   res->last = -1;
   res->last = -1;
   assert(res->first == 0 || coeffs[0] == 0);
   assert(res->first == 0 || coeffs[0] == 0);

+ 2 - 2
thirdparty/libwebp/src/dsp/cost_mips32.c

@@ -96,8 +96,8 @@ static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
   return cost;
   return cost;
 }
 }
 
 
-static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
-                                     VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* WEBP_RESTRICT const coeffs,
+                                     VP8Residual* WEBP_RESTRICT const res) {
   const int16_t* p_coeffs = (int16_t*)coeffs;
   const int16_t* p_coeffs = (int16_t*)coeffs;
   int temp0, temp1, temp2, n, n1;
   int temp0, temp1, temp2, n, n1;
   assert(res->first == 0 || coeffs[0] == 0);
   assert(res->first == 0 || coeffs[0] == 0);

+ 2 - 2
thirdparty/libwebp/src/dsp/cost_neon.c

@@ -19,8 +19,8 @@
 static const uint8_t position[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
 static const uint8_t position[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
                                       9, 10, 11, 12, 13, 14, 15, 16 };
                                       9, 10, 11, 12, 13, 14, 15, 16 };
 
 
-static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffs_NEON(const int16_t* WEBP_RESTRICT const coeffs,
+                                   VP8Residual* WEBP_RESTRICT const res) {
   const int16x8_t minus_one = vdupq_n_s16(-1);
   const int16x8_t minus_one = vdupq_n_s16(-1);
   const int16x8_t coeffs_0 = vld1q_s16(coeffs);
   const int16x8_t coeffs_0 = vld1q_s16(coeffs);
   const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
   const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);

+ 2 - 2
thirdparty/libwebp/src/dsp/cost_sse2.c

@@ -22,8 +22,8 @@
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
-                                   VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* WEBP_RESTRICT const coeffs,
+                                   VP8Residual* WEBP_RESTRICT const res) {
   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
   // Use SSE2 to compare 16 values with a single instruction.
   // Use SSE2 to compare 16 values with a single instruction.

+ 27 - 20
thirdparty/libwebp/src/dsp/dec.c

@@ -38,7 +38,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 } while (0)
 } while (0)
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformOne_C(const int16_t* in, uint8_t* dst) {
+static void TransformOne_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
   int C[4 * 4], *tmp;
   int C[4 * 4], *tmp;
   int i;
   int i;
   tmp = C;
   tmp = C;
@@ -82,7 +83,8 @@ static void TransformOne_C(const int16_t* in, uint8_t* dst) {
 }
 }
 
 
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   const int a = in[0] + 4;
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -95,7 +97,8 @@ static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
 }
 }
 #undef STORE2
 #undef STORE2
 
 
-static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_C(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne_C(in, dst);
   TransformOne_C(in, dst);
   if (do_two) {
   if (do_two) {
     TransformOne_C(in + 16, dst + 4);
     TransformOne_C(in + 16, dst + 4);
@@ -103,13 +106,15 @@ static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
-static void TransformUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 }
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformDC_C(const int16_t* in, uint8_t* dst) {
+static void TransformDC_C(const int16_t* WEBP_RESTRICT in,
+                          uint8_t* WEBP_RESTRICT dst) {
   const int DC = in[0] + 4;
   const int DC = in[0] + 4;
   int i, j;
   int i, j;
   for (j = 0; j < 4; ++j) {
   for (j = 0; j < 4; ++j) {
@@ -120,7 +125,8 @@ static void TransformDC_C(const int16_t* in, uint8_t* dst) {
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
-static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst) {
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -133,7 +139,8 @@ static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
 // Paragraph 14.3
 // Paragraph 14.3
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void TransformWHT_C(const int16_t* in, int16_t* out) {
+static void TransformWHT_C(const int16_t* WEBP_RESTRICT in,
+                           int16_t* WEBP_RESTRICT out) {
   int tmp[16];
   int tmp[16];
   int i;
   int i;
   for (i = 0; i < 4; ++i) {
   for (i = 0; i < 4; ++i) {
@@ -161,7 +168,7 @@ static void TransformWHT_C(const int16_t* in, int16_t* out) {
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
-void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+VP8WHT VP8TransformWHT;
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Intra predictions
 // Intra predictions
@@ -661,32 +668,32 @@ static void HFilter16i_C(uint8_t* p, int stride,
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                       int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
-static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
-                        int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_C(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                        int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }
@@ -694,8 +701,8 @@ static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
-                               int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* WEBP_RESTRICT dither,
+                               uint8_t* WEBP_RESTRICT dst, int dst_stride) {
   int i, j;
   int i, j;
   for (j = 0; j < 8; ++j) {
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) {
     for (i = 0; i < 8; ++i) {
@@ -730,8 +737,8 @@ VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
 
 
-void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
-                            int dst_stride);
+void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
+                            uint8_t* WEBP_RESTRICT dst, int dst_stride);
 
 
 extern VP8CPUInfo VP8GetCPUInfo;
 extern VP8CPUInfo VP8GetCPUInfo;
 extern void VP8DspInitSSE2(void);
 extern void VP8DspInitSSE2(void);

+ 12 - 10
thirdparty/libwebp/src/dsp/dec_mips32.c

@@ -133,26 +133,26 @@ static void HFilter16(uint8_t* p, int stride,
 }
 }
 
 
 // 8-pixels wide variant, for chroma filtering
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }
 
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }
@@ -215,7 +215,8 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   }
   }
 }
 }
 
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14;
   int temp10, temp11, temp12, temp13, temp14;
@@ -532,7 +533,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   );
   );
 }
 }
 
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   TransformOne(in, dst);
   if (do_two) {
   if (do_two) {
     TransformOne(in + 16, dst + 4);
     TransformOne(in + 16, dst + 4);

+ 16 - 12
thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c

@@ -21,7 +21,8 @@
 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
 
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
 
 
   __asm__ volatile (
   __asm__ volatile (
@@ -45,7 +46,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   );
   );
 }
 }
 
 
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   const int a = in[0] + 4;
   int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -81,7 +83,8 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   );
   );
 }
 }
 
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
 
 
@@ -148,7 +151,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   );
   );
 }
 }
 
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   TransformOne(in, dst);
   if (do_two) {
   if (do_two) {
     TransformOne(in + 16, dst + 4);
     TransformOne(in + 16, dst + 4);
@@ -434,14 +438,14 @@ static void HFilter16(uint8_t* p, int stride,
 }
 }
 
 
 // 8-pixels wide variant, for chroma filtering
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                     int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }
@@ -465,14 +469,14 @@ static void HFilter16i(uint8_t* p, int stride,
   }
   }
 }
 }
 
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 }
 
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                      int stride, int thresh, int ithresh, int hev_thresh) {
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 }

+ 18 - 11
thirdparty/libwebp/src/dsp/dec_msa.c

@@ -38,7 +38,8 @@
   BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
   BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
 }
 }
 
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   v8i16 input0, input1;
   v8i16 input0, input1;
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 res0, res1, res2, res3;
   v4i32 res0, res1, res2, res3;
@@ -65,14 +66,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 }
 
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne(in, dst);
   TransformOne(in, dst);
   if (do_two) {
   if (do_two) {
     TransformOne(in + 16, dst + 4);
     TransformOne(in + 16, dst + 4);
   }
   }
 }
 }
 
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT(const int16_t* WEBP_RESTRICT in,
+                         int16_t* WEBP_RESTRICT out) {
   v8i16 input0, input1;
   v8i16 input0, input1;
   const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
   const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
   const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
   const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@@ -114,13 +117,15 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
   out[240] = __msa_copy_s_h(out1, 7);
   out[240] = __msa_copy_s_h(out1, 7);
 }
 }
 
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC(const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst) {
   const int DC = (in[0] + 4) >> 3;
   const int DC = (in[0] + 4) >> 3;
   const v8i16 tmp0 = __msa_fill_h(DC);
   const v8i16 tmp0 = __msa_fill_h(DC);
   ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
   ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS);
 }
 }
 
 
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3(const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst) {
   const int a = in[0] + 4;
   const int a = in[0] + 4;
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
@@ -475,8 +480,8 @@ static void HFilter16i(uint8_t* src_y, int stride,
 }
 }
 
 
 // 8-pixels wide variants, for chroma filtering
 // 8-pixels wide variants, for chroma filtering
-static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void VFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
   uint8_t* ptmp_src_u = src_u - 4 * stride;
   uint8_t* ptmp_src_u = src_u - 4 * stride;
   uint8_t* ptmp_src_v = src_v - 4 * stride;
   uint8_t* ptmp_src_v = src_v - 4 * stride;
   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
@@ -520,8 +525,8 @@ static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
   SD(q2_d, ptmp_src_v);
   SD(q2_d, ptmp_src_v);
 }
 }
 
 
-static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
-                     int b_limit_in, int limit_in, int thresh_in) {
+static void HFilter8(uint8_t* WEBP_RESTRICT src_u, uint8_t* WEBP_RESTRICT src_v,
+                     int stride, int b_limit_in, int limit_in, int thresh_in) {
   uint8_t* ptmp_src_u = src_u - 4;
   uint8_t* ptmp_src_u = src_u - 4;
   uint8_t* ptmp_src_v = src_v - 4;
   uint8_t* ptmp_src_v = src_v - 4;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@@ -556,7 +561,8 @@ static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
   ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
   ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
 }
 }
 
 
-static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void VFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                       int b_limit_in, int limit_in, int thresh_in) {
                       int b_limit_in, int limit_in, int thresh_in) {
   uint64_t p1_d, p0_d, q0_d, q1_d;
   uint64_t p1_d, p0_d, q0_d, q1_d;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
@@ -587,7 +593,8 @@ static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
   SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
   SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
 }
 }
 
 
-static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+static void HFilter8i(uint8_t* WEBP_RESTRICT src_u,
+                      uint8_t* WEBP_RESTRICT src_v, int stride,
                       int b_limit_in, int limit_in, int thresh_in) {
                       int b_limit_in, int limit_in, int thresh_in) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;

+ 45 - 35
thirdparty/libwebp/src/dsp/dec_neon.c

@@ -916,8 +916,8 @@ static void HFilter16i_NEON(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC
 #endif  // !WORK_AROUND_GCC
 
 
 // 8-pixels wide variant, for chroma filtering
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
   {
@@ -932,7 +932,8 @@ static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
   }
   }
 }
 }
-static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
                            int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4 * stride;
   u += 4 * stride;
@@ -949,8 +950,8 @@ static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 }
 }
 
 
 #if !defined(WORK_AROUND_GCC)
 #if !defined(WORK_AROUND_GCC)
-static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
   {
@@ -964,7 +965,8 @@ static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
   }
   }
 }
 }
 
 
-static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
                            int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4;
   u += 4;
@@ -1041,7 +1043,8 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   Transpose8x2_NEON(E0, E1, rows);
   Transpose8x2_NEON(E0, E1, rows);
 }
 }
 
 
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   int16x8x2_t rows;
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass_NEON(&rows);
   TransformPass_NEON(&rows);
@@ -1051,7 +1054,8 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 
 
 #else
 #else
 
 
-static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const int kBPS = BPS;
   const int kBPS = BPS;
   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   const int16_t constants[4] = { kC1, kC2, 0, 0 };
   const int16_t constants[4] = { kC1, kC2, 0, 0 };
@@ -1184,14 +1188,16 @@ static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
 
 
 #endif    // WEBP_USE_INTRINSICS
 #endif    // WEBP_USE_INTRINSICS
 
 
-static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst, int do_two) {
   TransformOne_NEON(in, dst);
   TransformOne_NEON(in, dst);
   if (do_two) {
   if (do_two) {
     TransformOne_NEON(in + 16, dst + 4);
     TransformOne_NEON(in + 16, dst + 4);
   }
   }
 }
 }
 
 
-static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
+                             uint8_t* WEBP_RESTRICT dst) {
   const int16x8_t DC = vdupq_n_s16(in[0]);
   const int16x8_t DC = vdupq_n_s16(in[0]);
   Add4x4_NEON(DC, DC, dst);
   Add4x4_NEON(DC, DC, dst);
 }
 }
@@ -1205,7 +1211,8 @@ static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
 } while (0)
 
 
-static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
+                              int16_t* WEBP_RESTRICT out) {
   int32x4x4_t tmp;
   int32x4x4_t tmp;
 
 
   {
   {
@@ -1256,7 +1263,8 @@ static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const int16x4_t A = vld1_dup_s16(in);
   const int16x4_t A = vld1_dup_s16(in);
   const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
   const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@@ -1300,18 +1308,19 @@ static void DC4_NEON(uint8_t* dst) {    // DC
 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
-  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
+  const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
   int y;
   int y;
   for (y = 0; y < size; y += 4) {
   for (y = 0; y < size; y += 4) {
     // left edge
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1 = vaddq_s16(L1, d);
-    const int16x8_t r2 = vaddq_s16(L2, d);
-    const int16x8_t r3 = vaddq_s16(L3, d);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
+    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
+    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
+    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
     // Saturate and store the result.
     // Saturate and store the result.
     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
@@ -1572,23 +1581,24 @@ static void TM16_NEON(uint8_t* dst) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   // A[c] - A[-1]
   // A[c] - A[-1]
-  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
-  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
+  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
   int y;
   int y;
   for (y = 0; y < 16; y += 4) {
   for (y = 0; y < 16; y += 4) {
     // left edge
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
-    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
-    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
-    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
-    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
-    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
-    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
-    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
-    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
+    // L[r] + A[c] - A[-1]
+    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
+    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
+    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
+    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
+    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
+    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
+    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
+    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
     // Saturate and store the result.
     // Saturate and store the result.
     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));

+ 12 - 8
thirdparty/libwebp/src/dsp/dec_sse2.c

@@ -30,7 +30,8 @@
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 // Transforms (Paragraph 14.4)
 
 
-static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -197,7 +198,8 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
 
 
 #if (USE_TRANSFORM_AC3 == 1)
 #if (USE_TRANSFORM_AC3 == 1)
 
 
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_SSE2(const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst) {
   const __m128i A = _mm_set1_epi16(in[0] + 4);
   const __m128i A = _mm_set1_epi16(in[0] + 4);
   const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const __m128i c4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
   const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
   const __m128i d4 = _mm_set1_epi16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
@@ -792,8 +794,8 @@ static void HFilter16i_SSE2(uint8_t* p, int stride,
 }
 }
 
 
 // 8-pixels wide variant, for chroma filtering
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i mask;
   __m128i t1, p2, p1, p0, q0, q1, q2;
   __m128i t1, p2, p1, p0, q0, q1, q2;
 
 
@@ -817,8 +819,8 @@ static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q2, u, v, 2 * stride);
   STOREUV(q2, u, v, 2 * stride);
 }
 }
 
 
-static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
-                          int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                          int stride, int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
 
@@ -837,7 +839,8 @@ static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
   Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
   Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }
 }
 
 
-static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void VFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
                            int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
   __m128i t1, t2, p1, p0, q0, q1;
@@ -863,7 +866,8 @@ static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q1, u, v, 1 * stride);
   STOREUV(q1, u, v, 1 * stride);
 }
 }
 
 
-static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+static void HFilter8i_SSE2(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                           int stride,
                            int thresh, int ithresh, int hev_thresh) {
                            int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
   __m128i t1, t2, p1, p0, q0, q1;

+ 94 - 59
thirdparty/libwebp/src/dsp/dsp.h

@@ -60,53 +60,66 @@ extern "C" {
 // Transforms
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                        int do_two);
-typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
-typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+typedef void (*VP8Idct)(const uint8_t* WEBP_RESTRICT ref,
+                        const int16_t* WEBP_RESTRICT in,
+                        uint8_t* WEBP_RESTRICT dst, int do_two);
+typedef void (*VP8Fdct)(const uint8_t* WEBP_RESTRICT src,
+                        const uint8_t* WEBP_RESTRICT ref,
+                        int16_t* WEBP_RESTRICT out);
+typedef void (*VP8WHT)(const int16_t* WEBP_RESTRICT in,
+                       int16_t* WEBP_RESTRICT out);
 extern VP8Idct VP8ITransform;
 extern VP8Idct VP8ITransform;
 extern VP8Fdct VP8FTransform;
 extern VP8Fdct VP8FTransform;
 extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8Fdct VP8FTransform2;   // performs two transforms at a time
 extern VP8WHT VP8FTransformWHT;
 extern VP8WHT VP8FTransformWHT;
 // Predictions
 // Predictions
 // *dst is the destination block. *top and *left can be NULL.
 // *dst is the destination block. *top and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
+typedef void (*VP8IntraPreds)(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top);
+typedef void (*VP8Intra4Preds)(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT top);
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8Intra4Preds VP8EncPredLuma4;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredLuma16;
 extern VP8IntraPreds VP8EncPredChroma8;
 extern VP8IntraPreds VP8EncPredChroma8;
 
 
-typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+typedef int (*VP8Metric)(const uint8_t* WEBP_RESTRICT pix,
+                         const uint8_t* WEBP_RESTRICT ref);
 extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
 extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
-typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
-                          const uint16_t* const weights);
+typedef int (*VP8WMetric)(const uint8_t* WEBP_RESTRICT pix,
+                          const uint8_t* WEBP_RESTRICT ref,
+                          const uint16_t* WEBP_RESTRICT const weights);
 // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
 // The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
 // 4 by 4 symmetric matrix.
 // 4 by 4 symmetric matrix.
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
 extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
 
 
 // Compute the average (DC) of four 4x4 blocks.
 // Compute the average (DC) of four 4x4 blocks.
 // Each sub-4x4 block #i sum is stored in dc[i].
 // Each sub-4x4 block #i sum is stored in dc[i].
-typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+typedef void (*VP8MeanMetric)(const uint8_t* WEBP_RESTRICT ref,
+                              uint32_t dc[4]);
 extern VP8MeanMetric VP8Mean16x4;
 extern VP8MeanMetric VP8Mean16x4;
 
 
-typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+typedef void (*VP8BlockCopy)(const uint8_t* WEBP_RESTRICT src,
+                             uint8_t* WEBP_RESTRICT dst);
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy4x4;
 extern VP8BlockCopy VP8Copy16x8;
 extern VP8BlockCopy VP8Copy16x8;
 // Quantization
 // Quantization
 struct VP8Matrix;   // forward declaration
 struct VP8Matrix;   // forward declaration
-typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                const struct VP8Matrix* const mtx);
+typedef int (*VP8QuantizeBlock)(
+    int16_t in[16], int16_t out[16],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 // Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
 // Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
-typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
-                                  const struct VP8Matrix* const mtx);
+typedef int (*VP8Quantize2Blocks)(
+    int16_t in[32], int16_t out[32],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 
 
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
 extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
 
 
 // specific to 2nd transform:
 // specific to 2nd transform:
-typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
-                                   const struct VP8Matrix* const mtx);
+typedef int (*VP8QuantizeBlockWHT)(
+    int16_t in[16], int16_t out[16],
+    const struct VP8Matrix* WEBP_RESTRICT const mtx);
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 
 
 extern const int VP8DspScan[16 + 4 + 4];
 extern const int VP8DspScan[16 + 4 + 4];
@@ -118,9 +131,10 @@ typedef struct {
   int max_value;
   int max_value;
   int last_non_zero;
   int last_non_zero;
 } VP8Histogram;
 } VP8Histogram;
-typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+typedef void (*VP8CHisto)(const uint8_t* WEBP_RESTRICT ref,
+                          const uint8_t* WEBP_RESTRICT pred,
                           int start_block, int end_block,
                           int start_block, int end_block,
-                          VP8Histogram* const histo);
+                          VP8Histogram* WEBP_RESTRICT const histo);
 extern VP8CHisto VP8CollectHistogram;
 extern VP8CHisto VP8CollectHistogram;
 // General-purpose util function to help VP8CollectHistogram().
 // General-purpose util function to help VP8CollectHistogram().
 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
@@ -138,8 +152,9 @@ extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
 extern const uint8_t VP8EncBands[16 + 1];
 extern const uint8_t VP8EncBands[16 + 1];
 
 
 struct VP8Residual;
 struct VP8Residual;
-typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
-                                         struct VP8Residual* const res);
+typedef void (*VP8SetResidualCoeffsFunc)(
+    const int16_t* WEBP_RESTRICT const coeffs,
+    struct VP8Residual* WEBP_RESTRICT const res);
 extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
 extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
 
 
 // Cost calculation function.
 // Cost calculation function.
@@ -193,9 +208,11 @@ void VP8SSIMDspInit(void);
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Decoding
 // Decoding
 
 
-typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+typedef void (*VP8DecIdct)(const int16_t* WEBP_RESTRICT coeffs,
+                           uint8_t* WEBP_RESTRICT dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 // when doing two transforms, coeffs is actually int16_t[2][16].
-typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+typedef void (*VP8DecIdct2)(const int16_t* WEBP_RESTRICT coeffs,
+                            uint8_t* WEBP_RESTRICT dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformUV;
@@ -233,7 +250,8 @@ extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
 // regular filter (on both macroblock edges and inner edges)
 // regular filter (on both macroblock edges and inner edges)
 typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
 typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
                                   int thresh, int ithresh, int hev_t);
                                   int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+typedef void (*VP8ChromaFilterFunc)(uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int stride,
                                     int thresh, int ithresh, int hev_t);
                                     int thresh, int ithresh, int hev_t);
 // on outer edge
 // on outer edge
 extern VP8LumaFilterFunc VP8VFilter16;
 extern VP8LumaFilterFunc VP8VFilter16;
@@ -253,8 +271,8 @@ extern VP8ChromaFilterFunc VP8HFilter8i;
 #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
 #define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
 #define VP8_DITHER_AMP_BITS 7
 #define VP8_DITHER_AMP_BITS 7
 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
 #define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
-extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
-                                   int dst_stride);
+extern void (*VP8DitherCombine8x8)(const uint8_t* WEBP_RESTRICT dither,
+                                   uint8_t* WEBP_RESTRICT dst, int dst_stride);
 
 
 // must be called before anything using the above
 // must be called before anything using the above
 void VP8DspInit(void);
 void VP8DspInit(void);
@@ -267,10 +285,10 @@ void VP8DspInit(void);
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
 typedef void (*WebPUpsampleLinePairFunc)(
-    const uint8_t* top_y, const uint8_t* bottom_y,
-    const uint8_t* top_u, const uint8_t* top_v,
-    const uint8_t* cur_u, const uint8_t* cur_v,
-    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+    const uint8_t* WEBP_RESTRICT top_y, const uint8_t* WEBP_RESTRICT bottom_y,
+    const uint8_t* WEBP_RESTRICT top_u, const uint8_t* WEBP_RESTRICT top_v,
+    const uint8_t* WEBP_RESTRICT cur_u, const uint8_t* WEBP_RESTRICT cur_v,
+    uint8_t* WEBP_RESTRICT top_dst, uint8_t* WEBP_RESTRICT bottom_dst, int len);
 
 
 #ifdef FANCY_UPSAMPLING
 #ifdef FANCY_UPSAMPLING
 
 
@@ -280,13 +298,15 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 #endif    // FANCY_UPSAMPLING
 #endif    // FANCY_UPSAMPLING
 
 
 // Per-row point-sampling methods.
 // Per-row point-sampling methods.
-typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
-                                   const uint8_t* u, const uint8_t* v,
-                                   uint8_t* dst, int len);
+typedef void (*WebPSamplerRowFunc)(const uint8_t* WEBP_RESTRICT y,
+                                   const uint8_t* WEBP_RESTRICT u,
+                                   const uint8_t* WEBP_RESTRICT v,
+                                   uint8_t* WEBP_RESTRICT dst, int len);
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
-void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
-                             const uint8_t* u, const uint8_t* v, int uv_stride,
-                             uint8_t* dst, int dst_stride,
+void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
+                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                              int width, int height, WebPSamplerRowFunc func);
                              int width, int height, WebPSamplerRowFunc func);
 
 
 // Sampling functions to convert rows of YUV to RGB(A)
 // Sampling functions to convert rows of YUV to RGB(A)
@@ -298,9 +318,10 @@ extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
 
 
 // YUV444->RGB converters
 // YUV444->RGB converters
-typedef void (*WebPYUV444Converter)(const uint8_t* y,
-                                    const uint8_t* u, const uint8_t* v,
-                                    uint8_t* dst, int len);
+typedef void (*WebPYUV444Converter)(const uint8_t* WEBP_RESTRICT y,
+                                    const uint8_t* WEBP_RESTRICT u,
+                                    const uint8_t* WEBP_RESTRICT v,
+                                    uint8_t* WEBP_RESTRICT dst, int len);
 
 
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 
 
@@ -316,26 +337,35 @@ void WebPInitYUV444Converters(void);
 // ARGB -> YUV converters
 // ARGB -> YUV converters
 
 
 // Convert ARGB samples to luma Y.
 // Convert ARGB samples to luma Y.
-extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+extern void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT y, int width);
 // Convert ARGB samples to U/V with downsampling. do_store should be '1' for
 // Convert ARGB samples to U/V with downsampling. do_store should be '1' for
 // even lines and '0' for odd ones. 'src_width' is the original width, not
 // even lines and '0' for odd ones. 'src_width' is the original width, not
 // the U/V one.
 // the U/V one.
-extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v,
                                    int src_width, int do_store);
                                    int src_width, int do_store);
 
 
 // Convert a row of accumulated (four-values) of rgba32 toward U/V
 // Convert a row of accumulated (four-values) of rgba32 toward U/V
-extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
-                                     uint8_t* u, uint8_t* v, int width);
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
+                                     uint8_t* WEBP_RESTRICT u,
+                                     uint8_t* WEBP_RESTRICT v, int width);
 
 
 // Convert RGB or BGR to Y
 // Convert RGB or BGR to Y
-extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
-extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+extern void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
+                                   uint8_t* WEBP_RESTRICT y, int width);
 
 
 // used for plain-C fallback.
 // used for plain-C fallback.
-extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+extern void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT u,
+                                  uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store);
                                   int src_width, int do_store);
-extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
-                                    uint8_t* u, uint8_t* v, int width);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
+                                    uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int width);
 
 
 // Must be called before using the above.
 // Must be called before using the above.
 void WebPInitConvertARGBToYUV(void);
 void WebPInitConvertARGBToYUV(void);
@@ -348,8 +378,9 @@ struct WebPRescaler;
 // Import a row of data and save its contribution in the rescaler.
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported. 'Expand' corresponds to
 // 'channel' denotes the channel number to be imported. 'Expand' corresponds to
 // the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
 // the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
-typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
+typedef void (*WebPRescalerImportRowFunc)(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
 
 
 extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
 extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
@@ -362,16 +393,19 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 
 
 // Plain-C implementation, as fall-back.
 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
-extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
-                                          const uint8_t* src);
+extern void WebPRescalerImportRowExpand_C(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
+extern void WebPRescalerImportRowShrink_C(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
 extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
 
 
 // Main entry calls:
 // Main entry calls:
-extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
-                                  const uint8_t* src);
+extern void WebPRescalerImportRow(
+    struct WebPRescaler* WEBP_RESTRICT const wrk,
+    const uint8_t* WEBP_RESTRICT src);
 // Export one row (starting at x_out position) from rescaler.
 // Export one row (starting at x_out position) from rescaler.
 extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
 extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
 
 
@@ -480,8 +514,9 @@ typedef enum {     // Filter types.
   WEBP_FILTER_FAST
   WEBP_FILTER_FAST
 } WEBP_FILTER_TYPE;
 } WEBP_FILTER_TYPE;
 
 
-typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
-                               int stride, uint8_t* out);
+typedef void (*WebPFilterFunc)(const uint8_t* WEBP_RESTRICT in,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT out);
 // In-place un-filtering.
 // In-place un-filtering.
 // Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
 // Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
 typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
 typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,

+ 90 - 51
thirdparty/libwebp/src/dsp/enc.c

@@ -59,9 +59,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
 }
 }
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
+                               const uint8_t* WEBP_RESTRICT pred,
                                int start_block, int end_block,
                                int start_block, int end_block,
-                               VP8Histogram* const histo) {
+                               VP8Histogram* WEBP_RESTRICT const histo) {
   int j;
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
   for (j = start_block; j < end_block; ++j) {
@@ -109,8 +110,9 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 #define STORE(x, y, v) \
 #define STORE(x, y, v) \
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
 
 
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
   int C[4 * 4], *tmp;
   int C[4 * 4], *tmp;
   int i;
   int i;
   tmp = C;
   tmp = C;
@@ -146,7 +148,9 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   }
   }
 }
 }
 
 
-static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
+                         const int16_t* WEBP_RESTRICT in,
+                         uint8_t* WEBP_RESTRICT dst,
                          int do_two) {
                          int do_two) {
   ITransformOne(ref, in, dst);
   ITransformOne(ref, in, dst);
   if (do_two) {
   if (do_two) {
@@ -154,7 +158,9 @@ static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   }
   }
 }
 }
 
 
-static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
+                         const uint8_t* WEBP_RESTRICT ref,
+                         int16_t* WEBP_RESTRICT out) {
   int i;
   int i;
   int tmp[16];
   int tmp[16];
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -184,14 +190,16 @@ static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
-static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
-                          int16_t* out) {
+static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
+                          const uint8_t* WEBP_RESTRICT ref,
+                          int16_t* WEBP_RESTRICT out) {
   VP8FTransform(src, ref, out);
   VP8FTransform(src, ref, out);
   VP8FTransform(src + 4, ref + 4, out + 16);
   VP8FTransform(src + 4, ref + 4, out + 16);
 }
 }
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void FTransformWHT_C(const int16_t* in, int16_t* out) {
+static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
+                            int16_t* WEBP_RESTRICT out) {
   // input is 12b signed
   // input is 12b signed
   int32_t tmp[16];
   int32_t tmp[16];
   int i;
   int i;
@@ -234,8 +242,9 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void VerticalPred(uint8_t* dst,
-                                     const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
+                                     const uint8_t* WEBP_RESTRICT top,
+                                     int size) {
   int j;
   int j;
   if (top != NULL) {
   if (top != NULL) {
     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
@@ -244,8 +253,9 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
-                                       const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       int size) {
   if (left != NULL) {
   if (left != NULL) {
     int j;
     int j;
     for (j = 0; j < size; ++j) {
     for (j = 0; j < size; ++j) {
@@ -256,8 +266,9 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top, int size) {
   int y;
   int y;
   if (left != NULL) {
   if (left != NULL) {
     if (top != NULL) {
     if (top != NULL) {
@@ -286,8 +297,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
-                               const uint8_t* top,
+static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT left,
+                               const uint8_t* WEBP_RESTRICT top,
                                int size, int round, int shift) {
                                int size, int round, int shift) {
   int DC = 0;
   int DC = 0;
   int j;
   int j;
@@ -312,8 +324,9 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 // Chroma 8x8 prediction (paragraph 12.2)
 
 
-static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
-                               const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
+                               const uint8_t* WEBP_RESTRICT left,
+                               const uint8_t* WEBP_RESTRICT top) {
   // U block
   // U block
   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   VerticalPred(C8VE8 + dst, top, 8);
   VerticalPred(C8VE8 + dst, top, 8);
@@ -332,22 +345,28 @@ static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 // luma 16x16 prediction (paragraph 12.3)
 
 
-static void Intra16Preds_C(uint8_t* dst,
-                           const uint8_t* left, const uint8_t* top) {
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
+static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
+                           const uint8_t* WEBP_RESTRICT left,
+                           const uint8_t* WEBP_RESTRICT top) {
   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   VerticalPred(I16VE16 + dst, top, 16);
   VerticalPred(I16VE16 + dst, top, 16);
   HorizontalPred(I16HE16 + dst, left, 16);
   HorizontalPred(I16HE16 + dst, left, 16);
   TrueMotion(I16TM16 + dst, left, top, 16);
   TrueMotion(I16TM16 + dst, left, top, 16);
 }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // luma 4x4 prediction
 // luma 4x4 prediction
 
 
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
+
 #define DST(x, y) dst[(x) + (y) * BPS]
 #define DST(x, y) dst[(x) + (y) * BPS]
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 
-static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
+static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const uint8_t vals[4] = {
   const uint8_t vals[4] = {
     AVG3(top[-1], top[0], top[1]),
     AVG3(top[-1], top[0], top[1]),
     AVG3(top[ 0], top[1], top[2]),
     AVG3(top[ 0], top[1], top[2]),
@@ -360,7 +379,8 @@ static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
   }
   }
 }
 }
 
 
-static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
+static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -372,14 +392,14 @@ static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
 }
 
 
-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   uint32_t dc = 4;
   uint32_t dc = 4;
   int i;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   Fill(dst, dc >> 3, 4);
   Fill(dst, dc >> 3, 4);
 }
 }
 
 
-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -398,7 +418,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
   DST(3, 0)                                     = AVG3(D, C, B);
   DST(3, 0)                                     = AVG3(D, C, B);
 }
 }
 
 
-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int A = top[0];
   const int A = top[0];
   const int B = top[1];
   const int B = top[1];
   const int C = top[2];
   const int C = top[2];
@@ -416,7 +436,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
   DST(3, 3)                                     = AVG3(G, H, H);
   DST(3, 3)                                     = AVG3(G, H, H);
 }
 }
 
 
-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -438,7 +458,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
   DST(3, 1) =             AVG3(B, C, D);
   DST(3, 1) =             AVG3(B, C, D);
 }
 }
 
 
-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int A = top[0];
   const int A = top[0];
   const int B = top[1];
   const int B = top[1];
   const int C = top[2];
   const int C = top[2];
@@ -460,7 +480,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
               DST(3, 3) = AVG3(F, G, H);
               DST(3, 3) = AVG3(F, G, H);
 }
 }
 
 
-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
   const int K = top[-4];
   const int K = top[-4];
@@ -475,7 +495,7 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 }
 
 
-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -498,7 +518,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
   DST(1, 3)             = AVG3(L, K, J);
   DST(1, 3)             = AVG3(L, K, J);
 }
 }
 
 
-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int x, y;
   int x, y;
   const uint8_t* const clip = clip1 + 255 - top[-1];
   const uint8_t* const clip = clip1 + 255 - top[-1];
   for (y = 0; y < 4; ++y) {
   for (y = 0; y < 4; ++y) {
@@ -516,7 +536,8 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
 
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
+                          const uint8_t* WEBP_RESTRICT top) {
   DC4(I4DC4 + dst, top);
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -529,11 +550,14 @@ static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
   HU4(I4HU4 + dst, top);
   HU4(I4HU4 + dst, top);
 }
 }
 
 
+#endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
+
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Metric
 // Metric
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
+                              const uint8_t* WEBP_RESTRICT b,
                               int w, int h) {
                               int w, int h) {
   int count = 0;
   int count = 0;
   int y, x;
   int y, x;
@@ -548,21 +572,25 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
   return count;
   return count;
 }
 }
 
 
-static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
   return GetSSE(a, b, 16, 16);
   return GetSSE(a, b, 16, 16);
 }
 }
-static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
+                     const uint8_t* WEBP_RESTRICT b) {
   return GetSSE(a, b, 16, 8);
   return GetSSE(a, b, 16, 8);
 }
 }
-static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
+                    const uint8_t* WEBP_RESTRICT b) {
   return GetSSE(a, b, 8, 8);
   return GetSSE(a, b, 8, 8);
 }
 }
-static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
+                    const uint8_t* WEBP_RESTRICT b) {
   return GetSSE(a, b, 4, 4);
   return GetSSE(a, b, 4, 4);
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 
-static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
   int k, x, y;
   int k, x, y;
   for (k = 0; k < 4; ++k) {
   for (k = 0; k < 4; ++k) {
     uint32_t avg = 0;
     uint32_t avg = 0;
@@ -586,7 +614,8 @@ static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform(const uint8_t* WEBP_RESTRICT in,
+                      const uint16_t* WEBP_RESTRICT w) {
   int sum = 0;
   int sum = 0;
   int tmp[16];
   int tmp[16];
   int i;
   int i;
@@ -620,15 +649,17 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
   return sum;
 }
 }
 
 
-static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
+                      const uint8_t* WEBP_RESTRICT const b,
+                      const uint16_t* WEBP_RESTRICT const w) {
   const int sum1 = TTransform(a, w);
   const int sum1 = TTransform(a, w);
   const int sum2 = TTransform(b, w);
   const int sum2 = TTransform(b, w);
   return abs(sum2 - sum1) >> 5;
   return abs(sum2 - sum1) >> 5;
 }
 }
 
 
-static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
+                        const uint8_t* WEBP_RESTRICT const b,
+                        const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -644,13 +675,14 @@ static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 // Quantization
 //
 //
 
 
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static const uint8_t kZigzag[16] = {
 static const uint8_t kZigzag[16] = {
   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 };
 };
 
 
 // Simple quantization
 // Simple quantization
 static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
-                           const VP8Matrix* const mtx) {
+                           const VP8Matrix* WEBP_RESTRICT const mtx) {
   int last = -1;
   int last = -1;
   int n;
   int n;
   for (n = 0; n < 16; ++n) {
   for (n = 0; n < 16; ++n) {
@@ -675,9 +707,8 @@ static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
   return (last >= 0);
   return (last >= 0);
 }
 }
 
 
-#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -688,7 +719,8 @@ static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Block copy
 // Block copy
 
 
-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
+static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
+                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
   int y;
   int y;
   for (y = 0; y < h; ++y) {
   for (y = 0; y < h; ++y) {
     memcpy(dst, src, w);
     memcpy(dst, src, w);
@@ -697,11 +729,13 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
   }
   }
 }
 }
 
 
-static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
+                      uint8_t* WEBP_RESTRICT dst) {
   Copy(src, dst, 4, 4);
   Copy(src, dst, 4, 4);
 }
 }
 
 
-static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
+                       uint8_t* WEBP_RESTRICT dst) {
   Copy(src, dst, 16, 8);
   Copy(src, dst, 16, 8);
 }
 }
 
 
@@ -760,14 +794,19 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
   VP8EncQuantizeBlock = QuantizeBlock_C;
   VP8EncQuantizeBlock = QuantizeBlock_C;
   VP8EncQuantize2Blocks = Quantize2Blocks_C;
   VP8EncQuantize2Blocks = Quantize2Blocks_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
 #endif
 #endif
 
 
-  VP8FTransform2 = FTransform2_C;
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
   VP8EncPredLuma4 = Intra4Preds_C;
   VP8EncPredLuma4 = Intra4Preds_C;
+#endif
+#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
   VP8EncPredLuma16 = Intra16Preds_C;
   VP8EncPredLuma16 = Intra16Preds_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
   VP8EncPredChroma8 = IntraChromaPreds_C;
   VP8EncPredChroma8 = IntraChromaPreds_C;
   VP8Mean16x4 = Mean16x4_C;
   VP8Mean16x4 = Mean16x4_C;
-  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
   VP8Copy4x4 = Copy4x4_C;
   VP8Copy4x4 = Copy4x4_C;
   VP8Copy16x8 = Copy16x8_C;
   VP8Copy16x8 = Copy16x8_C;
 
 

+ 24 - 16
thirdparty/libwebp/src/dsp/enc_mips32.c

@@ -109,9 +109,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
   "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
   "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
 
 
 // Does one or two inverse transforms.
 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
-                                             const int16_t* in,
-                                             uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* WEBP_RESTRICT ref,
+                                             const int16_t* WEBP_RESTRICT in,
+                                             uint8_t* WEBP_RESTRICT dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
   int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
   int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
   int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@@ -141,8 +141,9 @@ static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
   );
   );
 }
 }
 
 
-static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
-                              uint8_t* dst, int do_two) {
+static void ITransform_MIPS32(const uint8_t* WEBP_RESTRICT ref,
+                              const int16_t* WEBP_RESTRICT in,
+                              uint8_t* WEBP_RESTRICT dst, int do_two) {
   ITransformOne_MIPS32(ref, in, dst);
   ITransformOne_MIPS32(ref, in, dst);
   if (do_two) {
   if (do_two) {
     ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
     ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
@@ -236,7 +237,7 @@ static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
 }
 }
 
 
 static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
   nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -358,8 +359,9 @@ static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
   "msub   %[temp6],  %[temp0]                \n\t"                \
   "msub   %[temp6],  %[temp0]                \n\t"                \
   "msub   %[temp7],  %[temp1]                \n\t"
   "msub   %[temp7],  %[temp1]                \n\t"
 
 
-static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int tmp[32];
   int tmp[32];
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
 
 
@@ -393,8 +395,9 @@ static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 #undef HORIZONTAL_PASS
 
 
-static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
-                             const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* WEBP_RESTRICT const a,
+                             const uint8_t* WEBP_RESTRICT const b,
+                             const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -475,8 +478,9 @@ static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
   "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
   "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
   "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
   "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
 
 
-static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
-                              int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* WEBP_RESTRICT src,
+                              const uint8_t* WEBP_RESTRICT ref,
+                              int16_t* WEBP_RESTRICT out) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
   int temp17, temp18, temp19, temp20;
   int temp17, temp18, temp19, temp20;
@@ -537,7 +541,8 @@ static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
   GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
   GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
   GET_SSE_INNER(D, D + 1, D + 2, D + 3)
   GET_SSE_INNER(D, D + 1, D + 2, D + 3)
 
 
-static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                           const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
 
@@ -571,7 +576,8 @@ static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                          const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
 
@@ -597,7 +603,8 @@ static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
 
@@ -619,7 +626,8 @@ static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
 

+ 58 - 43
thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c

@@ -141,8 +141,9 @@ static const int kC2 = WEBP_TRANSFORM_AC3_C2;
   "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
   "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
   "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
   "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
 
 
-static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
-                                 int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
+                                 const uint8_t* WEBP_RESTRICT ref,
+                                 int16_t* WEBP_RESTRICT out) {
   const int c2217 = 2217;
   const int c2217 = 2217;
   const int c5352 = 5352;
   const int c5352 = 5352;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -171,8 +172,9 @@ static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
 #undef VERTICAL_PASS
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 #undef HORIZONTAL_PASS
 
 
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
 
 
@@ -239,16 +241,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   );
   );
 }
 }
 
 
-static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
-                                 uint8_t* dst, int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
+                                 const int16_t* WEBP_RESTRICT in,
+                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
   ITransformOne(ref, in, dst);
   ITransformOne(ref, in, dst);
   if (do_two) {
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
   }
 }
 }
 
 
-static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
-                              const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
+                              const uint8_t* WEBP_RESTRICT const b,
+                              const uint16_t* WEBP_RESTRICT const w) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
 
 
@@ -314,9 +318,9 @@ static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
   return abs(temp3 - temp17) >> 5;
   return abs(temp3 - temp17) >> 5;
 }
 }
 
 
-static int Disto16x16_MIPSdspR2(const uint8_t* const a,
-                                const uint8_t* const b,
-                                const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
+                                const uint8_t* WEBP_RESTRICT const b,
+                                const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -367,8 +371,8 @@ static int Disto16x16_MIPSdspR2(const uint8_t* const a,
 } while (0)
 } while (0)
 
 
 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
-static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
-                                           const uint8_t* (TOP)) {             \
+static WEBP_INLINE void VerticalPred##SIZE(                                    \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
   int j;                                                                       \
   int j;                                                                       \
   if ((TOP)) {                                                                 \
   if ((TOP)) {                                                                 \
     for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
     for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
@@ -383,8 +387,8 @@ VERTICAL_PRED(dst, top, 16)
 #undef VERTICAL_PRED
 #undef VERTICAL_PRED
 
 
 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
-static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
-                                             const uint8_t* (LEFT)) {          \
+static WEBP_INLINE void HorizontalPred##SIZE(                                  \
+    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
   if (LEFT) {                                                                  \
   if (LEFT) {                                                                  \
     int j;                                                                     \
     int j;                                                                     \
     for (j = 0; j < (SIZE); ++j) {                                             \
     for (j = 0; j < (SIZE); ++j) {                                             \
@@ -451,8 +455,9 @@ HORIZONTAL_PRED(dst, left, 16)
 } while (0)
 } while (0)
 
 
 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
-static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
-                                         const uint8_t* (TOP)) {               \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
+                                         const uint8_t* WEBP_RESTRICT (LEFT),  \
+                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
   if ((LEFT) != NULL) {                                                        \
   if ((LEFT) != NULL) {                                                        \
     if ((TOP) != NULL) {                                                       \
     if ((TOP) != NULL) {                                                       \
       CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
       CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
@@ -480,8 +485,9 @@ TRUE_MOTION(dst, left, top, 16)
 #undef CLIP_8B_TO_DST
 #undef CLIP_8B_TO_DST
 #undef CLIPPING
 #undef CLIPPING
 
 
-static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT left,
+                                 const uint8_t* WEBP_RESTRICT top) {
   int DC, DC1;
   int DC, DC1;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
 
 
@@ -543,8 +549,9 @@ static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
   FILL_8_OR_16(dst, DC, 16);
   FILL_8_OR_16(dst, DC, 16);
 }
 }
 
 
-static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
-                                const uint8_t* top) {
+static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
+                                const uint8_t* WEBP_RESTRICT left,
+                                const uint8_t* WEBP_RESTRICT top) {
   int DC, DC1;
   int DC, DC1;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
 
 
@@ -588,7 +595,7 @@ static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
   FILL_8_OR_16(dst, DC, 8);
   FILL_8_OR_16(dst, DC, 8);
 }
 }
 
 
-static void DC4(uint8_t* dst, const uint8_t* top) {
+static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1;
   int temp0, temp1;
   __asm__ volatile(
   __asm__ volatile(
     "ulw          %[temp0],   0(%[top])               \n\t"
     "ulw          %[temp0],   0(%[top])               \n\t"
@@ -609,7 +616,7 @@ static void DC4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void TM4(uint8_t* dst, const uint8_t* top) {
+static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
   int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
   const int c35 = 0xff00ff;
   const int c35 = 0xff00ff;
   __asm__ volatile (
   __asm__ volatile (
@@ -664,7 +671,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void VE4(uint8_t* dst, const uint8_t* top) {
+static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   __asm__ volatile(
   __asm__ volatile(
     "ulw             %[temp0],   -1(%[top])              \n\t"
     "ulw             %[temp0],   -1(%[top])              \n\t"
@@ -695,7 +702,7 @@ static void VE4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void HE4(uint8_t* dst, const uint8_t* top) {
+static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   __asm__ volatile(
   __asm__ volatile(
     "ulw             %[temp0],   -4(%[top])              \n\t"
     "ulw             %[temp0],   -4(%[top])              \n\t"
@@ -731,7 +738,7 @@ static void HE4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void RD4(uint8_t* dst, const uint8_t* top) {
+static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int temp6, temp7, temp8, temp9, temp10, temp11;
   int temp6, temp7, temp8, temp9, temp10, temp11;
   __asm__ volatile(
   __asm__ volatile(
@@ -780,7 +787,7 @@ static void RD4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void VR4(uint8_t* dst, const uint8_t* top) {
+static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp5, temp6, temp7, temp8, temp9;
   __asm__ volatile (
   __asm__ volatile (
@@ -830,7 +837,7 @@ static void VR4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void LD4(uint8_t* dst, const uint8_t* top) {
+static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int temp6, temp7, temp8, temp9, temp10, temp11;
   int temp6, temp7, temp8, temp9, temp10, temp11;
   __asm__ volatile(
   __asm__ volatile(
@@ -877,7 +884,7 @@ static void LD4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void VL4(uint8_t* dst, const uint8_t* top) {
+static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp5, temp6, temp7, temp8, temp9;
   __asm__ volatile (
   __asm__ volatile (
@@ -926,7 +933,7 @@ static void VL4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void HD4(uint8_t* dst, const uint8_t* top) {
+static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp5, temp6, temp7, temp8, temp9;
   __asm__ volatile (
   __asm__ volatile (
@@ -974,7 +981,7 @@ static void HD4(uint8_t* dst, const uint8_t* top) {
   );
   );
 }
 }
 
 
-static void HU4(uint8_t* dst, const uint8_t* top) {
+static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   __asm__ volatile (
   __asm__ volatile (
     "ulw             %[temp0],   -5(%[top])              \n\t"
     "ulw             %[temp0],   -5(%[top])              \n\t"
@@ -1013,8 +1020,9 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 // Chroma 8x8 prediction (paragraph 12.2)
 
 
-static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       const uint8_t* WEBP_RESTRICT top) {
   // U block
   // U block
   DCMode8(C8DC8 + dst, left, top);
   DCMode8(C8DC8 + dst, left, top);
   VerticalPred8(C8VE8 + dst, top);
   VerticalPred8(C8VE8 + dst, top);
@@ -1033,8 +1041,9 @@ static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 // luma 16x16 prediction (paragraph 12.3)
 
 
-static void Intra16Preds_MIPSdspR2(uint8_t* dst,
-                                   const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top) {
   DCMode16(I16DC16 + dst, left, top);
   DCMode16(I16DC16 + dst, left, top);
   VerticalPred16(I16VE16 + dst, top);
   VerticalPred16(I16VE16 + dst, top);
   HorizontalPred16(I16HE16 + dst, left);
   HorizontalPred16(I16HE16 + dst, left);
@@ -1043,7 +1052,8 @@ static void Intra16Preds_MIPSdspR2(uint8_t* dst,
 
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT top) {
   DC4(I4DC4 + dst, top);
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -1079,7 +1089,8 @@ static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
   GET_SSE_INNER(C)                        \
   GET_SSE_INNER(C)                        \
   GET_SSE_INNER(D)
   GET_SSE_INNER(D)
 
 
-static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                              const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
   __asm__ volatile (
@@ -1109,7 +1120,8 @@ static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                             const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
   __asm__ volatile (
@@ -1131,7 +1143,8 @@ static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
   __asm__ volatile (
@@ -1149,7 +1162,8 @@ static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   return count;
   return count;
 }
 }
 
 
-static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
+                            const uint8_t* WEBP_RESTRICT b) {
   int count;
   int count;
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
   __asm__ volatile (
@@ -1273,7 +1287,7 @@ static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
 "3:                                                          \n\t"
 "3:                                                          \n\t"
 
 
 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
-                                   const VP8Matrix* const mtx) {
+                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
   int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
   int sign, coeff, level;
   int sign, coeff, level;
   int max_level = MAX_LEVEL;
   int max_level = MAX_LEVEL;
@@ -1314,7 +1328,7 @@ static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
 }
 }
 
 
 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
-                                     const VP8Matrix* const mtx) {
+                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
   nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -1360,7 +1374,8 @@ static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
   "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
   "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
   "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
   "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
 
 
-static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
+                                    int16_t* WEBP_RESTRICT out) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
   int temp5, temp6, temp7, temp8, temp9;
 
 

+ 79 - 46
thirdparty/libwebp/src/dsp/enc_msa.c

@@ -41,8 +41,9 @@
   BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
   BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
 } while (0)
 } while (0)
 
 
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
+                                      const int16_t* WEBP_RESTRICT in,
+                                      uint8_t* WEBP_RESTRICT dst) {
   v8i16 input0, input1;
   v8i16 input0, input1;
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
   v4i32 res0, res1, res2, res3;
   v4i32 res0, res1, res2, res3;
@@ -69,16 +70,18 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 }
 
 
-static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                           int do_two) {
+static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
+                           const int16_t* WEBP_RESTRICT in,
+                           uint8_t* WEBP_RESTRICT dst, int do_two) {
   ITransformOne(ref, in, dst);
   ITransformOne(ref, in, dst);
   if (do_two) {
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
   }
 }
 }
 
 
-static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
-                           int16_t* out) {
+static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
+                           const uint8_t* WEBP_RESTRICT ref,
+                           int16_t* WEBP_RESTRICT out) {
   uint64_t out0, out1, out2, out3;
   uint64_t out0, out1, out2, out3;
   uint32_t in0, in1, in2, in3;
   uint32_t in0, in1, in2, in3;
   v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
@@ -131,7 +134,8 @@ static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
   SD4(out0, out1, out2, out3, out, 8);
   SD4(out0, out1, out2, out3, out, 8);
 }
 }
 
 
-static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
+                              int16_t* WEBP_RESTRICT out) {
   v8i16 in0 = { 0 };
   v8i16 in0 = { 0 };
   v8i16 in1 = { 0 };
   v8i16 in1 = { 0 };
   v8i16 tmp0, tmp1, tmp2, tmp3;
   v8i16 tmp0, tmp1, tmp2, tmp3;
@@ -168,7 +172,8 @@ static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
   ST_SH2(out0, out1, out, 8);
   ST_SH2(out0, out1, out, 8);
 }
 }
 
 
-static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
+                          const uint16_t* WEBP_RESTRICT w) {
   int sum;
   int sum;
   uint32_t in0_m, in1_m, in2_m, in3_m;
   uint32_t in0_m, in1_m, in2_m, in3_m;
   v16i8 src0 = { 0 };
   v16i8 src0 = { 0 };
@@ -200,15 +205,17 @@ static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
   return sum;
   return sum;
 }
 }
 
 
-static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
-                        const uint16_t* const w) {
+static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
+                        const uint8_t* WEBP_RESTRICT const b,
+                        const uint16_t* WEBP_RESTRICT const w) {
   const int sum1 = TTransform_MSA(a, w);
   const int sum1 = TTransform_MSA(a, w);
   const int sum2 = TTransform_MSA(b, w);
   const int sum2 = TTransform_MSA(b, w);
   return abs(sum2 - sum1) >> 5;
   return abs(sum2 - sum1) >> 5;
 }
 }
 
 
-static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
+                          const uint8_t* WEBP_RESTRICT const b,
+                          const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -259,7 +266,9 @@ static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+// vertical
+static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A1 = { 0 };
   const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top - 1);
   const uint64_t val_m = LD(top - 1);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@@ -272,7 +281,9 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
   SW4(out, out, out, out, dst, BPS);
   SW4(out, out, out, out, dst, BPS);
 }
 }
 
 
-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+// horizontal
+static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -284,7 +295,8 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
 }
 
 
-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   uint32_t dc = 4;
   uint32_t dc = 4;
   int i;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
@@ -293,7 +305,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
   SW4(dc, dc, dc, dc, dst, BPS);
   SW4(dc, dc, dc, dc, dst, BPS);
 }
 }
 
 
-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A2 = { 0 };
   const v16u8 A2 = { 0 };
   const uint64_t val_m = LD(top - 5);
   const uint64_t val_m = LD(top - 5);
   const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
   const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
@@ -313,7 +326,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
   SW4(val3, val2, val1, val0, dst, BPS);
   SW4(val3, val2, val1, val0, dst, BPS);
 }
 }
 
 
-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16u8 A1 = { 0 };
   const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top);
   const uint64_t val_m = LD(top);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
@@ -333,7 +347,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
   SW4(val0, val1, val2, val3, dst, BPS);
   SW4(val0, val1, val2, val3, dst, BPS);
 }
 }
 
 
-static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -354,7 +369,8 @@ static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
   DST(3, 1) =             AVG3(B, C, D);
   DST(3, 1) =             AVG3(B, C, D);
 }
 }
 
 
-static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int A = top[0];
   const int A = top[0];
   const int B = top[1];
   const int B = top[1];
   const int C = top[2];
   const int C = top[2];
@@ -375,7 +391,8 @@ static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
               DST(3, 3) = AVG3(F, G, H);
               DST(3, 3) = AVG3(F, G, H);
 }
 }
 
 
-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
   const int K = top[-4];
   const int K = top[-4];
@@ -390,7 +407,8 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 }
 
 
-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -411,7 +429,8 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
   DST(1, 3)             = AVG3(L, K, J);
   DST(1, 3)             = AVG3(L, K, J);
 }
 }
 
 
-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   const v16i8 zero = { 0 };
   const v16i8 zero = { 0 };
   const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
   const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
   const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
   const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
@@ -431,7 +450,8 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG3
 #undef AVG2
 #undef AVG2
 
 
-static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                            const uint8_t* WEBP_RESTRICT top) {
   DC4(I4DC4 + dst, top);
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -451,7 +471,8 @@ static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
     ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
     ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
 } while (0)
 } while (0)
 
 
-static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
   if (top != NULL) {
     const v16u8 out = LD_UB(top);
     const v16u8 out = LD_UB(top);
     STORE16x16(out, dst);
     STORE16x16(out, dst);
@@ -461,8 +482,8 @@ static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
-                                            const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
+                                            const uint8_t* WEBP_RESTRICT left) {
   if (left != NULL) {
   if (left != NULL) {
     int j;
     int j;
     for (j = 0; j < 16; j += 4) {
     for (j = 0; j < 16; j += 4) {
@@ -480,8 +501,9 @@ static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top) {
+static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left,
+                                        const uint8_t* WEBP_RESTRICT top) {
   if (left != NULL) {
   if (left != NULL) {
     if (top != NULL) {
     if (top != NULL) {
       int j;
       int j;
@@ -519,8 +541,9 @@ static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
-                                    const uint8_t* top) {
+static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
+                                    const uint8_t* WEBP_RESTRICT left,
+                                    const uint8_t* WEBP_RESTRICT top) {
   int DC;
   int DC;
   v16u8 out;
   v16u8 out;
   if (top != NULL && left != NULL) {
   if (top != NULL && left != NULL) {
@@ -548,8 +571,9 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
   STORE16x16(out, dst);
   STORE16x16(out, dst);
 }
 }
 
 
-static void Intra16Preds_MSA(uint8_t* dst,
-                             const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT left,
+                             const uint8_t* WEBP_RESTRICT top) {
   DCMode16x16(I16DC16 + dst, left, top);
   DCMode16x16(I16DC16 + dst, left, top);
   VerticalPred16x16(I16VE16 + dst, top);
   VerticalPred16x16(I16VE16 + dst, top);
   HorizontalPred16x16(I16HE16 + dst, left);
   HorizontalPred16x16(I16HE16 + dst, left);
@@ -574,7 +598,8 @@ static void Intra16Preds_MSA(uint8_t* dst,
   SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
   SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
 } while (0)
 } while (0)
 
 
-static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
   if (top != NULL) {
     const uint64_t out = LD(top);
     const uint64_t out = LD(top);
     STORE8x8(out, dst);
     STORE8x8(out, dst);
@@ -584,7 +609,8 @@ static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT left) {
   if (left != NULL) {
   if (left != NULL) {
     int j;
     int j;
     for (j = 0; j < 8; j += 4) {
     for (j = 0; j < 8; j += 4) {
@@ -606,8 +632,9 @@ static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
+                                      const uint8_t* WEBP_RESTRICT left,
+                                      const uint8_t* WEBP_RESTRICT top) {
   if (left != NULL) {
   if (left != NULL) {
     if (top != NULL) {
     if (top != NULL) {
       int j;
       int j;
@@ -646,8 +673,9 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   uint64_t out;
   uint64_t out;
   v16u8 src = { 0 };
   v16u8 src = { 0 };
   if (top != NULL && left != NULL) {
   if (top != NULL && left != NULL) {
@@ -670,8 +698,9 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
   STORE8x8(out, dst);
   STORE8x8(out, dst);
 }
 }
 
 
-static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT left,
+                                 const uint8_t* WEBP_RESTRICT top) {
   // U block
   // U block
   DCMode8x8(C8DC8 + dst, left, top);
   DCMode8x8(C8DC8 + dst, left, top);
   VerticalPred8x8(C8VE8 + dst, top);
   VerticalPred8x8(C8VE8 + dst, top);
@@ -712,7 +741,8 @@ static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
   DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
   DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)
 } while (0)
 
 
-static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -739,7 +769,8 @@ static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
   return sum;
 }
 }
 
 
-static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -758,7 +789,8 @@ static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
   return sum;
 }
 }
 
 
-static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum;
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -778,7 +810,8 @@ static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
   return sum;
   return sum;
 }
 }
 
 
-static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
+                      const uint8_t* WEBP_RESTRICT b) {
   uint32_t sum = 0;
   uint32_t sum = 0;
   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
   v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
   v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
@@ -801,7 +834,7 @@ static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
 // Quantization
 // Quantization
 
 
 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
-                             const VP8Matrix* const mtx) {
+                             const VP8Matrix* WEBP_RESTRICT const mtx) {
   int sum;
   int sum;
   v8i16 in0, in1, sh0, sh1, out0, out1;
   v8i16 in0, in1, sh0, sh1, out0, out1;
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@@ -854,7 +887,7 @@ static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
 }
 }
 
 
 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;

+ 317 - 30
thirdparty/libwebp/src/dsp/enc_neon.c

@@ -60,8 +60,8 @@ static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
 
 
 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                     const int16x8_t row23,
                                     const int16x8_t row23,
-                                    const uint8_t* const ref,
-                                    uint8_t* const dst) {
+                                    const uint8_t* WEBP_RESTRICT const ref,
+                                    uint8_t* WEBP_RESTRICT const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
 
 
@@ -120,8 +120,9 @@ static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   Transpose8x2_NEON(E0, E1, rows);
   Transpose8x2_NEON(E0, E1, rows);
 }
 }
 
 
-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
   int16x8x2_t rows;
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   TransformPass_NEON(&rows);
   TransformPass_NEON(&rows);
@@ -131,8 +132,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
 
 
 #else
 #else
 
 
-static void ITransformOne_NEON(const uint8_t* ref,
-                               const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
+                               const int16_t* WEBP_RESTRICT in,
+                               uint8_t* WEBP_RESTRICT dst) {
   const int kBPS = BPS;
   const int kBPS = BPS;
   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
 
 
@@ -247,8 +249,9 @@ static void ITransformOne_NEON(const uint8_t* ref,
 
 
 #endif    // WEBP_USE_INTRINSICS
 #endif    // WEBP_USE_INTRINSICS
 
 
-static void ITransform_NEON(const uint8_t* ref,
-                            const int16_t* in, uint8_t* dst, int do_two) {
+static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
+                            const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst, int do_two) {
   ITransformOne_NEON(ref, in, dst);
   ITransformOne_NEON(ref, in, dst);
   if (do_two) {
   if (do_two) {
     ITransformOne_NEON(ref + 4, in + 16, dst + 4);
     ITransformOne_NEON(ref + 4, in + 16, dst + 4);
@@ -294,8 +297,9 @@ static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
   return vreinterpretq_s16_u16(vsubl_u8(a, b));
   return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }
 }
 
 
-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
   {
   {
     const uint8x16_t S0 = Load4x4_NEON(src);
     const uint8x16_t S0 = Load4x4_NEON(src);
@@ -364,8 +368,9 @@ static const int32_t kCoeff32[] = {
   51000, 51000, 51000, 51000
   51000, 51000, 51000, 51000
 };
 };
 
 
-static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   const int kBPS = BPS;
   const int kBPS = BPS;
   const uint8_t* src_ptr = src;
   const uint8_t* src_ptr = src;
   const uint8_t* ref_ptr = ref;
   const uint8_t* ref_ptr = ref;
@@ -484,7 +489,8 @@ static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
   src += stride;                                    \
   src += stride;                                    \
 } while (0)
 } while (0)
 
 
-static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
+                               int16_t* WEBP_RESTRICT out) {
   const int stride = 16;
   const int stride = 16;
   const int16x4_t zero = vdup_n_s16(0);
   const int16x4_t zero = vdup_n_s16(0);
   int32x4x4_t tmp0;
   int32x4x4_t tmp0;
@@ -659,8 +665,9 @@ static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
 // Hadamard transform
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
+                         const uint8_t* WEBP_RESTRICT const b,
+                         const uint16_t* WEBP_RESTRICT const w) {
   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@@ -701,8 +708,9 @@ static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
 }
 }
 #undef LOAD_LANE_32b
 #undef LOAD_LANE_32b
 
 
-static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -715,9 +723,10 @@ static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
+                                  const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
                                   int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
   int j;
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -747,9 +756,9 @@ static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
-                                             const uint8_t* const b,
-                                             uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(
+    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
+    uint32x4_t* const sum) {
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -775,7 +784,8 @@ static int SumToInt_NEON(uint32x4_t sum) {
 #endif
 #endif
 }
 }
 
 
-static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   int y;
   for (y = 0; y < 16; ++y) {
   for (y = 0; y < 16; ++y) {
@@ -784,7 +794,8 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
   return SumToInt_NEON(sum);
 }
 }
 
 
-static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   int y;
   for (y = 0; y < 8; ++y) {
   for (y = 0; y < 8; ++y) {
@@ -793,7 +804,8 @@ static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
   return SumToInt_NEON(sum);
 }
 }
 
 
-static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   uint32x4_t sum = vdupq_n_u32(0);
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   int y;
   for (y = 0; y < 8; ++y) {
   for (y = 0; y < 8; ++y) {
@@ -806,7 +818,8 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
   return SumToInt_NEON(sum);
   return SumToInt_NEON(sum);
 }
 }
 
 
-static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const uint8x16_t a0 = Load4x4_NEON(a);
   const uint8x16_t a0 = Load4x4_NEON(a);
   const uint8x16_t b0 = Load4x4_NEON(b);
   const uint8x16_t b0 = Load4x4_NEON(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -825,8 +838,9 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)
 #if !defined(WORK_AROUND_GCC)
 
 
-static int16x8_t Quantize_NEON(int16_t* const in,
-                               const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
+                               const VP8Matrix* WEBP_RESTRICT const mtx,
+                               int offset) {
   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@@ -860,7 +874,7 @@ static const uint8_t kShuffles[4][8] = {
 };
 };
 
 
 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
   const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
   const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
   const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
   const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
   uint8x8x4_t shuffles;
   uint8x8x4_t shuffles;
@@ -902,7 +916,7 @@ static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
 }
 }
 
 
 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
   nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -911,6 +925,271 @@ static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
 
 
 #endif   // !WORK_AROUND_GCC
 #endif   // !WORK_AROUND_GCC
 
 
+#if WEBP_AARCH64
+
+#if BPS == 32
+#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                              \
+  do {                                                                         \
+    uint8x16_t r;                                                              \
+    r = vqtbl2q_u8(qcombined, tbl);                                            \
+    r = vreinterpretq_u8_u32(                                                  \
+        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),          \
+                       vreinterpretq_u32_u8(r), 1));                           \
+    vst1q_u8(dst, r);                                                          \
+  } while (0)
+
+#define RD4_VR4_LD4_VL4_NEON(dst, tbl)                                         \
+  do {                                                                         \
+    uint8x16_t r;                                                              \
+    r = vqtbl2q_u8(qcombined, tbl);                                            \
+    vst1q_u8(dst, r);                                                          \
+  } while (0)
+
+static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT top) {
+  // 0   1   2   3   4   5   6   7   8   9  10  11  12  13
+  //     L   K   J   I   X   A   B   C   D   E   F   G   H
+  //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
+  static const uint8_t kLookupTbl1[64] = {
+    0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
+    3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
+    4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
+    2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
+  };
+
+  static const uint8_t kLookupTbl2[64] = {
+    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
+    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
+    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
+    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
+  };
+
+  static const uint8_t kLookupTbl3[64] = {
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
+    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
+  };
+
+  const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
+  const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
+  const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
+
+  const uint8x16_t preload = vld1q_u8(top - 5);
+  uint8x16x2_t qcombined;
+  uint8x16_t result0, result1;
+
+  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
+  uint8x16_t b = preload;
+  uint8x16_t c = vextq_u8(a, a, 2);
+
+  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
+  uint8x16_t avg2_all = vrhaddq_u8(a, b);
+
+  uint8x8_t preload_x8, sub_a, sub_c;
+  uint8_t result_u8;
+  uint8x8_t res_lo, res_hi;
+  uint8x16_t full_b;
+  uint16x8_t sub, sum_lo, sum_hi;
+
+  preload_x8 = vget_low_u8(c);
+  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
+
+  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
+
+  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
+  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
+
+  qcombined.val[0] = avg2_all;
+  qcombined.val[1] = avg3_all;
+
+  sub_a = vdup_laneq_u8(preload, 4);
+
+  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
+  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
+  // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
+  sub_c = vreinterpret_u8_u32(vdup_n_u32(
+      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
+
+  sub = vsubl_u8(sub_c, sub_a);
+  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
+  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
+
+  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
+  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
+
+  // DC4, VE4, HE4, TM4
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
+  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
+
+  // RD4, VR4, LD4, VL4
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
+  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
+
+  // HD4, HU4
+  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
+  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
+
+  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
+  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
+  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
+  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
+}
+#endif  // BPS == 32
+
+static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
+  uint8x16_t a = vdupq_n_u8(value);
+  int i;
+  for (i = 0; i < 16; i++) {
+    vst1q_u8(dst + BPS * i, a);
+  }
+}
+
+static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
+  uint8x16_t a = vld1q_u8(src);
+  int i;
+  for (i = 0; i < 16; i++) {
+    vst1q_u8(dst + BPS * i, a);
+  }
+}
+
+static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
+                                              const uint8_t* left) {
+  uint8x16_t a;
+
+  if (left == NULL) {
+    Fill_NEON(dst, 129);
+    return;
+  }
+
+  a = vld1q_u8(left + 0);
+  vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
+  vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
+  vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
+  vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
+  vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
+  vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
+  vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
+  vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
+  vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
+  vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
+  vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
+  vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
+  vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
+  vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
+  vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
+  vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
+}
+
+static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
+  if (top != NULL) {
+    Fill16_NEON(dst, top);
+  } else {
+    Fill_NEON(dst, 127);
+  }
+}
+
+static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
+                                    const uint8_t* top) {
+  uint8_t s;
+
+  if (top != NULL) {
+    uint16_t dc;
+    dc = vaddlvq_u8(vld1q_u8(top));
+    if (left != NULL) {
+      // top and left present.
+      dc += vaddlvq_u8(vld1q_u8(left));
+      s = vqrshrnh_n_u16(dc, 5);
+    } else {
+      // top but no left.
+      s = vqrshrnh_n_u16(dc, 4);
+    }
+  } else {
+    if (left != NULL) {
+      uint16_t dc;
+      // left but no top.
+      dc = vaddlvq_u8(vld1q_u8(left));
+      s = vqrshrnh_n_u16(dc, 4);
+    } else {
+      // No top, no left, nothing.
+      s = 0x80;
+    }
+  }
+  Fill_NEON(dst, s);
+}
+
+static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
+                                              const uint8x8_t outer,
+                                              const uint8x8x2_t inner,
+                                              const uint16x8_t a, int i,
+                                              const int n) {
+  uint8x8_t d1, d2;
+  uint16x8_t r1, r2;
+
+  r1 = vaddl_u8(outer, inner.val[0]);
+  r1 = vqsubq_u16(r1, a);
+  d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
+  r2 = vaddl_u8(outer, inner.val[1]);
+  r2 = vqsubq_u16(r2, a);
+  d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
+  vst1_u8(dst + BPS * (i * 4 + n), d1);
+  vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
+}
+
+static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top) {
+  int i;
+  uint16x8_t a;
+  uint8x8x2_t inner;
+
+  if (left == NULL) {
+    // True motion without left samples (hence: with default 129 value) is
+    // equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is then
+    // 129, and not 127 as in the VerticalPred case.
+    if (top != NULL) {
+      VerticalPred16_NEON(dst, top);
+    } else {
+      Fill_NEON(dst, 129);
+    }
+    return;
+  }
+
+  // left is not NULL.
+  if (top == NULL) {
+    HorizontalPred16_NEON(dst, left);
+    return;
+  }
+
+  // Neither left nor top are NULL.
+  a = vdupq_n_u16(left[-1]);
+  inner = vld1_u8_x2(top);
+
+  for (i = 0; i < 4; i++) {
+    const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
+
+    TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
+    TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
+    TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
+    TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
+  }
+}
+
+static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top) {
+  DCMode_NEON(I16DC16 + dst, left, top);
+  VerticalPred16_NEON(I16VE16 + dst, top);
+  HorizontalPred16_NEON(I16HE16 + dst, left);
+  TrueMotion_NEON(I16TM16 + dst, left, top);
+}
+
+#endif // WEBP_AARCH64
+
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Entry point
 // Entry point
 
 
@@ -931,9 +1210,17 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
   VP8SSE8x8 = SSE8x8_NEON;
   VP8SSE8x8 = SSE8x8_NEON;
   VP8SSE4x4 = SSE4x4_NEON;
   VP8SSE4x4 = SSE4x4_NEON;
 
 
+#if WEBP_AARCH64
+#if BPS == 32
+  VP8EncPredLuma4 = Intra4Preds_NEON;
+#endif
+  VP8EncPredLuma16 = Intra16Preds_NEON;
+#endif
+
 #if !defined(WORK_AROUND_GCC)
 #if !defined(WORK_AROUND_GCC)
   VP8EncQuantizeBlock = QuantizeBlock_NEON;
   VP8EncQuantizeBlock = QuantizeBlock_NEON;
   VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
   VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
 #endif
 #endif
 }
 }
 
 

+ 125 - 77
thirdparty/libwebp/src/dsp/enc_sse2.c

@@ -26,8 +26,9 @@
 // Transforms (Paragraph 14.4)
 // Transforms (Paragraph 14.4)
 
 
 // Does one inverse transform.
 // Does one inverse transform.
-static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
-                                uint8_t* dst) {
+static void ITransform_One_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                const int16_t* WEBP_RESTRICT in,
+                                uint8_t* WEBP_RESTRICT dst) {
   // This implementation makes use of 16-bit fixed point versions of two
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -177,8 +178,9 @@ static void ITransform_One_SSE2(const uint8_t* ref, const int16_t* in,
 }
 }
 
 
 // Does two inverse transforms.
 // Does two inverse transforms.
-static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
-                                uint8_t* dst) {
+static void ITransform_Two_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                const int16_t* WEBP_RESTRICT in,
+                                uint8_t* WEBP_RESTRICT dst) {
   // This implementation makes use of 16-bit fixed point versions of two
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -316,7 +318,9 @@ static void ITransform_Two_SSE2(const uint8_t* ref, const int16_t* in,
 }
 }
 
 
 // Does one or two inverse transforms.
 // Does one or two inverse transforms.
-static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+static void ITransform_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                            const int16_t* WEBP_RESTRICT in,
+                            uint8_t* WEBP_RESTRICT dst,
                             int do_two) {
                             int do_two) {
   if (do_two) {
   if (do_two) {
     ITransform_Two_SSE2(ref, in, dst);
     ITransform_Two_SSE2(ref, in, dst);
@@ -373,7 +377,7 @@ static void FTransformPass1_SSE2(const __m128i* const in01,
 
 
 static void FTransformPass2_SSE2(const __m128i* const v01,
 static void FTransformPass2_SSE2(const __m128i* const v01,
                                  const __m128i* const v32,
                                  const __m128i* const v32,
-                                 int16_t* out) {
+                                 int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   const __m128i seven = _mm_set1_epi16(7);
   const __m128i seven = _mm_set1_epi16(7);
   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@@ -424,8 +428,9 @@ static void FTransformPass2_SSE2(const __m128i* const v01,
   _mm_storeu_si128((__m128i*)&out[8], d2_f3);
   _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }
 }
 
 
-static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
-                            int16_t* out) {
+static void FTransform_SSE2(const uint8_t* WEBP_RESTRICT src,
+                            const uint8_t* WEBP_RESTRICT ref,
+                            int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   // Load src.
   // Load src.
   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@@ -468,8 +473,9 @@ static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
   FTransformPass2_SSE2(&v01, &v32, out);
   FTransformPass2_SSE2(&v01, &v32, out);
 }
 }
 
 
-static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
-                             int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* WEBP_RESTRICT src,
+                             const uint8_t* WEBP_RESTRICT ref,
+                             int16_t* WEBP_RESTRICT out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
 
 
   // Load src and convert to 16b.
   // Load src and convert to 16b.
@@ -517,7 +523,8 @@ static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
   FTransformPass2_SSE2(&v01h, &v32h, out + 16);
   FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }
 }
 
 
-static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* WEBP_RESTRICT const in,
+                                  __m128i* const out) {
   const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
   const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
   const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
   const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
   const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
   const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@@ -533,7 +540,8 @@ static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
   *out = _mm_madd_epi16(D, kMult);
   *out = _mm_madd_epi16(D, kMult);
 }
 }
 
 
-static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* WEBP_RESTRICT in,
+                               int16_t* WEBP_RESTRICT out) {
   // Input is 12b signed.
   // Input is 12b signed.
   __m128i row0, row1, row2, row3;
   __m128i row0, row1, row2, row3;
   // Rows are 14b signed.
   // Rows are 14b signed.
@@ -566,9 +574,10 @@ static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 // the higher, the "easier" the macroblock is to compress.
 
 
-static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_SSE2(const uint8_t* WEBP_RESTRICT ref,
+                                  const uint8_t* WEBP_RESTRICT pred,
                                   int start_block, int end_block,
                                   int start_block, int end_block,
-                                  VP8Histogram* const histo) {
+                                  VP8Histogram* WEBP_RESTRICT const histo) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
   int j;
@@ -640,7 +649,8 @@ static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT top) {
   int j;
   int j;
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   for (j = 0; j < 8; ++j) {
   for (j = 0; j < 8; ++j) {
@@ -648,7 +658,8 @@ static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_values = _mm_load_si128((const __m128i*)top);
   const __m128i top_values = _mm_load_si128((const __m128i*)top);
   int j;
   int j;
   for (j = 0; j < 16; ++j) {
   for (j = 0; j < 16; ++j) {
@@ -656,8 +667,9 @@ static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
-                                          const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                          const uint8_t* WEBP_RESTRICT top,
+                                          int size) {
   if (top != NULL) {
   if (top != NULL) {
     if (size == 8) {
     if (size == 8) {
       VE8uv_SSE2(dst, top);
       VE8uv_SSE2(dst, top);
@@ -669,7 +681,8 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left) {
   int j;
   int j;
   for (j = 0; j < 8; ++j) {
   for (j = 0; j < 8; ++j) {
     const __m128i values = _mm_set1_epi8((char)left[j]);
     const __m128i values = _mm_set1_epi8((char)left[j]);
@@ -678,7 +691,8 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left) {
   int j;
   int j;
   for (j = 0; j < 16; ++j) {
   for (j = 0; j < 16; ++j) {
     const __m128i values = _mm_set1_epi8((char)left[j]);
     const __m128i values = _mm_set1_epi8((char)left[j]);
@@ -687,8 +701,9 @@ static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
-                                            const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                            const uint8_t* WEBP_RESTRICT left,
+                                            int size) {
   if (left != NULL) {
   if (left != NULL) {
     if (size == 8) {
     if (size == 8) {
       HE8uv_SSE2(dst, left);
       HE8uv_SSE2(dst, left);
@@ -700,8 +715,9 @@ static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
-                                const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                const uint8_t* WEBP_RESTRICT left,
+                                const uint8_t* WEBP_RESTRICT top, int size) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   int y;
   int y;
   if (size == 8) {
   if (size == 8) {
@@ -728,8 +744,10 @@ static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
-                                        const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left,
+                                        const uint8_t* WEBP_RESTRICT top,
+                                        int size) {
   if (left != NULL) {
   if (left != NULL) {
     if (top != NULL) {
     if (top != NULL) {
       TM_SSE2(dst, left, top, size);
       TM_SSE2(dst, left, top, size);
@@ -749,8 +767,9 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                   const uint8_t* WEBP_RESTRICT left,
+                                   const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
   const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
   const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
   const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
@@ -758,7 +777,8 @@ static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
   Put8x8uv_SSE2(DC >> 4, dst);
   Put8x8uv_SSE2(DC >> 4, dst);
 }
 }
 
 
-static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                         const uint8_t* WEBP_RESTRICT top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i sum = _mm_sad_epu8(top_values, zero);
   const __m128i sum = _mm_sad_epu8(top_values, zero);
@@ -766,7 +786,8 @@ static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   Put8x8uv_SSE2(DC >> 3, dst);
   Put8x8uv_SSE2(DC >> 3, dst);
 }
 }
 
 
-static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT left) {
   // 'left' is contiguous so we can reuse the top summation.
   // 'left' is contiguous so we can reuse the top summation.
   DC8uvNoLeft_SSE2(dst, left);
   DC8uvNoLeft_SSE2(dst, left);
 }
 }
@@ -775,8 +796,9 @@ static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
   Put8x8uv_SSE2(0x80, dst);
   Put8x8uv_SSE2(0x80, dst);
 }
 }
 
 
-static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
-                                       const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left,
+                                       const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
     if (left != NULL) {  // top and left present
       DC8uv_SSE2(dst, left, top);
       DC8uv_SSE2(dst, left, top);
@@ -790,8 +812,9 @@ static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i left_row = _mm_load_si128((const __m128i*)left);
   const __m128i left_row = _mm_load_si128((const __m128i*)left);
   const int DC =
   const int DC =
@@ -799,13 +822,15 @@ static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
   Put16_SSE2(DC >> 5, dst);
   Put16_SSE2(DC >> 5, dst);
 }
 }
 
 
-static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                        const uint8_t* WEBP_RESTRICT top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const int DC = VP8HorizontalAdd8b(&top_row) + 8;
   const int DC = VP8HorizontalAdd8b(&top_row) + 8;
   Put16_SSE2(DC >> 4, dst);
   Put16_SSE2(DC >> 4, dst);
 }
 }
 
 
-static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                       const uint8_t* WEBP_RESTRICT left) {
   // 'left' is contiguous so we can reuse the top summation.
   // 'left' is contiguous so we can reuse the top summation.
   DC16NoLeft_SSE2(dst, left);
   DC16NoLeft_SSE2(dst, left);
 }
 }
@@ -814,8 +839,9 @@ static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
   Put16_SSE2(0x80, dst);
   Put16_SSE2(0x80, dst);
 }
 }
 
 
-static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
-                                      const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                      const uint8_t* WEBP_RESTRICT left,
+                                      const uint8_t* WEBP_RESTRICT top) {
   if (top != NULL) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
     if (left != NULL) {  // top and left present
       DC16_SSE2(dst, left, top);
       DC16_SSE2(dst, left, top);
@@ -844,8 +870,9 @@ static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 
 
-static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // vertical
+// vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -861,8 +888,9 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // horizontal
+// horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -874,15 +902,17 @@ static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
 }
 
 
-static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   uint32_t dc = 4;
   uint32_t dc = 4;
   int i;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
   Fill_SSE2(dst, dc >> 3, 4);
   Fill_SSE2(dst, dc >> 3, 4);
 }
 }
 
 
-static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-Left
+// Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -898,8 +928,9 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
   WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
   WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 }
 
 
-static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Right
+// Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i one = _mm_set1_epi8(1);
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -924,8 +955,9 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
   DST(0, 3) = AVG3(K, J, I);
   DST(0, 3) = AVG3(K, J, I);
 }
 }
 
 
-static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Vertical-Left
+// Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -951,8 +983,9 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
   DST(3, 3) = (extra_out >> 8) & 0xff;
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }
 }
 
 
-static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
-                                 const uint8_t* top) {  // Down-right
+// Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i one = _mm_set1_epi8(1);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
   const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
   const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
   const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@@ -968,7 +1001,8 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
   WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
   WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 }
 
 
-static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
   const int K = top[-4];
   const int K = top[-4];
@@ -983,7 +1017,8 @@ static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 }
 
 
-static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const int X = top[-1];
   const int X = top[-1];
   const int I = top[-2];
   const int I = top[-2];
   const int J = top[-3];
   const int J = top[-3];
@@ -1006,7 +1041,8 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
   DST(1, 3)             = AVG3(L, K, J);
   DST(1, 3)             = AVG3(L, K, J);
 }
 }
 
 
-static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                 const uint8_t* WEBP_RESTRICT top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
   const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@@ -1028,7 +1064,8 @@ static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
 
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                             const uint8_t* WEBP_RESTRICT top) {
   DC4_SSE2(I4DC4 + dst, top);
   DC4_SSE2(I4DC4 + dst, top);
   TM4_SSE2(I4TM4 + dst, top);
   TM4_SSE2(I4TM4 + dst, top);
   VE4_SSE2(I4VE4 + dst, top);
   VE4_SSE2(I4VE4 + dst, top);
@@ -1044,8 +1081,9 @@ static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 // Chroma 8x8 prediction (paragraph 12.2)
 
 
-static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                                  const uint8_t* WEBP_RESTRICT left,
+                                  const uint8_t* WEBP_RESTRICT top) {
   // U block
   // U block
   DC8uvMode_SSE2(C8DC8 + dst, left, top);
   DC8uvMode_SSE2(C8DC8 + dst, left, top);
   VerticalPred_SSE2(C8VE8 + dst, top, 8);
   VerticalPred_SSE2(C8VE8 + dst, top, 8);
@@ -1064,8 +1102,9 @@ static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 // luma 16x16 prediction (paragraph 12.3)
 
 
-static void Intra16Preds_SSE2(uint8_t* dst,
-                              const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_SSE2(uint8_t* WEBP_RESTRICT dst,
+                              const uint8_t* WEBP_RESTRICT left,
+                              const uint8_t* WEBP_RESTRICT top) {
   DC16Mode_SSE2(I16DC16 + dst, left, top);
   DC16Mode_SSE2(I16DC16 + dst, left, top);
   VerticalPred_SSE2(I16VE16 + dst, top, 16);
   VerticalPred_SSE2(I16VE16 + dst, top, 16);
   HorizontalPred_SSE2(I16HE16 + dst, left, 16);
   HorizontalPred_SSE2(I16HE16 + dst, left, 16);
@@ -1092,7 +1131,8 @@ static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
   *sum = _mm_add_epi32(sum1, sum2);
   *sum = _mm_add_epi32(sum1, sum2);
 }
 }
 
 
-static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* WEBP_RESTRICT a,
+                                     const uint8_t* WEBP_RESTRICT b,
                                      int num_pairs) {
                                      int num_pairs) {
   __m128i sum = _mm_setzero_si128();
   __m128i sum = _mm_setzero_si128();
   int32_t tmp[4];
   int32_t tmp[4];
@@ -1114,18 +1154,21 @@ static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
 }
 
 
-static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_SSE2(const uint8_t* WEBP_RESTRICT a,
+                         const uint8_t* WEBP_RESTRICT b) {
   return SSE_16xN_SSE2(a, b, 8);
   return SSE_16xN_SSE2(a, b, 8);
 }
 }
 
 
-static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_SSE2(const uint8_t* WEBP_RESTRICT a,
+                        const uint8_t* WEBP_RESTRICT b) {
   return SSE_16xN_SSE2(a, b, 4);
   return SSE_16xN_SSE2(a, b, 4);
 }
 }
 
 
 #define LOAD_8x16b(ptr) \
 #define LOAD_8x16b(ptr) \
   _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
   _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
 
 
-static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   int num_pairs = 4;
   int num_pairs = 4;
   __m128i sum = zero;
   __m128i sum = zero;
@@ -1152,7 +1195,8 @@ static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
 }
 }
 #undef LOAD_8x16b
 #undef LOAD_8x16b
 
 
-static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* WEBP_RESTRICT a,
+                       const uint8_t* WEBP_RESTRICT b) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
 
 
   // Load values. Note that we read 8 pixels instead of 4,
   // Load values. Note that we read 8 pixels instead of 4,
@@ -1189,7 +1233,7 @@ static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
   const __m128i mask = _mm_set1_epi16(0x00ff);
   const __m128i mask = _mm_set1_epi16(0x00ff);
   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@@ -1227,8 +1271,9 @@ static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
-                           const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* WEBP_RESTRICT inA,
+                           const uint8_t* WEBP_RESTRICT inB,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int32_t sum[4];
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
@@ -1328,14 +1373,16 @@ static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 }
 
 
-static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
-                         const uint16_t* const w) {
+static int Disto4x4_SSE2(const uint8_t* WEBP_RESTRICT const a,
+                         const uint8_t* WEBP_RESTRICT const b,
+                         const uint16_t* WEBP_RESTRICT const w) {
   const int diff_sum = TTransform_SSE2(a, b, w);
   const int diff_sum = TTransform_SSE2(a, b, w);
   return abs(diff_sum) >> 5;
   return abs(diff_sum) >> 5;
 }
 }
 
 
-static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
-                           const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* WEBP_RESTRICT const a,
+                           const uint8_t* WEBP_RESTRICT const b,
+                           const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -1350,9 +1397,10 @@ static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 // Quantization
 //
 //
 
 
-static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                                            const uint16_t* const sharpen,
-                                            const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(
+    int16_t in[16], int16_t out[16],
+    const uint16_t* WEBP_RESTRICT const sharpen,
+    const VP8Matrix* WEBP_RESTRICT const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   __m128i coeff0, coeff8;
   __m128i coeff0, coeff8;
@@ -1463,17 +1511,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
 }
 }
 
 
 static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
-                              const VP8Matrix* const mtx) {
+                              const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
   return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
 }
 }
 
 
 static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
 static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
-                                 const VP8Matrix* const mtx) {
+                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
   return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
 }
 }
 
 
 static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
-                                const VP8Matrix* const mtx) {
+                                const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
   const uint16_t* const sharpen = &mtx->sharpen_[0];
   nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
   nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;

+ 12 - 9
thirdparty/libwebp/src/dsp/enc_sse41.c

@@ -23,9 +23,10 @@
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
 // Compute susceptibility based on DCT-coeff histograms.
 
 
-static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+static void CollectHistogram_SSE41(const uint8_t* WEBP_RESTRICT ref,
+                                   const uint8_t* WEBP_RESTRICT pred,
                                    int start_block, int end_block,
                                    int start_block, int end_block,
-                                   VP8Histogram* const histo) {
+                                   VP8Histogram* WEBP_RESTRICT const histo) {
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -168,14 +169,16 @@ static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 }
 
 
-static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
-                          const uint16_t* const w) {
+static int Disto4x4_SSE41(const uint8_t* WEBP_RESTRICT const a,
+                          const uint8_t* WEBP_RESTRICT const b,
+                          const uint16_t* WEBP_RESTRICT const w) {
   const int diff_sum = TTransform_SSE41(a, b, w);
   const int diff_sum = TTransform_SSE41(a, b, w);
   return abs(diff_sum) >> 5;
   return abs(diff_sum) >> 5;
 }
 }
 
 
-static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
-                            const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* WEBP_RESTRICT const a,
+                            const uint8_t* WEBP_RESTRICT const b,
+                            const uint16_t* WEBP_RESTRICT const w) {
   int D = 0;
   int D = 0;
   int x, y;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
@@ -301,17 +304,17 @@ static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
 #undef PSHUFB_CST
 #undef PSHUFB_CST
 
 
 static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
 static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
-                               const VP8Matrix* const mtx) {
+                               const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
   return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }
 }
 
 
 static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
 static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
-                                  const VP8Matrix* const mtx) {
+                                  const VP8Matrix* WEBP_RESTRICT const mtx) {
   return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
   return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }
 }
 
 
 static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
 static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
-                                 const VP8Matrix* const mtx) {
+                                 const VP8Matrix* WEBP_RESTRICT const mtx) {
   int nz;
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
   const uint16_t* const sharpen = &mtx->sharpen_[0];
   nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
   nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;

+ 56 - 88
thirdparty/libwebp/src/dsp/filters.c

@@ -23,55 +23,42 @@
   do {                                                                         \
   do {                                                                         \
     assert((in) != NULL);                                                      \
     assert((in) != NULL);                                                      \
     assert((out) != NULL);                                                     \
     assert((out) != NULL);                                                     \
+    assert((in) != (out));                                                     \
     assert(width > 0);                                                         \
     assert(width > 0);                                                         \
     assert(height > 0);                                                        \
     assert(height > 0);                                                        \
     assert(stride >= width);                                                   \
     assert(stride >= width);                                                   \
-    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
-    (void)height;  /* Silence unused warning. */                               \
   } while (0)
   } while (0)
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
-                                      uint8_t* dst, int length, int inverse) {
+static WEBP_INLINE void PredictLine_C(const uint8_t* WEBP_RESTRICT src,
+                                      const uint8_t* WEBP_RESTRICT pred,
+                                      uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   int i;
-  if (inverse) {
-    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
-  } else {
-    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
-  }
+  for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Horizontal filter.
 // Horizontal filter.
 
 
-static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* WEBP_RESTRICT in,
                                              int width, int height, int stride,
                                              int width, int height, int stride,
-                                             int row, int num_rows,
-                                             int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                             uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
-
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLine_C(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     // Leftmost pixel is predicted from above.
     // Leftmost pixel is predicted from above.
-    PredictLine_C(in, preds - stride, out, 1, inverse);
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
-    ++row;
+    PredictLine_C(in, preds - stride, out, 1);
+    PredictLine_C(in + 1, preds, out + 1, width - 1);
     preds += stride;
     preds += stride;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
@@ -81,35 +68,23 @@ static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Vertical filter.
 // Vertical filter.
 
 
-static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* WEBP_RESTRICT in,
                                            int width, int height, int stride,
                                            int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                           uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
-
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    in += stride;
-    out += stride;
-  } else {
-    // We are starting from in-between. Make sure 'preds' points to prev row.
-    preds -= stride;
-  }
+
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLine_C(in + 1, preds, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
-    PredictLine_C(in, preds, out, width, inverse);
-    ++row;
+  for (row = 1; row < height; ++row) {
+    PredictLine_C(in, preds, out, width);
     preds += stride;
     preds += stride;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
@@ -126,40 +101,31 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
 }
 }
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* WEBP_RESTRICT in,
                                            int width, int height, int stride,
                                            int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                           uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = inverse ? out : in;
 
 
   // left prediction for top scan-line
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLine_C(in + 1, preds, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     int w;
     int w;
     // leftmost pixel: predict from above.
     // leftmost pixel: predict from above.
-    PredictLine_C(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1);
     for (w = 1; w < width; ++w) {
     for (w = 1; w < width; ++w) {
       const int pred = GradientPredictor_C(preds[w - 1],
       const int pred = GradientPredictor_C(preds[w - 1],
                                            preds[w - stride],
                                            preds[w - stride],
                                            preds[w - stride - 1]);
                                            preds[w - stride - 1]);
-      out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
+      out[w] = (uint8_t)(in[w] - pred);
     }
     }
-    ++row;
     preds += stride;
     preds += stride;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
@@ -172,20 +138,22 @@ static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
 #if !WEBP_NEON_OMIT_C_CODE
 #if !WEBP_NEON_OMIT_C_CODE
-static void HorizontalFilter_C(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
-                       filtered_data);
+static void HorizontalFilter_C(const uint8_t* WEBP_RESTRICT data,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_C(data, width, height, stride, filtered_data);
 }
 }
 
 
-static void VerticalFilter_C(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
+static void VerticalFilter_C(const uint8_t* WEBP_RESTRICT data,
+                             int width, int height, int stride,
+                             uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_C(data, width, height, stride, filtered_data);
 }
 }
 
 
-static void GradientFilter_C(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
+static void GradientFilter_C(const uint8_t* WEBP_RESTRICT data,
+                             int width, int height, int stride,
+                             uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_C(data, width, height, stride, filtered_data);
 }
 }
 #endif  // !WEBP_NEON_OMIT_C_CODE
 #endif  // !WEBP_NEON_OMIT_C_CODE
 
 

+ 53 - 85
thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c

@@ -26,13 +26,12 @@
 
 
 #define DCHECK(in, out)                                                        \
 #define DCHECK(in, out)                                                        \
   do {                                                                         \
   do {                                                                         \
-    assert(in != NULL);                                                        \
-    assert(out != NULL);                                                       \
+    assert((in) != NULL);                                                      \
+    assert((out) != NULL);                                                     \
+    assert((in) != (out));                                                     \
     assert(width > 0);                                                         \
     assert(width > 0);                                                         \
     assert(height > 0);                                                        \
     assert(height > 0);                                                        \
     assert(stride >= width);                                                   \
     assert(stride >= width);                                                   \
-    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
-    (void)height;  /* Silence unused warning. */                               \
   } while (0)
   } while (0)
 
 
 #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do {                        \
 #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do {                        \
@@ -103,7 +102,8 @@
     );                                                                         \
     );                                                                         \
   } while (0)
   } while (0)
 
 
-static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
+                                              uint8_t* WEBP_RESTRICT dst,
                                               int length) {
                                               int length) {
   DO_PREDICT_LINE(src, dst, length, 0);
   DO_PREDICT_LINE(src, dst, length, 0);
 }
 }
@@ -184,99 +184,75 @@ static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
 // Horizontal filter.
 // Horizontal filter.
 
 
 #define FILTER_LINE_BY_LINE do {                                               \
 #define FILTER_LINE_BY_LINE do {                                               \
-    while (row < last_row) {                                                   \
+    for (row = 1; row < height; ++row) {                                       \
       PREDICT_LINE_ONE_PASS(in, preds - stride, out);                          \
       PREDICT_LINE_ONE_PASS(in, preds - stride, out);                          \
       DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0);                          \
       DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0);                          \
-      ++row;                                                                   \
       preds += stride;                                                         \
       preds += stride;                                                         \
       in += stride;                                                            \
       in += stride;                                                            \
       out += stride;                                                           \
       out += stride;                                                           \
     }                                                                          \
     }                                                                          \
   } while (0)
   } while (0)
 
 
-static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
-                                                     int width, int height,
-                                                     int stride,
-                                                     int row, int num_rows,
-                                                     uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
-
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
   FILTER_LINE_BY_LINE;
   FILTER_LINE_BY_LINE;
 }
 }
 #undef FILTER_LINE_BY_LINE
 #undef FILTER_LINE_BY_LINE
 
 
-static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
-                                       int width, int height,
-                                       int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                               filtered_data);
+static void HorizontalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                       int width, int height, int stride,
+                                       uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Vertical filter.
 // Vertical filter.
 
 
 #define FILTER_LINE_BY_LINE do {                                               \
 #define FILTER_LINE_BY_LINE do {                                               \
-    while (row < last_row) {                                                   \
+    for (row = 1; row < height; ++row) {                                       \
       DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0);                      \
       DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0);                      \
-      ++row;                                                                   \
       preds += stride;                                                         \
       preds += stride;                                                         \
       in += stride;                                                            \
       in += stride;                                                            \
       out += stride;                                                           \
       out += stride;                                                           \
     }                                                                          \
     }                                                                          \
   } while (0)
   } while (0)
 
 
-static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
-                                                   int width, int height,
-                                                   int stride,
-                                                   int row, int num_rows,
-                                                   uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
-
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  } else {
-    // We are starting from in-between. Make sure 'preds' points to prev row.
-    preds -= stride;
-  }
+
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
   FILTER_LINE_BY_LINE;
   FILTER_LINE_BY_LINE;
 }
 }
 #undef FILTER_LINE_BY_LINE
 #undef FILTER_LINE_BY_LINE
 
 
-static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void VerticalFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                     int width, int height, int stride,
+                                     uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
@@ -297,7 +273,7 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
 }
 }
 
 
 #define FILTER_LINE_BY_LINE(PREDS, OPERATION) do {                             \
 #define FILTER_LINE_BY_LINE(PREDS, OPERATION) do {                             \
-    while (row < last_row) {                                                   \
+    for (row = 1; row < height; ++row) {                                       \
       int w;                                                                   \
       int w;                                                                   \
       PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
       PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
       for (w = 1; w < width; ++w) {                                            \
       for (w = 1; w < width; ++w) {                                            \
@@ -306,42 +282,34 @@ static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
                                                      PREDS[w - stride - 1]);   \
                                                      PREDS[w - stride - 1]);   \
         out[w] = in[w] OPERATION pred;                                         \
         out[w] = in[w] OPERATION pred;                                         \
       }                                                                        \
       }                                                                        \
-      ++row;                                                                   \
       in += stride;                                                            \
       in += stride;                                                            \
       out += stride;                                                           \
       out += stride;                                                           \
     }                                                                          \
     }                                                                          \
   } while (0)
   } while (0)
 
 
-static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+static void DoGradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT in,
                                        int width, int height, int stride,
                                        int width, int height, int stride,
-                                       int row, int num_rows, uint8_t* out) {
-  const uint8_t* preds;
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                       uint8_t* WEBP_RESTRICT out) {
+  const uint8_t* preds = in;
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
-  preds = in;
 
 
   // left prediction for top scan-line
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
-    row = 1;
-    preds += stride;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
+  preds += stride;
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
   FILTER_LINE_BY_LINE(in, -);
   FILTER_LINE_BY_LINE(in, -);
 }
 }
 #undef FILTER_LINE_BY_LINE
 #undef FILTER_LINE_BY_LINE
 
 
-static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
-                                     int stride, uint8_t* filtered_data) {
-  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
-                             filtered_data);
+static void GradientFilter_MIPSdspR2(const uint8_t* WEBP_RESTRICT data,
+                                     int width, int height, int stride,
+                                     uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------

+ 16 - 11
thirdparty/libwebp/src/dsp/filters_msa.c

@@ -21,7 +21,8 @@
 
 
 static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
                                             const uint8_t* pred,
                                             const uint8_t* pred,
-                                            uint8_t* dst, int length) {
+                                            uint8_t* WEBP_RESTRICT dst,
+                                            int length) {
   v16u8 src0, pred0, dst0;
   v16u8 src0, pred0, dst0;
   assert(length >= 0);
   assert(length >= 0);
   while (length >= 32) {
   while (length >= 32) {
@@ -58,8 +59,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 
 
 #define DCHECK(in, out)        \
 #define DCHECK(in, out)        \
   do {                         \
   do {                         \
-    assert(in != NULL);        \
-    assert(out != NULL);       \
+    assert((in) != NULL);      \
+    assert((out) != NULL);     \
+    assert((in) != (out));     \
     assert(width > 0);         \
     assert(width > 0);         \
     assert(height > 0);        \
     assert(height > 0);        \
     assert(stride >= width);   \
     assert(stride >= width);   \
@@ -68,8 +70,9 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Horrizontal filter
 // Horrizontal filter
 
 
-static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
-                                 int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                                 int width, int height, int stride,
+                                 uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* preds = data;
   const uint8_t* preds = data;
   const uint8_t* in = data;
   const uint8_t* in = data;
   uint8_t* out = filtered_data;
   uint8_t* out = filtered_data;
@@ -99,8 +102,8 @@ static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
 
 
 static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
                                             const uint8_t* ppred,
                                             const uint8_t* ppred,
-                                            uint8_t* poutput, int stride,
-                                            int size) {
+                                            uint8_t* WEBP_RESTRICT poutput,
+                                            int stride, int size) {
   int w;
   int w;
   const v16i8 zero = { 0 };
   const v16i8 zero = { 0 };
   while (size >= 16) {
   while (size >= 16) {
@@ -131,8 +134,9 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }
 }
 
 
 
 
-static void GradientFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* in = data;
   const uint8_t* in = data;
   const uint8_t* preds = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
   uint8_t* out = filtered_data;
@@ -159,8 +163,9 @@ static void GradientFilter_MSA(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Vertical filter
 // Vertical filter
 
 
-static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* WEBP_RESTRICT data,
+                               int width, int height, int stride,
+                               uint8_t* WEBP_RESTRICT filtered_data) {
   const uint8_t* in = data;
   const uint8_t* in = data;
   const uint8_t* preds = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
   uint8_t* out = filtered_data;

+ 49 - 74
thirdparty/libwebp/src/dsp/filters_neon.c

@@ -23,13 +23,12 @@
 
 
 #define DCHECK(in, out)                                                        \
 #define DCHECK(in, out)                                                        \
   do {                                                                         \
   do {                                                                         \
-    assert(in != NULL);                                                        \
-    assert(out != NULL);                                                       \
+    assert((in) != NULL);                                                      \
+    assert((out) != NULL);                                                     \
+    assert((in) != (out));                                                     \
     assert(width > 0);                                                         \
     assert(width > 0);                                                         \
     assert(height > 0);                                                        \
     assert(height > 0);                                                        \
     assert(stride >= width);                                                   \
     assert(stride >= width);                                                   \
-    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
-    (void)height;  /* Silence unused warning. */                               \
   } while (0)
   } while (0)
 
 
 // load eight u8 and widen to s16
 // load eight u8 and widen to s16
@@ -46,7 +45,7 @@
 #define ROTATE_RIGHT_N(A, N)   vext_u8((A), (A), (8 - (N)) % 8)
 #define ROTATE_RIGHT_N(A, N)   vext_u8((A), (A), (8 - (N)) % 8)
 
 
 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
-                             uint8_t* dst, int length) {
+                             uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   int i;
   assert(length >= 0);
   assert(length >= 0);
   for (i = 0; i + 16 <= length; i += 16) {
   for (i = 0; i + 16 <= length; i += 16) {
@@ -59,86 +58,70 @@ static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
 }
 }
 
 
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_NEON(const uint8_t* WEBP_RESTRICT src,
+                                 uint8_t* WEBP_RESTRICT dst, int length) {
   PredictLine_NEON(src, src - 1, dst, length);
   PredictLine_NEON(src, src - 1, dst, length);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Horizontal filter.
 // Horizontal filter.
 
 
-static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
-                                                int width, int height,
-                                                int stride,
-                                                int row, int num_rows,
-                                                uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_NEON(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     // Leftmost pixel is predicted from above.
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
     out[0] = in[0] - in[-stride];
     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
 }
 }
 
 
-static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
-                                  int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
-                          filtered_data);
+static void HorizontalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
+                                  int width, int height, int stride,
+                                  uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_NEON(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Vertical filter.
 // Vertical filter.
 
 
-static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* WEBP_RESTRICT in,
                                               int width, int height, int stride,
                                               int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     PredictLine_NEON(in, in - stride, out, width);
     PredictLine_NEON(in, in - stride, out, width);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
 }
 }
 
 
-static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_NEON(data, width, height, stride, 0, height,
-                        filtered_data);
+static void VerticalFilter_NEON(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_NEON(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
@@ -151,7 +134,8 @@ static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
 
 
 static void GradientPredictDirect_NEON(const uint8_t* const row,
 static void GradientPredictDirect_NEON(const uint8_t* const row,
                                        const uint8_t* const top,
                                        const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
+                                       int length) {
   int i;
   int i;
   for (i = 0; i + 8 <= length; i += 8) {
   for (i = 0; i + 8 <= length; i += 8) {
     const uint8x8_t A = vld1_u8(&row[i - 1]);
     const uint8x8_t A = vld1_u8(&row[i - 1]);
@@ -167,40 +151,31 @@ static void GradientPredictDirect_NEON(const uint8_t* const row,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
-                                              int width, int height,
-                                              int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* WEBP_RESTRICT in,
+                                              int width, int height, int stride,
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
   // left prediction for top scan-line
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLineLeft_NEON(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     out[0] = in[0] - in[-stride];
     out[0] = in[0] - in[-stride];
     GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
     GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
 }
 }
 
 
-static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoGradientFilter_NEON(data, width, height, stride, 0, height,
-                        filtered_data);
+static void GradientFilter_NEON(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_NEON(data, width, height, stride, filtered_data);
 }
 }
 
 
 #undef DCHECK
 #undef DCHECK

+ 48 - 69
thirdparty/libwebp/src/dsp/filters_sse2.c

@@ -27,15 +27,15 @@
   do {                                                                         \
   do {                                                                         \
     assert((in) != NULL);                                                      \
     assert((in) != NULL);                                                      \
     assert((out) != NULL);                                                     \
     assert((out) != NULL);                                                     \
+    assert((in) != (out));                                                     \
     assert(width > 0);                                                         \
     assert(width > 0);                                                         \
     assert(height > 0);                                                        \
     assert(height > 0);                                                        \
     assert(stride >= width);                                                   \
     assert(stride >= width);                                                   \
-    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
-    (void)height;  /* Silence unused warning. */                               \
   } while (0)
   } while (0)
 
 
-static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
-                                uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* WEBP_RESTRICT src,
+                                const uint8_t* WEBP_RESTRICT pred,
+                                uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   int i;
   const int max_pos = length & ~31;
   const int max_pos = length & ~31;
   assert(length >= 0);
   assert(length >= 0);
@@ -53,7 +53,8 @@ static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
 }
 }
 
 
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* WEBP_RESTRICT src,
+                                 uint8_t* WEBP_RESTRICT dst, int length) {
   int i;
   int i;
   const int max_pos = length & ~31;
   const int max_pos = length & ~31;
   assert(length >= 0);
   assert(length >= 0);
@@ -73,32 +74,23 @@ static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Horizontal filter.
 // Horizontal filter.
 
 
-static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
-                                                int width, int height,
-                                                int stride,
-                                                int row, int num_rows,
-                                                uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+static WEBP_INLINE void DoHorizontalFilter_SSE2(
+    const uint8_t* WEBP_RESTRICT in, int width, int height, int stride,
+    uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
-  if (row == 0) {
-    // Leftmost pixel is the same as input for topmost scanline.
-    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Leftmost pixel is the same as input for topmost scanline.
+  out[0] = in[0];
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     // Leftmost pixel is predicted from above.
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
     out[0] = in[0] - in[-stride];
     PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
@@ -107,30 +99,22 @@ static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Vertical filter.
 // Vertical filter.
 
 
-static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
                                               int width, int height, int stride,
                                               int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
-  if (row == 0) {
-    // Very first top-left pixel is copied.
-    out[0] = in[0];
-    // Rest of top scan-line is left-predicted.
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  // Very first top-left pixel is copied.
+  out[0] = in[0];
+  // Rest of top scan-line is left-predicted.
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     PredictLineTop_SSE2(in, in - stride, out, width);
     PredictLineTop_SSE2(in, in - stride, out, width);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
@@ -146,7 +130,8 @@ static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
 
 
 static void GradientPredictDirect_SSE2(const uint8_t* const row,
 static void GradientPredictDirect_SSE2(const uint8_t* const row,
                                        const uint8_t* const top,
                                        const uint8_t* const top,
-                                       uint8_t* const out, int length) {
+                                       uint8_t* WEBP_RESTRICT const out,
+                                       int length) {
   const int max_pos = length & ~7;
   const int max_pos = length & ~7;
   int i;
   int i;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
@@ -170,30 +155,22 @@ static void GradientPredictDirect_SSE2(const uint8_t* const row,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* WEBP_RESTRICT in,
                                               int width, int height, int stride,
                                               int width, int height, int stride,
-                                              int row, int num_rows,
-                                              uint8_t* out) {
-  const size_t start_offset = row * stride;
-  const int last_row = row + num_rows;
+                                              uint8_t* WEBP_RESTRICT out) {
+  int row;
   DCHECK(in, out);
   DCHECK(in, out);
-  in += start_offset;
-  out += start_offset;
 
 
   // left prediction for top scan-line
   // left prediction for top scan-line
-  if (row == 0) {
-    out[0] = in[0];
-    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
-    row = 1;
-    in += stride;
-    out += stride;
-  }
+  out[0] = in[0];
+  PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
+  in += stride;
+  out += stride;
 
 
   // Filter line-by-line.
   // Filter line-by-line.
-  while (row < last_row) {
+  for (row = 1; row < height; ++row) {
     out[0] = (uint8_t)(in[0] - in[-stride]);
     out[0] = (uint8_t)(in[0] - in[-stride]);
     GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
     GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
-    ++row;
     in += stride;
     in += stride;
     out += stride;
     out += stride;
   }
   }
@@ -203,20 +180,22 @@ static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
-                                  int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
-                          filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                  int width, int height, int stride,
+                                  uint8_t* WEBP_RESTRICT filtered_data) {
+  DoHorizontalFilter_SSE2(data, width, height, stride, filtered_data);
 }
 }
 
 
-static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoVerticalFilter_SSE2(data, width, height, stride, filtered_data);
 }
 }
 
 
-static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
-                                int stride, uint8_t* filtered_data) {
-  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* WEBP_RESTRICT data,
+                                int width, int height, int stride,
+                                uint8_t* WEBP_RESTRICT filtered_data) {
+  DoGradientFilter_SSE2(data, width, height, stride, filtered_data);
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------

+ 18 - 18
thirdparty/libwebp/src/dsp/lossless.c

@@ -107,14 +107,14 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Predictors
 // Predictors
 
 
-uint32_t VP8LPredictor0_C(const uint32_t* const left,
-                          const uint32_t* const top) {
+static uint32_t VP8LPredictor0_C(const uint32_t* const left,
+                                 const uint32_t* const top) {
   (void)top;
   (void)top;
   (void)left;
   (void)left;
   return ARGB_BLACK;
   return ARGB_BLACK;
 }
 }
-uint32_t VP8LPredictor1_C(const uint32_t* const left,
-                          const uint32_t* const top) {
+static uint32_t VP8LPredictor1_C(const uint32_t* const left,
+                                 const uint32_t* const top) {
   (void)top;
   (void)top;
   return *left;
   return *left;
 }
 }
@@ -182,13 +182,13 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 }
 }
 
 
 static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int x;
   int x;
   (void)upper;
   (void)upper;
   for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
   for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
 }
 }
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint32_t left = out[-1];
   uint32_t left = out[-1];
   (void)upper;
   (void)upper;
@@ -441,8 +441,8 @@ static int is_big_endian(void) {
   return (tmp.b[0] != 1);
   return (tmp.b[0] != 1);
 }
 }
 
 
-void VP8LConvertBGRAToRGB_C(const uint32_t* src,
-                            int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src,
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
   while (src < src_end) {
     const uint32_t argb = *src++;
     const uint32_t argb = *src++;
@@ -452,8 +452,8 @@ void VP8LConvertBGRAToRGB_C(const uint32_t* src,
   }
   }
 }
 }
 
 
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src,
+                             int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
   while (src < src_end) {
     const uint32_t argb = *src++;
     const uint32_t argb = *src++;
@@ -464,8 +464,8 @@ void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
   }
   }
 }
 }
 
 
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                 int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
   while (src < src_end) {
     const uint32_t argb = *src++;
     const uint32_t argb = *src++;
@@ -481,8 +481,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
   }
   }
 }
 }
 
 
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                               int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
+                               int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
   while (src < src_end) {
     const uint32_t argb = *src++;
     const uint32_t argb = *src++;
@@ -498,8 +498,8 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
   }
   }
 }
 }
 
 
-void VP8LConvertBGRAToBGR_C(const uint32_t* src,
-                            int num_pixels, uint8_t* dst) {
+void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src,
+                            int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const src_end = src + num_pixels;
   const uint32_t* const src_end = src + num_pixels;
   while (src < src_end) {
   while (src < src_end) {
     const uint32_t argb = *src++;
     const uint32_t argb = *src++;
@@ -509,8 +509,8 @@ void VP8LConvertBGRAToBGR_C(const uint32_t* src,
   }
   }
 }
 }
 
 
-static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
-                       int swap_on_big_endian) {
+static void CopyOrSwap(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                       uint8_t* WEBP_RESTRICT dst, int swap_on_big_endian) {
   if (is_big_endian() == swap_on_big_endian) {
   if (is_big_endian() == swap_on_big_endian) {
     const uint32_t* const src_end = src + num_pixels;
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
     while (src < src_end) {

+ 57 - 46
thirdparty/libwebp/src/dsp/lossless.h

@@ -18,6 +18,7 @@
 #include "src/webp/types.h"
 #include "src/webp/types.h"
 #include "src/webp/decode.h"
 #include "src/webp/decode.h"
 
 
+#include "src/dsp/dsp.h"
 #include "src/enc/histogram_enc.h"
 #include "src/enc/histogram_enc.h"
 #include "src/utils/utils.h"
 #include "src/utils/utils.h"
 
 
@@ -32,10 +33,6 @@ typedef uint32_t (*VP8LPredictorFunc)(const uint32_t* const left,
                                       const uint32_t* const top);
                                       const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
 extern VP8LPredictorFunc VP8LPredictors[16];
 
 
-uint32_t VP8LPredictor0_C(const uint32_t* const left,
-                          const uint32_t* const top);
-uint32_t VP8LPredictor1_C(const uint32_t* const left,
-                          const uint32_t* const top);
 uint32_t VP8LPredictor2_C(const uint32_t* const left,
 uint32_t VP8LPredictor2_C(const uint32_t* const left,
                           const uint32_t* const top);
                           const uint32_t* const top);
 uint32_t VP8LPredictor3_C(const uint32_t* const left,
 uint32_t VP8LPredictor3_C(const uint32_t* const left,
@@ -64,7 +61,7 @@ uint32_t VP8LPredictor13_C(const uint32_t* const left,
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                         const uint32_t* upper, int num_pixels,
                                         const uint32_t* upper, int num_pixels,
-                                        uint32_t* out);
+                                        uint32_t* WEBP_RESTRICT out);
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
 
 
@@ -95,8 +92,8 @@ void VP8LInverseTransform(const struct VP8LTransform* const transform,
                           const uint32_t* const in, uint32_t* const out);
                           const uint32_t* const in, uint32_t* const out);
 
 
 // Color space conversion.
 // Color space conversion.
-typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
-                                uint8_t* dst);
+typedef void (*VP8LConvertFunc)(const uint32_t* WEBP_RESTRICT src,
+                                int num_pixels, uint8_t* WEBP_RESTRICT dst);
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
@@ -131,13 +128,16 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                  const uint32_t* src, int num_pixels,
                                  const uint32_t* src, int num_pixels,
                                  uint32_t* dst);
                                  uint32_t* dst);
 
 
-void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                 int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                               int num_pixels, uint8_t* dst);
-void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                            uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                             uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* WEBP_RESTRICT src,
+                                 int num_pixels, uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* WEBP_RESTRICT src,
+                               int num_pixels, uint8_t* WEBP_RESTRICT dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* WEBP_RESTRICT src, int num_pixels,
+                            uint8_t* WEBP_RESTRICT dst);
 void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
 void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
                                 uint32_t* dst);
                                 uint32_t* dst);
 
 
@@ -149,32 +149,35 @@ void VP8LDspInit(void);
 
 
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* dst, int num_pixels);
+typedef void (*VP8LTransformColorFunc)(
+    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT dst,
+    int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
     int tile_width, int tile_height,
     int tile_width, int tile_height,
-    int green_to_blue, int red_to_blue, int histo[]);
+    int green_to_blue, int red_to_blue, uint32_t histo[]);
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 
 
 typedef void (*VP8LCollectColorRedTransformsFunc)(
 typedef void (*VP8LCollectColorRedTransformsFunc)(
-    const uint32_t* argb, int stride,
+    const uint32_t* WEBP_RESTRICT argb, int stride,
     int tile_width, int tile_height,
     int tile_width, int tile_height,
-    int green_to_red, int histo[]);
+    int green_to_red, uint32_t histo[]);
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 
 
 // Expose some C-only fallback functions
 // Expose some C-only fallback functions
-void VP8LTransformColor_C(const VP8LMultipliers* const m,
-                          uint32_t* data, int num_pixels);
+void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
+                          uint32_t* WEBP_RESTRICT data, int num_pixels);
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
-void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                     int stride,
                                      int tile_width, int tile_height,
                                      int tile_width, int tile_height,
-                                     int green_to_red, int histo[]);
-void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+                                     int green_to_red, uint32_t histo[]);
+void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                      int stride,
                                       int tile_width, int tile_height,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
                                       int green_to_blue, int red_to_blue,
-                                      int histo[]);
+                                      uint32_t histo[]);
 
 
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
@@ -183,14 +186,17 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
 // Huffman-cost related functions.
 // Huffman-cost related functions.
 
 
 typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
 typedef uint32_t (*VP8LCostFunc)(const uint32_t* population, int length);
-typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+typedef uint32_t (*VP8LCostCombinedFunc)(const uint32_t* WEBP_RESTRICT X,
+                                         const uint32_t* WEBP_RESTRICT Y,
                                          int length);
                                          int length);
-typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
-                                                const int Y[256]);
+typedef uint64_t (*VP8LCombinedShannonEntropyFunc)(const uint32_t X[256],
+                                                   const uint32_t Y[256]);
+typedef uint64_t (*VP8LShannonEntropyFunc)(const uint32_t* X, int length);
 
 
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
 extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
 extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+extern VP8LShannonEntropyFunc VP8LShannonEntropy;
 
 
 typedef struct {        // small struct to hold counters
 typedef struct {        // small struct to hold counters
   int counts[2];        // index: 0=zero streak, 1=non-zero streak
   int counts[2];        // index: 0=zero streak, 1=non-zero streak
@@ -198,7 +204,7 @@ typedef struct {        // small struct to hold counters
 } VP8LStreaks;
 } VP8LStreaks;
 
 
 typedef struct {            // small struct to hold bit entropy results
 typedef struct {            // small struct to hold bit entropy results
-  float entropy;            // entropy
+  uint64_t entropy;         // entropy
   uint32_t sum;             // sum of the population
   uint32_t sum;             // sum of the population
   int nonzeros;             // number of non-zero elements in the population
   int nonzeros;             // number of non-zero elements in the population
   uint32_t max_val;         // maximum value in the population
   uint32_t max_val;         // maximum value in the population
@@ -212,26 +218,30 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
 // codec specific heuristics.
 // codec specific heuristics.
 typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
 typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
     const uint32_t X[], const uint32_t Y[], int length,
     const uint32_t X[], const uint32_t Y[], int length,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 
 
 // Get the entropy for the distribution 'X'.
 // Get the entropy for the distribution 'X'.
-typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
-                                            VP8LBitEntropy* const bit_entropy,
-                                            VP8LStreaks* const stats);
+typedef void (*VP8LGetEntropyUnrefinedFunc)(
+    const uint32_t X[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats);
 extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 
 
-void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
-                              VP8LBitEntropy* const entropy);
+void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
+                              VP8LBitEntropy* WEBP_RESTRICT const entropy);
 
 
-typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b,
-                                  uint32_t* out, int size);
+typedef void (*VP8LAddVectorFunc)(const uint32_t* WEBP_RESTRICT a,
+                                  const uint32_t* WEBP_RESTRICT b,
+                                  uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorFunc VP8LAddVector;
 extern VP8LAddVectorFunc VP8LAddVector;
-typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size);
+typedef void (*VP8LAddVectorEqFunc)(const uint32_t* WEBP_RESTRICT a,
+                                    uint32_t* WEBP_RESTRICT out, int size);
 extern VP8LAddVectorEqFunc VP8LAddVectorEq;
 extern VP8LAddVectorEqFunc VP8LAddVectorEq;
-void VP8LHistogramAdd(const VP8LHistogram* const a,
-                      const VP8LHistogram* const b,
-                      VP8LHistogram* const out);
+void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
+                      const VP8LHistogram* WEBP_RESTRICT const b,
+                      VP8LHistogram* WEBP_RESTRICT const out);
 
 
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 // PrefixEncode()
@@ -241,11 +251,12 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
 // Returns the first index where array1 and array2 are different.
 // Returns the first index where array1 and array2 are different.
 extern VP8LVectorMismatchFunc VP8LVectorMismatch;
 extern VP8LVectorMismatchFunc VP8LVectorMismatch;
 
 
-typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
-                                       int xbits, uint32_t* dst);
+typedef void (*VP8LBundleColorMapFunc)(const uint8_t* WEBP_RESTRICT const row,
+                                       int width, int xbits,
+                                       uint32_t* WEBP_RESTRICT dst);
 extern VP8LBundleColorMapFunc VP8LBundleColorMap;
 extern VP8LBundleColorMapFunc VP8LBundleColorMap;
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
-                          uint32_t* dst);
+void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
+                          int width, int xbits, uint32_t* WEBP_RESTRICT dst);
 
 
 // Must be called before calling any of the above methods.
 // Must be called before calling any of the above methods.
 void VP8LEncDspInit(void);
 void VP8LEncDspInit(void);

+ 36 - 15
thirdparty/libwebp/src/dsp/lossless_common.h

@@ -73,23 +73,44 @@ static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
 // Keeping a high threshold for now.
 // Keeping a high threshold for now.
 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 #define APPROX_LOG_WITH_CORRECTION_MAX  65536
 #define APPROX_LOG_MAX                   4096
 #define APPROX_LOG_MAX                   4096
+// VP8LFastLog2 and VP8LFastSLog2 are used on elements from image histograms.
+// The histogram values cannot exceed the maximum number of pixels, which
+// is (1 << 14) * (1 << 14). Therefore S * log(S) < (1 << 33).
+// No more than 32 bits of precision should be chosen.
+// To match the original float implementation, 23 bits of precision are used.
+#define LOG_2_PRECISION_BITS 23
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+// LOG_2_RECIPROCAL * (1 << LOG_2_PRECISION_BITS)
+#define LOG_2_RECIPROCAL_FIXED_DOUBLE 12102203.161561485379934310913085937500
+#define LOG_2_RECIPROCAL_FIXED ((uint64_t)12102203)
 #define LOG_LOOKUP_IDX_MAX 256
 #define LOG_LOOKUP_IDX_MAX 256
-extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
-extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+extern const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef uint32_t (*VP8LFastLog2SlowFunc)(uint32_t v);
+typedef uint64_t (*VP8LFastSLog2SlowFunc)(uint32_t v);
 
 
 extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
 extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+extern VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
 
 
-static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+static WEBP_INLINE uint32_t VP8LFastLog2(uint32_t v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
   return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
 }
 }
 // Fast calculation of v * log2(v) for integer input.
 // Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+static WEBP_INLINE uint64_t VP8LFastSLog2(uint32_t v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
   return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
 }
 }
 
 
+static WEBP_INLINE uint64_t RightShiftRound(uint64_t v, uint32_t shift) {
+  return (v + (1ull << shift >> 1)) >> shift;
+}
+
+static WEBP_INLINE int64_t DivRound(int64_t a, int64_t b) {
+  return ((a < 0) == (b < 0)) ? ((a + b / 2) / b) : ((a - b / 2) / b);
+}
+
+#define WEBP_INT64_MAX ((int64_t)((1ull << 63) - 1))
+#define WEBP_UINT64_MAX (~0ull)
+
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 // PrefixEncode()
 
 
@@ -173,15 +194,15 @@ uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
 
 
 // The predictor is added to the output pixel (which
 // The predictor is added to the output pixel (which
 // is therefore considered as a residual) to get the final prediction.
 // is therefore considered as a residual) to get the final prediction.
-#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD)             \
-static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
-                          int num_pixels, uint32_t* out) {           \
-  int x;                                                             \
-  assert(upper != NULL);                                             \
-  for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);       \
-    out[x] = VP8LAddPixels(in[x], pred);                             \
-  }                                                                  \
+#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD)                 \
+static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper,     \
+                          int num_pixels, uint32_t* WEBP_RESTRICT out) { \
+  int x;                                                                 \
+  assert(upper != NULL);                                                 \
+  for (x = 0; x < num_pixels; ++x) {                                     \
+    const uint32_t pred = (PREDICTOR)(&out[x - 1], upper + x);           \
+    out[x] = VP8LAddPixels(in[x], pred);                                 \
+  }                                                                      \
 }
 }
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 207 - 265
thirdparty/libwebp/src/dsp/lossless_enc.c

@@ -24,203 +24,123 @@
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/yuv.h"
 #include "src/dsp/yuv.h"
 
 
-// lookup table for small values of log2(int)
-const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
-  0.0000000000000000f, 0.0000000000000000f,
-  1.0000000000000000f, 1.5849625007211560f,
-  2.0000000000000000f, 2.3219280948873621f,
-  2.5849625007211560f, 2.8073549220576041f,
-  3.0000000000000000f, 3.1699250014423121f,
-  3.3219280948873621f, 3.4594316186372973f,
-  3.5849625007211560f, 3.7004397181410921f,
-  3.8073549220576041f, 3.9068905956085187f,
-  4.0000000000000000f, 4.0874628412503390f,
-  4.1699250014423121f, 4.2479275134435852f,
-  4.3219280948873626f, 4.3923174227787606f,
-  4.4594316186372973f, 4.5235619560570130f,
-  4.5849625007211560f, 4.6438561897747243f,
-  4.7004397181410917f, 4.7548875021634682f,
-  4.8073549220576037f, 4.8579809951275718f,
-  4.9068905956085187f, 4.9541963103868749f,
-  5.0000000000000000f, 5.0443941193584533f,
-  5.0874628412503390f, 5.1292830169449663f,
-  5.1699250014423121f, 5.2094533656289501f,
-  5.2479275134435852f, 5.2854022188622487f,
-  5.3219280948873626f, 5.3575520046180837f,
-  5.3923174227787606f, 5.4262647547020979f,
-  5.4594316186372973f, 5.4918530963296747f,
-  5.5235619560570130f, 5.5545888516776376f,
-  5.5849625007211560f, 5.6147098441152083f,
-  5.6438561897747243f, 5.6724253419714951f,
-  5.7004397181410917f, 5.7279204545631987f,
-  5.7548875021634682f, 5.7813597135246599f,
-  5.8073549220576037f, 5.8328900141647412f,
-  5.8579809951275718f, 5.8826430493618415f,
-  5.9068905956085187f, 5.9307373375628866f,
-  5.9541963103868749f, 5.9772799234999167f,
-  6.0000000000000000f, 6.0223678130284543f,
-  6.0443941193584533f, 6.0660891904577720f,
-  6.0874628412503390f, 6.1085244567781691f,
-  6.1292830169449663f, 6.1497471195046822f,
-  6.1699250014423121f, 6.1898245588800175f,
-  6.2094533656289501f, 6.2288186904958804f,
-  6.2479275134435852f, 6.2667865406949010f,
-  6.2854022188622487f, 6.3037807481771030f,
-  6.3219280948873626f, 6.3398500028846243f,
-  6.3575520046180837f, 6.3750394313469245f,
-  6.3923174227787606f, 6.4093909361377017f,
-  6.4262647547020979f, 6.4429434958487279f,
-  6.4594316186372973f, 6.4757334309663976f,
-  6.4918530963296747f, 6.5077946401986963f,
-  6.5235619560570130f, 6.5391588111080309f,
-  6.5545888516776376f, 6.5698556083309478f,
-  6.5849625007211560f, 6.5999128421871278f,
-  6.6147098441152083f, 6.6293566200796094f,
-  6.6438561897747243f, 6.6582114827517946f,
-  6.6724253419714951f, 6.6865005271832185f,
-  6.7004397181410917f, 6.7142455176661224f,
-  6.7279204545631987f, 6.7414669864011464f,
-  6.7548875021634682f, 6.7681843247769259f,
-  6.7813597135246599f, 6.7944158663501061f,
-  6.8073549220576037f, 6.8201789624151878f,
-  6.8328900141647412f, 6.8454900509443747f,
-  6.8579809951275718f, 6.8703647195834047f,
-  6.8826430493618415f, 6.8948177633079437f,
-  6.9068905956085187f, 6.9188632372745946f,
-  6.9307373375628866f, 6.9425145053392398f,
-  6.9541963103868749f, 6.9657842846620869f,
-  6.9772799234999167f, 6.9886846867721654f,
-  7.0000000000000000f, 7.0112272554232539f,
-  7.0223678130284543f, 7.0334230015374501f,
-  7.0443941193584533f, 7.0552824355011898f,
-  7.0660891904577720f, 7.0768155970508308f,
-  7.0874628412503390f, 7.0980320829605263f,
-  7.1085244567781691f, 7.1189410727235076f,
-  7.1292830169449663f, 7.1395513523987936f,
-  7.1497471195046822f, 7.1598713367783890f,
-  7.1699250014423121f, 7.1799090900149344f,
-  7.1898245588800175f, 7.1996723448363644f,
-  7.2094533656289501f, 7.2191685204621611f,
-  7.2288186904958804f, 7.2384047393250785f,
-  7.2479275134435852f, 7.2573878426926521f,
-  7.2667865406949010f, 7.2761244052742375f,
-  7.2854022188622487f, 7.2946207488916270f,
-  7.3037807481771030f, 7.3128829552843557f,
-  7.3219280948873626f, 7.3309168781146167f,
-  7.3398500028846243f, 7.3487281542310771f,
-  7.3575520046180837f, 7.3663222142458160f,
-  7.3750394313469245f, 7.3837042924740519f,
-  7.3923174227787606f, 7.4008794362821843f,
-  7.4093909361377017f, 7.4178525148858982f,
-  7.4262647547020979f, 7.4346282276367245f,
-  7.4429434958487279f, 7.4512111118323289f,
-  7.4594316186372973f, 7.4676055500829976f,
-  7.4757334309663976f, 7.4838157772642563f,
-  7.4918530963296747f, 7.4998458870832056f,
-  7.5077946401986963f, 7.5156998382840427f,
-  7.5235619560570130f, 7.5313814605163118f,
-  7.5391588111080309f, 7.5468944598876364f,
-  7.5545888516776376f, 7.5622424242210728f,
-  7.5698556083309478f, 7.5774288280357486f,
-  7.5849625007211560f, 7.5924570372680806f,
-  7.5999128421871278f, 7.6073303137496104f,
-  7.6147098441152083f, 7.6220518194563764f,
-  7.6293566200796094f, 7.6366246205436487f,
-  7.6438561897747243f, 7.6510516911789281f,
-  7.6582114827517946f, 7.6653359171851764f,
-  7.6724253419714951f, 7.6794800995054464f,
-  7.6865005271832185f, 7.6934869574993252f,
-  7.7004397181410917f, 7.7073591320808825f,
-  7.7142455176661224f, 7.7210991887071855f,
-  7.7279204545631987f, 7.7347096202258383f,
-  7.7414669864011464f, 7.7481928495894605f,
-  7.7548875021634682f, 7.7615512324444795f,
-  7.7681843247769259f, 7.7747870596011736f,
-  7.7813597135246599f, 7.7879025593914317f,
-  7.7944158663501061f, 7.8008998999203047f,
-  7.8073549220576037f, 7.8137811912170374f,
-  7.8201789624151878f, 7.8265484872909150f,
-  7.8328900141647412f, 7.8392037880969436f,
-  7.8454900509443747f, 7.8517490414160571f,
-  7.8579809951275718f, 7.8641861446542797f,
-  7.8703647195834047f, 7.8765169465649993f,
-  7.8826430493618415f, 7.8887432488982591f,
-  7.8948177633079437f, 7.9008668079807486f,
-  7.9068905956085187f, 7.9128893362299619f,
-  7.9188632372745946f, 7.9248125036057812f,
-  7.9307373375628866f, 7.9366379390025709f,
-  7.9425145053392398f, 7.9483672315846778f,
-  7.9541963103868749f, 7.9600019320680805f,
-  7.9657842846620869f, 7.9715435539507719f,
-  7.9772799234999167f, 7.9829935746943103f,
-  7.9886846867721654f, 7.9943534368588577f
+// lookup table for small values of log2(int) * (1 << LOG_2_PRECISION_BITS).
+// Obtained in Python with:
+// a = [ str(round((1<<23)*math.log2(i))) if i else "0" for i in range(256)]
+// print(',\n'.join(['  '+','.join(v)
+//       for v in batched([i.rjust(9) for i in a],7)]))
+const uint32_t kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+         0,        0,  8388608, 13295629, 16777216, 19477745, 21684237,
+  23549800, 25165824, 26591258, 27866353, 29019816, 30072845, 31041538,
+  31938408, 32773374, 33554432, 34288123, 34979866, 35634199, 36254961,
+  36845429, 37408424, 37946388, 38461453, 38955489, 39430146, 39886887,
+  40327016, 40751698, 41161982, 41558811, 41943040, 42315445, 42676731,
+  43027545, 43368474, 43700062, 44022807, 44337167, 44643569, 44942404,
+  45234037, 45518808, 45797032, 46069003, 46334996, 46595268, 46850061,
+  47099600, 47344097, 47583753, 47818754, 48049279, 48275495, 48497560,
+  48715624, 48929828, 49140306, 49347187, 49550590, 49750631, 49947419,
+  50141058, 50331648, 50519283, 50704053, 50886044, 51065339, 51242017,
+  51416153, 51587818, 51757082, 51924012, 52088670, 52251118, 52411415,
+  52569616, 52725775, 52879946, 53032177, 53182516, 53331012, 53477707,
+  53622645, 53765868, 53907416, 54047327, 54185640, 54322389, 54457611,
+  54591338, 54723604, 54854440, 54983876, 55111943, 55238669, 55364082,
+  55488208, 55611074, 55732705, 55853126, 55972361, 56090432, 56207362,
+  56323174, 56437887, 56551524, 56664103, 56775645, 56886168, 56995691,
+  57104232, 57211808, 57318436, 57424133, 57528914, 57632796, 57735795,
+  57837923, 57939198, 58039632, 58139239, 58238033, 58336027, 58433234,
+  58529666, 58625336, 58720256, 58814437, 58907891, 59000628, 59092661,
+  59183999, 59274652, 59364632, 59453947, 59542609, 59630625, 59718006,
+  59804761, 59890898, 59976426, 60061354, 60145690, 60229443, 60312620,
+  60395229, 60477278, 60558775, 60639726, 60720140, 60800023, 60879382,
+  60958224, 61036555, 61114383, 61191714, 61268554, 61344908, 61420785,
+  61496188, 61571124, 61645600, 61719620, 61793189, 61866315, 61939001,
+  62011253, 62083076, 62154476, 62225457, 62296024, 62366182, 62435935,
+  62505289, 62574248, 62642816, 62710997, 62778797, 62846219, 62913267,
+  62979946, 63046260, 63112212, 63177807, 63243048, 63307939, 63372484,
+  63436687, 63500551, 63564080, 63627277, 63690146, 63752690, 63814912,
+  63876816, 63938405, 63999682, 64060650, 64121313, 64181673, 64241734,
+  64301498, 64360969, 64420148, 64479040, 64537646, 64595970, 64654014,
+  64711782, 64769274, 64826495, 64883447, 64940132, 64996553, 65052711,
+  65108611, 65164253, 65219641, 65274776, 65329662, 65384299, 65438691,
+  65492840, 65546747, 65600416, 65653847, 65707044, 65760008, 65812741,
+  65865245, 65917522, 65969575, 66021404, 66073013, 66124403, 66175575,
+  66226531, 66277275, 66327806, 66378127, 66428240, 66478146, 66527847,
+  66577345, 66626641, 66675737, 66724635, 66773336, 66821842, 66870154,
+  66918274, 66966204, 67013944, 67061497
 };
 };
 
 
-const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
-  0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
-  8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
-  24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
-  43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
-  64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
-  86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
-  110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
-  134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
-  160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
-  186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
-  212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
-  240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
-  268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
-  296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
-  325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
-  354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
-  384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
-  413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
-  444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
-  474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
-  505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
-  536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
-  568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
-  600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
-  632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
-  664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
-  696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
-  729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
-  762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
-  795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
-  828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
-  862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
-  896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
-  929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
-  963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
-  998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
-  1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
-  1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
-  1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
-  1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
-  1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
-  1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
-  1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
-  1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
-  1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
-  1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
-  1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
-  1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
-  1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
-  1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
-  1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
-  1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
-  1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
-  1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
-  1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
-  1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
-  1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
-  1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
-  1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
-  1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
-  1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
-  1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
-  1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
-  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
+// lookup table for small values of int*log2(int) * (1 << LOG_2_PRECISION_BITS).
+// Obtained in Python with:
+// a=[ "%d"%i if i<(1<<32) else "%dull"%i
+//     for i in [ round((1<<LOG_2_PRECISION_BITS)*math.log2(i)*i) if i
+//     else 0 for i in range(256)]]
+// print(',\n '.join([','.join(v) for v in batched([i.rjust(15)
+//                      for i in a],4)]))
+const uint64_t kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
+               0,              0,       16777216,       39886887,
+        67108864,       97388723,      130105423,      164848600,
+       201326592,      239321324,      278663526,      319217973,
+       360874141,      403539997,      447137711,      491600606,
+       536870912,      582898099,      629637592,      677049776,
+       725099212,      773754010,      822985323,      872766924,
+       923074875,      973887230,     1025183802,     1076945958,
+      1129156447,     1181799249,     1234859451,     1288323135,
+      1342177280,     1396409681,     1451008871,     1505964059,
+      1561265072,     1616902301,     1672866655,     1729149526,
+      1785742744,     1842638548,     1899829557,     1957308741,
+      2015069397,     2073105127,     2131409817,  2189977618ull,
+   2248802933ull,  2307880396ull,  2367204859ull,  2426771383ull,
+   2486575220ull,  2546611805ull,  2606876748ull,  2667365819ull,
+   2728074942ull,  2789000187ull,  2850137762ull,  2911484006ull,
+   2973035382ull,  3034788471ull,  3096739966ull,  3158886666ull,
+   3221225472ull,  3283753383ull,  3346467489ull,  3409364969ull,
+   3472443085ull,  3535699182ull,  3599130679ull,  3662735070ull,
+   3726509920ull,  3790452862ull,  3854561593ull,  3918833872ull,
+   3983267519ull,  4047860410ull,  4112610476ull,  4177515704ull,
+   4242574127ull,  4307783833ull,  4373142952ull,  4438649662ull,
+   4504302186ull,  4570098787ull,  4636037770ull,  4702117480ull,
+   4768336298ull,  4834692645ull,  4901184974ull,  4967811774ull,
+   5034571569ull,  5101462912ull,  5168484389ull,  5235634615ull,
+   5302912235ull,  5370315922ull,  5437844376ull,  5505496324ull,
+   5573270518ull,  5641165737ull,  5709180782ull,  5777314477ull,
+   5845565671ull,  5913933235ull,  5982416059ull,  6051013057ull,
+   6119723161ull,  6188545324ull,  6257478518ull,  6326521733ull,
+   6395673979ull,  6464934282ull,  6534301685ull,  6603775250ull,
+   6673354052ull,  6743037185ull,  6812823756ull,  6882712890ull,
+   6952703725ull,  7022795412ull,  7092987118ull,  7163278025ull,
+   7233667324ull,  7304154222ull,  7374737939ull,  7445417707ull,
+   7516192768ull,  7587062379ull,  7658025806ull,  7729082328ull,
+   7800231234ull,  7871471825ull,  7942803410ull,  8014225311ull,
+   8085736859ull,  8157337394ull,  8229026267ull,  8300802839ull,
+   8372666477ull,  8444616560ull,  8516652476ull,  8588773618ull,
+   8660979393ull,  8733269211ull,  8805642493ull,  8878098667ull,
+   8950637170ull,  9023257446ull,  9095958945ull,  9168741125ull,
+   9241603454ull,  9314545403ull,  9387566451ull,  9460666086ull,
+   9533843800ull,  9607099093ull,  9680431471ull,  9753840445ull,
+   9827325535ull,  9900886263ull,  9974522161ull, 10048232765ull,
+  10122017615ull, 10195876260ull, 10269808253ull, 10343813150ull,
+  10417890516ull, 10492039919ull, 10566260934ull, 10640553138ull,
+  10714916116ull, 10789349456ull, 10863852751ull, 10938425600ull,
+  11013067604ull, 11087778372ull, 11162557513ull, 11237404645ull,
+  11312319387ull, 11387301364ull, 11462350205ull, 11537465541ull,
+  11612647010ull, 11687894253ull, 11763206912ull, 11838584638ull,
+  11914027082ull, 11989533899ull, 12065104750ull, 12140739296ull,
+  12216437206ull, 12292198148ull, 12368021795ull, 12443907826ull,
+  12519855920ull, 12595865759ull, 12671937032ull, 12748069427ull,
+  12824262637ull, 12900516358ull, 12976830290ull, 13053204134ull,
+  13129637595ull, 13206130381ull, 13282682202ull, 13359292772ull,
+  13435961806ull, 13512689025ull, 13589474149ull, 13666316903ull,
+  13743217014ull, 13820174211ull, 13897188225ull, 13974258793ull,
+  14051385649ull, 14128568535ull, 14205807192ull, 14283101363ull,
+  14360450796ull, 14437855239ull, 14515314443ull, 14592828162ull,
+  14670396151ull, 14748018167ull, 14825693972ull, 14903423326ull,
+  14981205995ull, 15059041743ull, 15136930339ull, 15214871554ull,
+  15292865160ull, 15370910930ull, 15449008641ull, 15527158071ull,
+  15605359001ull, 15683611210ull, 15761914485ull, 15840268608ull,
+  15918673369ull, 15997128556ull, 16075633960ull, 16154189373ull,
+  16232794589ull, 16311449405ull, 16390153617ull, 16468907026ull,
+  16547709431ull, 16626560636ull, 16705460444ull, 16784408661ull,
+  16863405094ull, 16942449552ull, 17021541845ull, 17100681785ull
 };
 };
 
 
 const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
 const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
@@ -326,23 +246,19 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
 };
 };
 
 
-static float FastSLog2Slow_C(uint32_t v) {
+static uint64_t FastSLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    const uint64_t orig_v = v;
+    uint64_t correction;
 #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
 #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
     // use clz if available
     // use clz if available
-    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint64_t log_cnt = BitsLog2Floor(v) - 7;
     const uint32_t y = 1 << log_cnt;
     const uint32_t y = 1 << log_cnt;
-    int correction = 0;
-    const float v_f = (float)v;
-    const uint32_t orig_v = v;
     v >>= log_cnt;
     v >>= log_cnt;
 #else
 #else
-    int log_cnt = 0;
+    uint64_t log_cnt = 0;
     uint32_t y = 1;
     uint32_t y = 1;
-    int correction = 0;
-    const float v_f = (float)v;
-    const uint32_t orig_v = v;
     do {
     do {
       ++log_cnt;
       ++log_cnt;
       v = v >> 1;
       v = v >> 1;
@@ -354,45 +270,43 @@ static float FastSLog2Slow_C(uint32_t v) {
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
     // The correction factor: log(1 + d) ~ d; for very small d values, so
     // The correction factor: log(1 + d) ~ d; for very small d values, so
     // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
     // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
-    // LOG_2_RECIPROCAL ~ 23/16
-    correction = (23 * (orig_v & (y - 1))) >> 4;
-    return v_f * (kLog2Table[v] + log_cnt) + correction;
+    correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
+    return orig_v * (kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS)) +
+           correction;
   } else {
   } else {
-    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+    return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
   }
   }
 }
 }
 
 
-static float FastLog2Slow_C(uint32_t v) {
+static uint32_t FastLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+    const uint32_t orig_v = v;
+    uint32_t log_2;
 #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
 #if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
     // use clz if available
     // use clz if available
-    const int log_cnt = BitsLog2Floor(v) - 7;
+    const uint32_t log_cnt = BitsLog2Floor(v) - 7;
     const uint32_t y = 1 << log_cnt;
     const uint32_t y = 1 << log_cnt;
-    const uint32_t orig_v = v;
-    double log_2;
     v >>= log_cnt;
     v >>= log_cnt;
 #else
 #else
-    int log_cnt = 0;
+    uint32_t log_cnt = 0;
     uint32_t y = 1;
     uint32_t y = 1;
-    const uint32_t orig_v = v;
-    double log_2;
     do {
     do {
       ++log_cnt;
       ++log_cnt;
       v = v >> 1;
       v = v >> 1;
       y = y << 1;
       y = y << 1;
     } while (v >= LOG_LOOKUP_IDX_MAX);
     } while (v >= LOG_LOOKUP_IDX_MAX);
 #endif
 #endif
-    log_2 = kLog2Table[v] + log_cnt;
+    log_2 = kLog2Table[v] + (log_cnt << LOG_2_PRECISION_BITS);
     if (orig_v >= APPROX_LOG_MAX) {
     if (orig_v >= APPROX_LOG_MAX) {
       // Since the division is still expensive, add this correction factor only
       // Since the division is still expensive, add this correction factor only
       // for large values of 'v'.
       // for large values of 'v'.
-      const int correction = (23 * (orig_v & (y - 1))) >> 4;
-      log_2 += (double)correction / orig_v;
+      const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (orig_v & (y - 1));
+      log_2 += (uint32_t)DivRound(correction, orig_v);
     }
     }
-    return (float)log_2;
+    return log_2;
   } else {
   } else {
-    return (float)(LOG_2_RECIPROCAL * log((double)v));
+    return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
   }
   }
 }
 }
 
 
@@ -400,37 +314,53 @@ static float FastLog2Slow_C(uint32_t v) {
 // Methods to calculate Entropy (Shannon).
 // Methods to calculate Entropy (Shannon).
 
 
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
+static uint64_t CombinedShannonEntropy_C(const uint32_t X[256],
+                                         const uint32_t Y[256]) {
   int i;
   int i;
-  float retval = 0.f;
-  int sumX = 0, sumXY = 0;
+  uint64_t retval = 0;
+  uint32_t sumX = 0, sumXY = 0;
   for (i = 0; i < 256; ++i) {
   for (i = 0; i < 256; ++i) {
-    const int x = X[i];
+    const uint32_t x = X[i];
     if (x != 0) {
     if (x != 0) {
-      const int xy = x + Y[i];
+      const uint32_t xy = x + Y[i];
       sumX += x;
       sumX += x;
-      retval -= VP8LFastSLog2(x);
+      retval += VP8LFastSLog2(x);
       sumXY += xy;
       sumXY += xy;
-      retval -= VP8LFastSLog2(xy);
+      retval += VP8LFastSLog2(xy);
     } else if (Y[i] != 0) {
     } else if (Y[i] != 0) {
       sumXY += Y[i];
       sumXY += Y[i];
-      retval -= VP8LFastSLog2(Y[i]);
+      retval += VP8LFastSLog2(Y[i]);
+    }
+  }
+  retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
+  return retval;
+}
+
+static uint64_t ShannonEntropy_C(const uint32_t* X, int n) {
+  int i;
+  uint64_t retval = 0;
+  uint32_t sumX = 0;
+  for (i = 0; i < n; ++i) {
+    const int x = X[i];
+    if (x != 0) {
+      sumX += x;
+      retval += VP8LFastSLog2(x);
     }
     }
   }
   }
-  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  retval = VP8LFastSLog2(sumX) - retval;
   return retval;
   return retval;
 }
 }
 
 
 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
 void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
-  entropy->entropy = 0.;
+  entropy->entropy = 0;
   entropy->sum = 0;
   entropy->sum = 0;
   entropy->nonzeros = 0;
   entropy->nonzeros = 0;
   entropy->max_val = 0;
   entropy->max_val = 0;
   entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
   entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
 }
 }
 
 
-void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
-                              VP8LBitEntropy* const entropy) {
+void VP8LBitsEntropyUnrefined(const uint32_t* WEBP_RESTRICT const array, int n,
+                              VP8LBitEntropy* WEBP_RESTRICT const entropy) {
   int i;
   int i;
 
 
   VP8LBitEntropyInit(entropy);
   VP8LBitEntropyInit(entropy);
@@ -440,18 +370,20 @@ void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
       entropy->sum += array[i];
       entropy->sum += array[i];
       entropy->nonzero_code = i;
       entropy->nonzero_code = i;
       ++entropy->nonzeros;
       ++entropy->nonzeros;
-      entropy->entropy -= VP8LFastSLog2(array[i]);
+      entropy->entropy += VP8LFastSLog2(array[i]);
       if (entropy->max_val < array[i]) {
       if (entropy->max_val < array[i]) {
         entropy->max_val = array[i];
         entropy->max_val = array[i];
       }
       }
     }
     }
   }
   }
-  entropy->entropy += VP8LFastSLog2(entropy->sum);
+  entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
 }
 }
 
 
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
-    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+    uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
+    int* WEBP_RESTRICT const i_prev,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   const int streak = i - *i_prev;
   const int streak = i - *i_prev;
 
 
   // Gather info for the bit entropy.
   // Gather info for the bit entropy.
@@ -459,7 +391,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
     bit_entropy->sum += (*val_prev) * streak;
     bit_entropy->sum += (*val_prev) * streak;
     bit_entropy->nonzeros += streak;
     bit_entropy->nonzeros += streak;
     bit_entropy->nonzero_code = *i_prev;
     bit_entropy->nonzero_code = *i_prev;
-    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+    bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
     if (bit_entropy->max_val < *val_prev) {
     if (bit_entropy->max_val < *val_prev) {
       bit_entropy->max_val = *val_prev;
       bit_entropy->max_val = *val_prev;
     }
     }
@@ -473,9 +405,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
   *i_prev = i;
 }
 }
 
 
-static void GetEntropyUnrefined_C(const uint32_t X[], int length,
-                                  VP8LBitEntropy* const bit_entropy,
-                                  VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_C(
+    const uint32_t X[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   int i;
   int i;
   int i_prev = 0;
   int i_prev = 0;
   uint32_t x_prev = X[0];
   uint32_t x_prev = X[0];
@@ -491,14 +424,13 @@ static void GetEntropyUnrefined_C(const uint32_t X[], int length,
   }
   }
   GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
   GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
 
 
-  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
 }
 }
 
 
-static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
-                                          const uint32_t Y[],
-                                          int length,
-                                          VP8LBitEntropy* const bit_entropy,
-                                          VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_C(
+    const uint32_t X[], const uint32_t Y[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   int i = 1;
   int i = 1;
   int i_prev = 0;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
   uint32_t xy_prev = X[0] + Y[0];
@@ -514,7 +446,7 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
   }
   }
   GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
   GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
 
 
-  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
@@ -538,8 +470,8 @@ static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
   return (int8_t)(v & 0xff);
   return (int8_t)(v & 0xff);
 }
 }
 
 
-void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
-                          int num_pixels) {
+void VP8LTransformColor_C(const VP8LMultipliers* WEBP_RESTRICT const m,
+                          uint32_t* WEBP_RESTRICT data, int num_pixels) {
   int i;
   int i;
   for (i = 0; i < num_pixels; ++i) {
   for (i = 0; i < num_pixels; ++i) {
     const uint32_t argb = data[i];
     const uint32_t argb = data[i];
@@ -575,9 +507,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
   return (new_blue & 0xff);
 }
 }
 
 
-void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorRedTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                     int stride,
                                      int tile_width, int tile_height,
                                      int tile_width, int tile_height,
-                                     int green_to_red, int histo[]) {
+                                     int green_to_red, uint32_t histo[]) {
   while (tile_height-- > 0) {
   while (tile_height-- > 0) {
     int x;
     int x;
     for (x = 0; x < tile_width; ++x) {
     for (x = 0; x < tile_width; ++x) {
@@ -587,10 +520,11 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
   }
   }
 }
 }
 
 
-void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+void VP8LCollectColorBlueTransforms_C(const uint32_t* WEBP_RESTRICT argb,
+                                      int stride,
                                       int tile_width, int tile_height,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
                                       int green_to_blue, int red_to_blue,
-                                      int histo[]) {
+                                      uint32_t histo[]) {
   while (tile_height-- > 0) {
   while (tile_height-- > 0) {
     int x;
     int x;
     for (x = 0; x < tile_width; ++x) {
     for (x = 0; x < tile_width; ++x) {
@@ -614,8 +548,8 @@ static int VectorMismatch_C(const uint32_t* const array1,
 }
 }
 
 
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
-                          uint32_t* dst) {
+void VP8LBundleColorMap_C(const uint8_t* WEBP_RESTRICT const row,
+                          int width, int xbits, uint32_t* WEBP_RESTRICT dst) {
   int x;
   int x;
   if (xbits > 0) {
   if (xbits > 0) {
     const int bit_depth = 1 << (3 - xbits);
     const int bit_depth = 1 << (3 - xbits);
@@ -646,7 +580,8 @@ static uint32_t ExtraCost_C(const uint32_t* population, int length) {
   return cost;
   return cost;
 }
 }
 
 
-static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+static uint32_t ExtraCostCombined_C(const uint32_t* WEBP_RESTRICT X,
+                                    const uint32_t* WEBP_RESTRICT Y,
                                     int length) {
                                     int length) {
   int i;
   int i;
   uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
   uint32_t cost = X[4] + Y[4] + X[5] + Y[5];
@@ -661,13 +596,15 @@ static uint32_t ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 
-static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
-                        int size) {
+static void AddVector_C(const uint32_t* WEBP_RESTRICT a,
+                        const uint32_t* WEBP_RESTRICT b,
+                        uint32_t* WEBP_RESTRICT out, int size) {
   int i;
   int i;
   for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
   for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
 }
 }
 
 
-static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_C(const uint32_t* WEBP_RESTRICT a,
+                          uint32_t* WEBP_RESTRICT out, int size) {
   int i;
   int i;
   for (i = 0; i < size; ++i) out[i] += a[i];
   for (i = 0; i < size; ++i) out[i] += a[i];
 }
 }
@@ -696,8 +633,9 @@ static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
   }                                                                            \
   }                                                                            \
 } while (0)
 } while (0)
 
 
-void VP8LHistogramAdd(const VP8LHistogram* const a,
-                      const VP8LHistogram* const b, VP8LHistogram* const out) {
+void VP8LHistogramAdd(const VP8LHistogram* WEBP_RESTRICT const a,
+                      const VP8LHistogram* WEBP_RESTRICT const b,
+                      VP8LHistogram* WEBP_RESTRICT const out) {
   int i;
   int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
@@ -727,14 +665,14 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
 // Image transforms.
 // Image transforms.
 
 
 static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
   for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
   (void)upper;
   (void)upper;
 }
 }
 
 
 static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
-                            int num_pixels, uint32_t* out) {
+                            int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
   for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
   (void)upper;
   (void)upper;
@@ -745,7 +683,8 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
 #define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
 static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
 static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
                                           const uint32_t* upper,           \
                                           const uint32_t* upper,           \
-                                          int num_pixels, uint32_t* out) { \
+                                          int num_pixels,                  \
+                                          uint32_t* WEBP_RESTRICT out) {   \
   int x;                                                                   \
   int x;                                                                   \
   assert(upper != NULL);                                                   \
   assert(upper != NULL);                                                   \
   for (x = 0; x < num_pixels; ++x) {                                       \
   for (x = 0; x < num_pixels; ++x) {                                       \
@@ -778,11 +717,12 @@ VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 
 
 VP8LFastLog2SlowFunc VP8LFastLog2Slow;
 VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+VP8LFastSLog2SlowFunc VP8LFastSLog2Slow;
 
 
 VP8LCostFunc VP8LExtraCost;
 VP8LCostFunc VP8LExtraCost;
 VP8LCostCombinedFunc VP8LExtraCostCombined;
 VP8LCostCombinedFunc VP8LExtraCostCombined;
 VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
 VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+VP8LShannonEntropyFunc VP8LShannonEntropy;
 
 
 VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
 VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
 VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
@@ -822,6 +762,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
   VP8LExtraCost = ExtraCost_C;
   VP8LExtraCost = ExtraCost_C;
   VP8LExtraCostCombined = ExtraCostCombined_C;
   VP8LExtraCostCombined = ExtraCostCombined_C;
   VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
   VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
+  VP8LShannonEntropy = ShannonEntropy_C;
 
 
   VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
   VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
   VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
   VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
@@ -911,6 +852,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
   assert(VP8LExtraCost != NULL);
   assert(VP8LExtraCost != NULL);
   assert(VP8LExtraCostCombined != NULL);
   assert(VP8LExtraCostCombined != NULL);
   assert(VP8LCombinedShannonEntropy != NULL);
   assert(VP8LCombinedShannonEntropy != NULL);
+  assert(VP8LShannonEntropy != NULL);
   assert(VP8LGetEntropyUnrefined != NULL);
   assert(VP8LGetEntropyUnrefined != NULL);
   assert(VP8LGetCombinedEntropyUnrefined != NULL);
   assert(VP8LGetCombinedEntropyUnrefined != NULL);
   assert(VP8LAddVector != NULL);
   assert(VP8LAddVector != NULL);

+ 38 - 33
thirdparty/libwebp/src/dsp/lossless_enc_mips32.c

@@ -23,12 +23,12 @@
 #include <stdlib.h>
 #include <stdlib.h>
 #include <string.h>
 #include <string.h>
 
 
-static float FastSLog2Slow_MIPS32(uint32_t v) {
+static uint64_t FastSLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
-    uint32_t log_cnt, y, correction;
+    uint32_t log_cnt, y;
+    uint64_t correction;
     const int c24 = 24;
     const int c24 = 24;
-    const float v_f = (float)v;
     uint32_t temp;
     uint32_t temp;
 
 
     // Xf = 256 = 2^8
     // Xf = 256 = 2^8
@@ -49,22 +49,23 @@ static float FastSLog2Slow_MIPS32(uint32_t v) {
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
     // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
     // The correction factor: log(1 + d) ~ d; for very small d values, so
     // The correction factor: log(1 + d) ~ d; for very small d values, so
     // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
     // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
-    // LOG_2_RECIPROCAL ~ 23/16
 
 
     // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
     // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
-    correction = (23 * (v & (y - 1))) >> 4;
-    return v_f * (kLog2Table[temp] + log_cnt) + correction;
+    correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
+    return (uint64_t)v * (kLog2Table[temp] +
+                          ((uint64_t)log_cnt << LOG_2_PRECISION_BITS)) +
+           correction;
   } else {
   } else {
-    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+    return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
   }
   }
 }
 }
 
 
-static float FastLog2Slow_MIPS32(uint32_t v) {
+static uint32_t FastLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     uint32_t log_cnt, y;
     uint32_t log_cnt, y;
     const int c24 = 24;
     const int c24 = 24;
-    double log_2;
+    uint32_t log_2;
     uint32_t temp;
     uint32_t temp;
 
 
     __asm__ volatile(
     __asm__ volatile(
@@ -78,17 +79,16 @@ static float FastLog2Slow_MIPS32(uint32_t v) {
       : [c24]"r"(c24), [v]"r"(v)
       : [c24]"r"(c24), [v]"r"(v)
     );
     );
 
 
-    log_2 = kLog2Table[temp] + log_cnt;
+    log_2 = kLog2Table[temp] + (log_cnt << LOG_2_PRECISION_BITS);
     if (v >= APPROX_LOG_MAX) {
     if (v >= APPROX_LOG_MAX) {
       // Since the division is still expensive, add this correction factor only
       // Since the division is still expensive, add this correction factor only
       // for large values of 'v'.
       // for large values of 'v'.
-
-      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
-      log_2 += (double)correction / v;
+      const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
+      log_2 += (uint32_t)DivRound(correction, v);
     }
     }
-    return (float)log_2;
+    return log_2;
   } else {
   } else {
-    return (float)(LOG_2_RECIPROCAL * log((double)v));
+    return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
   }
   }
 }
 }
 
 
@@ -149,8 +149,9 @@ static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
 //     pY += 2;
 //     pY += 2;
 //   }
 //   }
 //   return cost;
 //   return cost;
-static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
-                                         const uint32_t* const Y, int length) {
+static uint32_t ExtraCostCombined_MIPS32(const uint32_t* WEBP_RESTRICT const X,
+                                         const uint32_t* WEBP_RESTRICT const Y,
+                                         int length) {
   int i, temp0, temp1, temp2, temp3;
   int i, temp0, temp1, temp2, temp3;
   const uint32_t* pX = &X[4];
   const uint32_t* pX = &X[4];
   const uint32_t* pY = &Y[4];
   const uint32_t* pY = &Y[4];
@@ -215,8 +216,10 @@ static uint32_t ExtraCostCombined_MIPS32(const uint32_t* const X,
 
 
 // Returns the various RLE counts
 // Returns the various RLE counts
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
 static WEBP_INLINE void GetEntropyUnrefinedHelper(
-    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
-    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+    uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
+    int* WEBP_RESTRICT const i_prev,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   int* const pstreaks = &stats->streaks[0][0];
   int* const pstreaks = &stats->streaks[0][0];
   int* const pcnts = &stats->counts[0];
   int* const pcnts = &stats->counts[0];
   int temp0, temp1, temp2, temp3;
   int temp0, temp1, temp2, temp3;
@@ -227,7 +230,7 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
     bit_entropy->sum += (*val_prev) * streak;
     bit_entropy->sum += (*val_prev) * streak;
     bit_entropy->nonzeros += streak;
     bit_entropy->nonzeros += streak;
     bit_entropy->nonzero_code = *i_prev;
     bit_entropy->nonzero_code = *i_prev;
-    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+    bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
     if (bit_entropy->max_val < *val_prev) {
     if (bit_entropy->max_val < *val_prev) {
       bit_entropy->max_val = *val_prev;
       bit_entropy->max_val = *val_prev;
     }
     }
@@ -241,9 +244,10 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
   *i_prev = i;
 }
 }
 
 
-static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
-                                       VP8LBitEntropy* const bit_entropy,
-                                       VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_MIPS32(
+    const uint32_t X[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   int i;
   int i;
   int i_prev = 0;
   int i_prev = 0;
   uint32_t x_prev = X[0];
   uint32_t x_prev = X[0];
@@ -259,14 +263,13 @@ static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
   }
   }
   GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
   GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
 
 
-  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
 }
 }
 
 
-static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
-                                               const uint32_t Y[],
-                                               int length,
-                                               VP8LBitEntropy* const entropy,
-                                               VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_MIPS32(
+    const uint32_t X[], const uint32_t Y[], int length,
+    VP8LBitEntropy* WEBP_RESTRICT const entropy,
+    VP8LStreaks* WEBP_RESTRICT const stats) {
   int i = 1;
   int i = 1;
   int i_prev = 0;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
   uint32_t xy_prev = X[0] + Y[0];
@@ -282,7 +285,7 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
   }
   }
   GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
   GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
 
 
-  entropy->entropy += VP8LFastSLog2(entropy->sum);
+  entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
 }
 }
 
 
 #define ASM_START                                       \
 #define ASM_START                                       \
@@ -344,8 +347,9 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
     ASM_END_COMMON_0                                    \
     ASM_END_COMMON_0                                    \
     ASM_END_COMMON_1
     ASM_END_COMMON_1
 
 
-static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
-                             uint32_t* pout, int size) {
+static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,
+                             const uint32_t* WEBP_RESTRICT pb,
+                             uint32_t* WEBP_RESTRICT pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const int end = ((size) / 4) * 4;
   const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   const uint32_t* const LoopEnd = pa + end;
@@ -356,7 +360,8 @@ static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
   for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
   for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
 }
 }
 
 
-static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
+static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,
+                               uint32_t* WEBP_RESTRICT pout, int size) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const int end = ((size) / 4) * 4;
   const int end = ((size) / 4) * 4;
   const uint32_t* const LoopEnd = pa + end;
   const uint32_t* const LoopEnd = pa + end;

+ 10 - 15
thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c

@@ -78,8 +78,9 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
   return (uint32_t)((int)(color_pred) * color) >> 5;
   return (uint32_t)((int)(color_pred) * color) >> 5;
 }
 }
 
 
-static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
-                                     uint32_t* data, int num_pixels) {
+static void TransformColor_MIPSdspR2(
+    const VP8LMultipliers* WEBP_RESTRICT const m, uint32_t* WEBP_RESTRICT data,
+    int num_pixels) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red, new_red1;
   uint32_t argb, argb1, new_red, new_red1;
   const uint32_t G_to_R = m->green_to_red_;
   const uint32_t G_to_R = m->green_to_red_;
@@ -171,13 +172,10 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
   return (new_blue & 0xff);
 }
 }
 
 
-static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
-                                                 int stride,
-                                                 int tile_width,
-                                                 int tile_height,
-                                                 int green_to_blue,
-                                                 int red_to_blue,
-                                                 int histo[]) {
+static void CollectColorBlueTransforms_MIPSdspR2(
+    const uint32_t* WEBP_RESTRICT argb, int stride,
+    int tile_width, int tile_height,
+    int green_to_blue, int red_to_blue, uint32_t histo[]) {
   const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
   const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
   const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
   const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
   const uint32_t mask = 0xff00ffu;
   const uint32_t mask = 0xff00ffu;
@@ -225,12 +223,9 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
   return (new_red & 0xff);
   return (new_red & 0xff);
 }
 }
 
 
-static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
-                                                int stride,
-                                                int tile_width,
-                                                int tile_height,
-                                                int green_to_red,
-                                                int histo[]) {
+static void CollectColorRedTransforms_MIPSdspR2(
+    const uint32_t* WEBP_RESTRICT argb, int stride,
+    int tile_width, int tile_height, int green_to_red, uint32_t histo[]) {
   const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
   const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
   while (tile_height-- > 0) {
   while (tile_height-- > 0) {
     int x;
     int x;

+ 2 - 2
thirdparty/libwebp/src/dsp/lossless_enc_msa.c

@@ -48,8 +48,8 @@
   dst = VSHF_UB(src, t0, mask1);                                \
   dst = VSHF_UB(src, t0, mask1);                                \
 } while (0)
 } while (0)
 
 
-static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
-                               int num_pixels) {
+static void TransformColor_MSA(const VP8LMultipliers* WEBP_RESTRICT const m,
+                               uint32_t* WEBP_RESTRICT data, int num_pixels) {
   v16u8 src0, dst0;
   v16u8 src0, dst0;
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                          (m->green_to_red_ << 16));
                                          (m->green_to_red_ << 16));

+ 3 - 2
thirdparty/libwebp/src/dsp/lossless_enc_neon.c

@@ -72,8 +72,9 @@ static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Color Transform
 // Color Transform
 
 
-static void TransformColor_NEON(const VP8LMultipliers* const m,
-                                uint32_t* argb_data, int num_pixels) {
+static void TransformColor_NEON(const VP8LMultipliers* WEBP_RESTRICT const m,
+                                uint32_t* WEBP_RESTRICT argb_data,
+                                int num_pixels) {
   // sign-extended multiplying constants, pre-shifted by 6.
   // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
   const int16_t rb[8] = {

+ 96 - 52
thirdparty/libwebp/src/dsp/lossless_enc_sse2.c

@@ -49,8 +49,9 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
 #define MK_CST_16(HI, LO) \
 #define MK_CST_16(HI, LO) \
   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
 
 
-static void TransformColor_SSE2(const VP8LMultipliers* const m,
-                                uint32_t* argb_data, int num_pixels) {
+static void TransformColor_SSE2(const VP8LMultipliers* WEBP_RESTRICT const m,
+                                uint32_t* WEBP_RESTRICT argb_data,
+                                int num_pixels) {
   const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
   const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
                                      CST_5b(m->green_to_blue_));
                                      CST_5b(m->green_to_blue_));
   const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
   const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
@@ -79,10 +80,11 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m,
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 #define SPAN 8
 #define SPAN 8
-static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+static void CollectColorBlueTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                            int stride,
                                             int tile_width, int tile_height,
                                             int tile_width, int tile_height,
                                             int green_to_blue, int red_to_blue,
                                             int green_to_blue, int red_to_blue,
-                                            int histo[]) {
+                                            uint32_t histo[]) {
   const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
   const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
   const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
   const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
@@ -126,9 +128,10 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
   }
   }
 }
 }
 
 
-static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+static void CollectColorRedTransforms_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                           int stride,
                                            int tile_width, int tile_height,
                                            int tile_width, int tile_height,
-                                           int green_to_red, int histo[]) {
+                                           int green_to_red, uint32_t histo[]) {
   const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
   const __m128i mask = _mm_set1_epi32(0xff);
   const __m128i mask = _mm_set1_epi32(0xff);
@@ -172,75 +175,113 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
 
 
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
 // that's ok since the histogram values are less than 1<<28 (max picture size).
-#define LINE_SIZE 16    // 8 or 16
-static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
-                           int size) {
-  int i;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+static void AddVector_SSE2(const uint32_t* WEBP_RESTRICT a,
+                           const uint32_t* WEBP_RESTRICT b,
+                           uint32_t* WEBP_RESTRICT out, int size) {
+  int i = 0;
+  int aligned_size = size & ~15;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 16);
+  assert(size % 2 == 0);
+
+  do {
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
-#if (LINE_SIZE == 16)
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i +  0]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i +  4]);
-#if (LINE_SIZE == 16)
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i +  8]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
-#endif
     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
+  } while (i != aligned_size);
+
+  if ((size & 8) != 0) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+    i += 8;
   }
   }
-  for (; i < size; ++i) {
-    out[i] = a[i] + b[i];
+
+  size &= 7;
+  if (size == 4) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i]);
+    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
+  } else if (size == 2) {
+    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[i]);
+    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
   }
   }
 }
 }
 
 
-static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
-  int i;
-  for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
+static void AddVectorEq_SSE2(const uint32_t* WEBP_RESTRICT a,
+                             uint32_t* WEBP_RESTRICT out, int size) {
+  int i = 0;
+  int aligned_size = size & ~15;
+  // Size is, at minimum, NUM_DISTANCE_CODES (40) and may be as large as
+  // NUM_LITERAL_CODES (256) + NUM_LENGTH_CODES (24) + (0 or a non-zero power of
+  // 2). See the usage in VP8LHistogramAdd().
+  assert(size >= 16);
+  assert(size % 2 == 0);
+
+  do {
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
     const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i +  0]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i +  4]);
-#if (LINE_SIZE == 16)
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
     const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i +  8]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
     const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
-#endif
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
     const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i +  0]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i +  4]);
-#if (LINE_SIZE == 16)
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
     const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i +  8]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
     const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
-#endif
     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i +  0], _mm_add_epi32(a0, b0));
     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
     _mm_storeu_si128((__m128i*)&out[i +  4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i +  8], _mm_add_epi32(a2, b2));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
     _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
+    i += 16;
+  } while (i != aligned_size);
+
+  if ((size & 8) != 0) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+    const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+    _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+    _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+    i += 8;
   }
   }
-  for (; i < size; ++i) {
-    out[i] += a[i];
+
+  size &= 7;
+  if (size == 4) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i]);
+    _mm_storeu_si128((__m128i*)&out[i], _mm_add_epi32(a0, b0));
+  } else if (size == 2) {
+    const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[i]);
+    const __m128i b0 = _mm_loadl_epi64((const __m128i*)&out[i]);
+    _mm_storel_epi64((__m128i*)&out[i], _mm_add_epi32(a0, b0));
   }
   }
 }
 }
-#undef LINE_SIZE
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Entropy
 // Entropy
 
 
-// TODO(https://crbug.com/webp/499): this function produces different results
-// from the C code due to use of double/float resulting in output differences
-// when compared to -noasm.
-#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
+#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
 
 
-static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
+static uint64_t CombinedShannonEntropy_SSE2(const uint32_t X[256],
+                                            const uint32_t Y[256]) {
   int i;
   int i;
-  float retval = 0.f;
-  int sumX = 0, sumXY = 0;
+  uint64_t retval = 0;
+  uint32_t sumX = 0, sumXY = 0;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
 
 
   for (i = 0; i < 256; i += 16) {
   for (i = 0; i < 256; i += 16) {
@@ -260,19 +301,19 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
     int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
     int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
     while (my) {
     while (my) {
       const int32_t j = BitsCtz(my);
       const int32_t j = BitsCtz(my);
-      int xy;
+      uint32_t xy;
       if ((mx >> j) & 1) {
       if ((mx >> j) & 1) {
         const int x = X[i + j];
         const int x = X[i + j];
         sumXY += x;
         sumXY += x;
-        retval -= VP8LFastSLog2(x);
+        retval += VP8LFastSLog2(x);
       }
       }
       xy = X[i + j] + Y[i + j];
       xy = X[i + j] + Y[i + j];
       sumX += xy;
       sumX += xy;
-      retval -= VP8LFastSLog2(xy);
+      retval += VP8LFastSLog2(xy);
       my &= my - 1;
       my &= my - 1;
     }
     }
   }
   }
-  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+  retval = VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) - retval;
   return retval;
   return retval;
 }
 }
 
 
@@ -335,8 +376,9 @@ static int VectorMismatch_SSE2(const uint32_t* const array1,
 }
 }
 
 
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
-                                uint32_t* dst) {
+static void BundleColorMap_SSE2(const uint8_t* WEBP_RESTRICT const row,
+                                int width, int xbits,
+                                uint32_t* WEBP_RESTRICT dst) {
   int x;
   int x;
   assert(xbits >= 0);
   assert(xbits >= 0);
   assert(xbits <= 3);
   assert(xbits <= 3);
@@ -425,7 +467,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
 
 
 // Predictor0: ARGB_BLACK.
 // Predictor0: ARGB_BLACK.
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -442,7 +484,8 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN)                                         \
 #define GENERATE_PREDICTOR_1(X, IN)                                         \
   static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
   static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
                                      const uint32_t* const upper,           \
                                      const uint32_t* const upper,           \
-                                     int num_pixels, uint32_t* const out) { \
+                                     int num_pixels,                        \
+                                     uint32_t* WEBP_RESTRICT const out) {   \
     int i;                                                                  \
     int i;                                                                  \
     for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
     for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
       const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
       const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
@@ -464,7 +507,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])    // Predictor4: TL
 
 
 // Predictor5: avg2(avg2(L, TR), T)
 // Predictor5: avg2(avg2(L, TR), T)
 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -484,7 +527,8 @@ static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
 
 
 #define GENERATE_PREDICTOR_2(X, A, B)                                         \
 #define GENERATE_PREDICTOR_2(X, A, B)                                         \
 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
 static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
+                                   int num_pixels,                            \
+                                   uint32_t* WEBP_RESTRICT out) {             \
   int i;                                                                      \
   int i;                                                                      \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
     const __m128i tA = _mm_loadu_si128((const __m128i*)&(A));                 \
     const __m128i tA = _mm_loadu_si128((const __m128i*)&(A));                 \
@@ -508,7 +552,7 @@ GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: average(T, TR)
 
 
 // Predictor10: avg(avg(L,TL), avg(T, TR)).
 // Predictor10: avg(avg(L,TL), avg(T, TR)).
 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -543,7 +587,7 @@ static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
 }
 }
 
 
 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
     const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
@@ -569,7 +613,7 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictor12: ClampedSubSubtractFull.
 // Predictor12: ClampedSubSubtractFull.
 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -598,7 +642,7 @@ static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictors13: ClampedAddSubtractHalf
 // Predictors13: ClampedAddSubtractHalf
 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   for (i = 0; i + 2 <= num_pixels; i += 2) {
   for (i = 0; i + 2 <= num_pixels; i += 2) {

+ 10 - 7
thirdparty/libwebp/src/dsp/lossless_enc_sse41.c

@@ -44,8 +44,9 @@ static uint32_t ExtraCost_SSE41(const uint32_t* const a, int length) {
   return HorizontalSum_SSE41(cost);
   return HorizontalSum_SSE41(cost);
 }
 }
 
 
-static uint32_t ExtraCostCombined_SSE41(const uint32_t* const a,
-                                        const uint32_t* const b, int length) {
+static uint32_t ExtraCostCombined_SSE41(const uint32_t* WEBP_RESTRICT const a,
+                                        const uint32_t* WEBP_RESTRICT const b,
+                                        int length) {
   int i;
   int i;
   __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
   __m128i cost = _mm_add_epi32(_mm_set_epi32(2 * a[7], 2 * a[6], a[5], a[4]),
                                _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
                                _mm_set_epi32(2 * b[7], 2 * b[6], b[5], b[4]));
@@ -95,10 +96,11 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 #define MK_CST_16(HI, LO) \
 #define MK_CST_16(HI, LO) \
   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
   _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
 
 
-static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorBlueTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                             int stride,
                                              int tile_width, int tile_height,
                                              int tile_width, int tile_height,
                                              int green_to_blue, int red_to_blue,
                                              int green_to_blue, int red_to_blue,
-                                             int histo[]) {
+                                             uint32_t histo[]) {
   const __m128i mult =
   const __m128i mult =
       MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
       MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
   const __m128i perm =
   const __m128i perm =
@@ -141,10 +143,11 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
   }
   }
 }
 }
 
 
-static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+static void CollectColorRedTransforms_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                            int stride,
                                             int tile_width, int tile_height,
                                             int tile_width, int tile_height,
-                                            int green_to_red, int histo[]) {
-
+                                            int green_to_red,
+                                            uint32_t histo[]) {
   const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
   const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
   if (tile_width >= 4) {
   if (tile_width >= 4) {

+ 23 - 23
thirdparty/libwebp/src/dsp/lossless_neon.c

@@ -26,8 +26,8 @@
 #if !defined(WORK_AROUND_GCC)
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc-4.8.x at least.
 // gcc-4.8.x at least.
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
   for (; src < end; src += 16) {
     uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
   VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
   VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
 }
 
 
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -53,8 +53,8 @@ static void ConvertBGRAToBGR_NEON(const uint32_t* src,
   VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
   VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
 }
 
 
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -71,8 +71,8 @@ static void ConvertBGRAToRGB_NEON(const uint32_t* src,
 
 
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 
 
-static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~1);
   const uint32_t* const end = src + (num_pixels & ~1);
   const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
   const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
   for (; src < end; src += 2) {
   for (; src < end; src += 2) {
@@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
   { 21, 22, 24, 25, 26, 28, 29, 30 }
   { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
 };
 
 
-static void ConvertBGRAToBGR_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
   const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
   { 21, 20, 26, 25, 24, 30, 29, 28 }
   { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
 };
 
 
-static void ConvertBGRAToRGB_NEON(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
   const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@@ -209,7 +209,7 @@ static uint32_t Predictor13_NEON(const uint32_t* const left,
 
 
 // Predictor0: ARGB_BLACK.
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
   const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -222,7 +222,7 @@ static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictor1: left.
 // Predictor1: left.
 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const uint8x16_t zero = LOADQ_U32_AS_U8(0);
   const uint8x16_t zero = LOADQ_U32_AS_U8(0);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -248,7 +248,7 @@ static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_1(X, IN)                                       \
 #define GENERATE_PREDICTOR_1(X, IN)                                       \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                    const uint32_t* upper, int num_pixels, \
                                    const uint32_t* upper, int num_pixels, \
-                                   uint32_t* out) {                       \
+                                   uint32_t* WEBP_RESTRICT out) {         \
   int i;                                                                  \
   int i;                                                                  \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
     const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
     const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
@@ -276,7 +276,7 @@ GENERATE_PREDICTOR_1(4, upper[i - 1])
 } while (0)
 } while (0)
 
 
 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -301,7 +301,7 @@ static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictor6: average(left, TL)
 // Predictor6: average(left, TL)
 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -317,7 +317,7 @@ static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictor7: average(left, T)
 // Predictor7: average(left, T)
 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -335,7 +335,7 @@ static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
 #define GENERATE_PREDICTOR_2(X, IN)                                       \
 #define GENERATE_PREDICTOR_2(X, IN)                                       \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
 static void PredictorAdd##X##_NEON(const uint32_t* in,                    \
                                    const uint32_t* upper, int num_pixels, \
                                    const uint32_t* upper, int num_pixels, \
-                                   uint32_t* out) {                       \
+                                   uint32_t* WEBP_RESTRICT out) {         \
   int i;                                                                  \
   int i;                                                                  \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
     const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
     const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);                      \
@@ -363,7 +363,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 } while (0)
 
 
 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -394,7 +394,7 @@ static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 } while (0)
 
 
 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -427,7 +427,7 @@ static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 } while (0)
 
 
 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
   uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -468,7 +468,7 @@ static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
 } while (0)
 } while (0)
 
 
 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {

+ 21 - 17
thirdparty/libwebp/src/dsp/lossless_sse2.c

@@ -186,7 +186,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left,
 
 
 // Predictor0: ARGB_BLACK.
 // Predictor0: ARGB_BLACK.
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   const __m128i black = _mm_set1_epi32((int)ARGB_BLACK);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -202,7 +202,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
 
 
 // Predictor1: left.
 // Predictor1: left.
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
-                               int num_pixels, uint32_t* out) {
+                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   __m128i prev = _mm_set1_epi32((int)out[-1]);
   __m128i prev = _mm_set1_epi32((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -230,7 +230,8 @@ static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
 // per 8 bit channel.
 // per 8 bit channel.
 #define GENERATE_PREDICTOR_1(X, IN)                                           \
 #define GENERATE_PREDICTOR_1(X, IN)                                           \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                  int num_pixels, uint32_t* out) {            \
+                                   int num_pixels,                            \
+                                   uint32_t* WEBP_RESTRICT out) {             \
   int i;                                                                      \
   int i;                                                                      \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
@@ -259,7 +260,8 @@ GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
 
 
 #define GENERATE_PREDICTOR_2(X, IN)                                           \
 #define GENERATE_PREDICTOR_2(X, IN)                                           \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
 static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
+                                   int num_pixels,                            \
+                                   uint32_t* WEBP_RESTRICT out) {             \
   int i;                                                                      \
   int i;                                                                      \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
   for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
     const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
     const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
@@ -297,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 } while (0)
 } while (0)
 
 
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
   for (i = 0; i + 4 <= num_pixels; i += 4) {
@@ -344,7 +346,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 } while (0)
 
 
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   __m128i pa;
   __m128i pa;
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
   __m128i L = _mm_cvtsi32_si128((int)out[-1]);
@@ -395,7 +397,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
 } while (0)
 } while (0)
 
 
 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
 static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
-                                int num_pixels, uint32_t* out) {
+                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
   int i;
   int i;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
   const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]);
@@ -490,8 +492,8 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 // Color-space conversion functions
 
 
-static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
-                                  uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   __m128i* out = (__m128i*)dst;
 
 
@@ -526,8 +528,8 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
   }
   }
 }
 }
 
 
-static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
   const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff);
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   __m128i* out = (__m128i*)dst;
@@ -554,8 +556,9 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
   }
   }
 }
 }
 
 
-static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
-                                       int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                       int num_pixels,
+                                       uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
   const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;
@@ -590,8 +593,9 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
   }
   }
 }
 }
 
 
-static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
-                                     int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                     int num_pixels,
+                                     uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
   const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0);
   const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
   const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@@ -631,8 +635,8 @@ static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
   }
   }
 }
 }
 
 
-static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE2(const uint32_t* WEBP_RESTRICT src,
+                                  int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;

+ 4 - 4
thirdparty/libwebp/src/dsp/lossless_sse41.c

@@ -77,8 +77,8 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
   }                                                   \
   }                                                   \
 } while (0)
 } while (0)
 
 
-static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
-                                   uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   __m128i* out = (__m128i*)dst;
   const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
   const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
@@ -95,8 +95,8 @@ static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
   }
   }
 }
 }
 
 
-static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src,
+                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
   const __m128i* in = (const __m128i*)src;
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   __m128i* out = (__m128i*)dst;
   const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
   const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,

+ 6 - 5
thirdparty/libwebp/src/dsp/rescaler.c

@@ -26,8 +26,8 @@
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Row import
 // Row import
 
 
-void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+void WebPRescalerImportRowExpand_C(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
   int channel;
@@ -59,8 +59,8 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
   }
   }
 }
 }
 
 
-void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+void WebPRescalerImportRowShrink_C(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
   int channel;
@@ -158,7 +158,8 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Main entry calls
 // Main entry calls
 
 
-void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRow(WebPRescaler* WEBP_RESTRICT const wrk,
+                           const uint8_t* WEBP_RESTRICT src) {
   assert(!WebPRescalerInputDone(wrk));
   assert(!WebPRescalerInputDone(wrk));
   if (!wrk->x_expand) {
   if (!wrk->x_expand) {
     WebPRescalerImportRowShrink(wrk, src);
     WebPRescalerImportRowShrink(wrk, src);

+ 4 - 4
thirdparty/libwebp/src/dsp/rescaler_mips32.c

@@ -21,8 +21,8 @@
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Row import
 // Row import
 
 
-static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+static void ImportRowShrink_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int fx_scale = wrk->fx_scale;
   const int fx_scale = wrk->fx_scale;
@@ -81,8 +81,8 @@ static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
   }
   }
 }
 }
 
 
-static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
-                                   const uint8_t* src) {
+static void ImportRowExpand_MIPS32(WebPRescaler* WEBP_RESTRICT const wrk,
+                                   const uint8_t* WEBP_RESTRICT src) {
   const int x_stride = wrk->num_channels;
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
   const int x_add = wrk->x_add;

+ 14 - 13
thirdparty/libwebp/src/dsp/rescaler_msa.c

@@ -114,9 +114,9 @@
   dst = __msa_copy_s_w((v4i32)t0, 0);                             \
   dst = __msa_copy_s_w((v4i32)t0, 0);                             \
 } while (0)
 } while (0)
 
 
-static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
-                                          int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowExpand_0(
+    const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4i32 zero = { 0 };
   const v4i32 zero = { 0 };
@@ -171,9 +171,10 @@ static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
-                                          uint8_t* dst, int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowExpand_1(
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
+    uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
   const v4i32 B1 = __msa_fill_w(B);
   const v4i32 B1 = __msa_fill_w(B);
@@ -262,10 +263,10 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
 }
 }
 
 
 #if 0  // disabled for now. TODO(skal): make match the C-code
 #if 0  // disabled for now. TODO(skal): make match the C-code
-static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
-                                          uint8_t* dst, int length,
-                                          const uint32_t yscale,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowShrink_0(
+    const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow,
+    uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
@@ -348,9 +349,9 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
   }
   }
 }
 }
 
 
-static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
-                                          int length,
-                                          WebPRescaler* const wrk) {
+static WEBP_INLINE void ExportRowShrink_1(
+    uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length,
+    WebPRescaler* WEBP_RESTRICT const wrk) {
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
   const v4i32 zero = { 0 };
   const v4i32 zero = { 0 };

+ 2 - 2
thirdparty/libwebp/src/dsp/rescaler_neon.c

@@ -45,8 +45,8 @@
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #endif
 #endif
 
 
-static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
-                                   const rescaler_t* const irow,
+static uint32x4_t Interpolate_NEON(const rescaler_t* WEBP_RESTRICT const frow,
+                                   const rescaler_t* WEBP_RESTRICT const irow,
                                    uint32_t A, uint32_t B) {
                                    uint32_t A, uint32_t B) {
   LOAD_32x4(frow, A0);
   LOAD_32x4(frow, A0);
   LOAD_32x4(irow, B0);
   LOAD_32x4(irow, B0);

+ 8 - 10
thirdparty/libwebp/src/dsp/rescaler_sse2.c

@@ -43,8 +43,8 @@ static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
   *out = _mm_unpacklo_epi8(A, zero);
   *out = _mm_unpacklo_epi8(A, zero);
 }
 }
 
 
-static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
-                                         const uint8_t* src) {
+static void RescalerImportRowExpand_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
+                                         const uint8_t* WEBP_RESTRICT src) {
   rescaler_t* frow = wrk->frow;
   rescaler_t* frow = wrk->frow;
   const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
   const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
   const int x_add = wrk->x_add;
@@ -109,8 +109,8 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
   assert(accum == 0);
   assert(accum == 0);
 }
 }
 
 
-static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
-                                         const uint8_t* src) {
+static void RescalerImportRowShrink_SSE2(WebPRescaler* WEBP_RESTRICT const wrk,
+                                         const uint8_t* WEBP_RESTRICT src) {
   const int x_sub = wrk->x_sub;
   const int x_sub = wrk->x_sub;
   int accum = 0;
   int accum = 0;
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
@@ -168,12 +168,10 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
 // Row export
 // Row export
 
 
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
-                                                 const __m128i* const mult,
-                                                 __m128i* const out0,
-                                                 __m128i* const out1,
-                                                 __m128i* const out2,
-                                                 __m128i* const out3) {
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(
+    const rescaler_t* WEBP_RESTRICT const src, const __m128i* const mult,
+    __m128i* const out0, __m128i* const out1, __m128i* const out2,
+    __m128i* const out3) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
   const __m128i A2 = _mm_srli_epi64(A0, 32);
   const __m128i A2 = _mm_srli_epi64(A0, 32);

+ 24 - 12
thirdparty/libwebp/src/dsp/upsampling.c

@@ -35,10 +35,14 @@ WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
   int x;                                                                       \
   int x;                                                                       \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
@@ -136,10 +140,14 @@ static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
 
 
 #if !defined(FANCY_UPSAMPLING)
 #if !defined(FANCY_UPSAMPLING)
 #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
 #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* bot_u, const uint8_t* bot_v,              \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bot_y,                      \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT bot_u,                      \
+                      const uint8_t* WEBP_RESTRICT bot_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bot_dst, int len) {               \
   const int half_len = len >> 1;                                               \
   const int half_len = len >> 1;                                               \
   int x;                                                                       \
   int x;                                                                       \
   assert(top_dst != NULL);                                                     \
   assert(top_dst != NULL);                                                     \
@@ -178,10 +186,14 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
 // YUV444 converter
 // YUV444 converter
 
 
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
-                      uint8_t* dst, int len);                                  \
-void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
-               uint8_t* dst, int len) {                                        \
+extern void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len);                    \
+void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                                 \
+               const uint8_t* WEBP_RESTRICT u,                                 \
+               const uint8_t* WEBP_RESTRICT v,                                 \
+               uint8_t* WEBP_RESTRICT dst, int len) {                          \
   int i;                                                                       \
   int i;                                                                       \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]);         \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]);         \
 }
 }

+ 12 - 6
thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c

@@ -143,10 +143,14 @@ static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
   int x;                                                                       \
   int x;                                                                       \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
@@ -241,8 +245,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
 // YUV444 converter
 // YUV444 converter
 
 
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
-                      uint8_t* dst, int len) {                                 \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   int i;                                                                       \
   int i;                                                                       \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }
 }

+ 36 - 19
thirdparty/libwebp/src/dsp/upsampling_msa.c

@@ -320,8 +320,10 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
 }
 }
 
 
 #if !defined(WEBP_REDUCE_CSP)
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
-                         const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   v16u8 R, G, B;
   while (length >= 16) {
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     CALC_RGB16(y, u, v, R, G, B);
@@ -347,8 +349,10 @@ static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
   }
   }
 }
 }
 
 
-static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
-                         const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   v16u8 R, G, B;
   while (length >= 16) {
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     CALC_RGB16(y, u, v, R, G, B);
@@ -375,8 +379,10 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 }
 }
 #endif  // WEBP_REDUCE_CSP
 #endif  // WEBP_REDUCE_CSP
 
 
-static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
   while (length >= 16) {
@@ -403,8 +409,10 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
   }
   }
 }
 }
 
 
-static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
   while (length >= 16) {
@@ -432,8 +440,10 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 }
 }
 
 
 #if !defined(WEBP_REDUCE_CSP)
 #if !defined(WEBP_REDUCE_CSP)
-static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
-                          const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
+                          const uint8_t* WEBP_RESTRICT u,
+                          const uint8_t* WEBP_RESTRICT v,
+                          uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B;
   v16u8 R, G, B;
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
   while (length >= 16) {
@@ -460,8 +470,10 @@ static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
   }
   }
 }
 }
 
 
-static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
-                              const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B, RG, BA, tmp0, tmp1;
   v16u8 R, G, B, RG, BA, tmp0, tmp1;
   while (length >= 16) {
   while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
 #if (WEBP_SWAP_16BIT_CSP == 1)
@@ -496,8 +508,10 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   }
   }
 }
 }
 
 
-static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
-                            const uint8_t* v, uint8_t* dst, int length) {
+static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
+                            const uint8_t* WEBP_RESTRICT u,
+                            const uint8_t* WEBP_RESTRICT v,
+                            uint8_t* WEBP_RESTRICT dst, int length) {
   v16u8 R, G, B, RG, GB, tmp0, tmp1;
   v16u8 R, G, B, RG, GB, tmp0, tmp1;
   while (length >= 16) {
   while (length >= 16) {
 #if (WEBP_SWAP_16BIT_CSP == 1)
 #if (WEBP_SWAP_16BIT_CSP == 1)
@@ -564,11 +578,14 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
 } while (0)
 } while (0)
 
 
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
-                      const uint8_t* top_u, const uint8_t* top_v,        \
-                      const uint8_t* cur_u, const uint8_t* cur_v,        \
-                      uint8_t* top_dst, uint8_t* bot_dst, int len)       \
-{                                                                        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                \
+                      const uint8_t* WEBP_RESTRICT bot_y,                \
+                      const uint8_t* WEBP_RESTRICT top_u,                \
+                      const uint8_t* WEBP_RESTRICT top_v,                \
+                      const uint8_t* WEBP_RESTRICT cur_u,                \
+                      const uint8_t* WEBP_RESTRICT cur_v,                \
+                      uint8_t* WEBP_RESTRICT top_dst,                    \
+                      uint8_t* WEBP_RESTRICT bot_dst, int len) {         \
   int size = (len - 1) >> 1;                                             \
   int size = (len - 1) >> 1;                                             \
   uint8_t temp_u[64];                                                    \
   uint8_t temp_u[64];                                                    \
   uint8_t temp_v[64];                                                    \
   uint8_t temp_v[64];                                                    \

+ 58 - 53
thirdparty/libwebp/src/dsp/upsampling_neon.c

@@ -58,8 +58,9 @@
 } while (0)
 } while (0)
 
 
 // Turn the macro into a function for reducing code-size when non-critical
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
-                                  uint8_t* out) {
+static void Upsample16Pixels_NEON(const uint8_t* WEBP_RESTRICT const r1,
+                                  const uint8_t* WEBP_RESTRICT const r2,
+                                  uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 }
 
 
@@ -189,57 +190,61 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
   }                                                                     \
   }                                                                     \
 }
 }
 
 
-#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
-                      const uint8_t* top_u, const uint8_t* top_v,       \
-                      const uint8_t* cur_u, const uint8_t* cur_v,       \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
-  int block;                                                            \
-  /* 16 byte aligned array to cache reconstructed u and v */            \
-  uint8_t uv_buf[2 * 32 + 15];                                          \
-  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
-  const int uv_len = (len + 1) >> 1;                                    \
-  /* 9 pixels must be read-able for each block */                       \
-  const int num_blocks = (uv_len - 1) >> 3;                             \
-  const int leftover = uv_len - num_blocks * 8;                         \
-  const int last_pos = 1 + 16 * num_blocks;                             \
-                                                                        \
-  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
-  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
-                                                                        \
-  const int16x4_t coeff1 = vld1_s16(kCoeffs1);                          \
-  const int16x8_t R_Rounder = vdupq_n_s16(-14234);                      \
-  const int16x8_t G_Rounder = vdupq_n_s16(8708);                        \
-  const int16x8_t B_Rounder = vdupq_n_s16(-17685);                      \
-                                                                        \
-  /* Treat the first pixel in regular way */                            \
-  assert(top_y != NULL);                                                \
-  {                                                                     \
-    const int u0 = (top_u[0] + u_diag) >> 1;                            \
-    const int v0 = (top_v[0] + v_diag) >> 1;                            \
-    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
-  }                                                                     \
-  if (bottom_y != NULL) {                                               \
-    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
-    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
-    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
-  }                                                                     \
-                                                                        \
-  for (block = 0; block < num_blocks; ++block) {                        \
-    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
-    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
-    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
-                  top_dst, bottom_dst, 16 * block + 1, 16);             \
-    top_u += 8;                                                         \
-    cur_u += 8;                                                         \
-    top_v += 8;                                                         \
-    cur_v += 8;                                                         \
-  }                                                                     \
-                                                                        \
-  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
-  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
-  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
-                top_dst, bottom_dst, last_pos, len - last_pos);         \
+#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                              \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
+  int block;                                                                   \
+  /* 16 byte aligned array to cache reconstructed u and v */                   \
+  uint8_t uv_buf[2 * 32 + 15];                                                 \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \
+  const int uv_len = (len + 1) >> 1;                                           \
+  /* 9 pixels must be read-able for each block */                              \
+  const int num_blocks = (uv_len - 1) >> 3;                                    \
+  const int leftover = uv_len - num_blocks * 8;                                \
+  const int last_pos = 1 + 16 * num_blocks;                                    \
+                                                                               \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                         \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                         \
+                                                                               \
+  const int16x4_t coeff1 = vld1_s16(kCoeffs1);                                 \
+  const int16x8_t R_Rounder = vdupq_n_s16(-14234);                             \
+  const int16x8_t G_Rounder = vdupq_n_s16(8708);                               \
+  const int16x8_t B_Rounder = vdupq_n_s16(-17685);                             \
+                                                                               \
+  /* Treat the first pixel in regular way */                                   \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
+    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
+    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                                \
+  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
+    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                          \
+  }                                                                            \
+                                                                               \
+  for (block = 0; block < num_blocks; ++block) {                               \
+    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                                     \
+    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                                \
+    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                           \
+                  top_dst, bottom_dst, 16 * block + 1, 16);                    \
+    top_u += 8;                                                                \
+    cur_u += 8;                                                                \
+    top_v += 8;                                                                \
+    cur_v += 8;                                                                \
+  }                                                                            \
+                                                                               \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                           \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);                      \
+  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,                 \
+                top_dst, bottom_dst, last_pos, len - last_pos);                \
 }
 }
 
 
 // NEON variants of the fancy upsampler.
 // NEON variants of the fancy upsampler.

+ 19 - 10
thirdparty/libwebp/src/dsp/upsampling_sse2.c

@@ -88,8 +88,9 @@
 } while (0)
 } while (0)
 
 
 // Turn the macro into a function for reducing code-size when non-critical
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
-                                  uint8_t* const out) {
+static void Upsample32Pixels_SSE2(const uint8_t* WEBP_RESTRICT const r1,
+                                  const uint8_t* WEBP_RESTRICT const r2,
+                                  uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 }
 
 
@@ -114,10 +115,14 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 } while (0)
 
 
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
   int uv_pos, pos;                                                             \
   int uv_pos, pos;                                                             \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
@@ -215,10 +220,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);
 extern void WebPInitYUV444ConvertersSSE2(void);
 
 
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
-                   uint8_t* dst, int len);                                     \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
-                      uint8_t* dst, int len) {                                 \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y,                             \
+                   const uint8_t* WEBP_RESTRICT u,                             \
+                   const uint8_t* WEBP_RESTRICT v,                             \
+                   uint8_t* WEBP_RESTRICT dst, int len);                       \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   int i;                                                                       \
   int i;                                                                       \
   const int max_len = len & ~31;                                               \
   const int max_len = len & ~31;                                               \
   for (i = 0; i < max_len; i += 32) {                                          \
   for (i = 0; i < max_len; i += 32) {                                          \

+ 20 - 11
thirdparty/libwebp/src/dsp/upsampling_sse41.c

@@ -90,8 +90,9 @@
 } while (0)
 } while (0)
 
 
 // Turn the macro into a function for reducing code-size when non-critical
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
-                                  uint8_t* const out) {
+static void Upsample32Pixels_SSE41(const uint8_t* WEBP_RESTRICT const r1,
+                                   const uint8_t* WEBP_RESTRICT const r2,
+                                   uint8_t* WEBP_RESTRICT const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 }
 
 
@@ -116,14 +117,18 @@ static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
 } while (0)
 } while (0)
 
 
 #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 #define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
-static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
-                      const uint8_t* top_u, const uint8_t* top_v,              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,              \
-                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                      \
+                      const uint8_t* WEBP_RESTRICT bottom_y,                   \
+                      const uint8_t* WEBP_RESTRICT top_u,                      \
+                      const uint8_t* WEBP_RESTRICT top_v,                      \
+                      const uint8_t* WEBP_RESTRICT cur_u,                      \
+                      const uint8_t* WEBP_RESTRICT cur_v,                      \
+                      uint8_t* WEBP_RESTRICT top_dst,                          \
+                      uint8_t* WEBP_RESTRICT bottom_dst, int len) {            \
   int uv_pos, pos;                                                             \
   int uv_pos, pos;                                                             \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
   uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15);  \
   uint8_t* const r_v = r_u + 32;                                               \
   uint8_t* const r_v = r_u + 32;                                               \
                                                                                \
                                                                                \
   assert(top_y != NULL);                                                       \
   assert(top_y != NULL);                                                       \
@@ -202,10 +207,14 @@ extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE41(void);
 extern void WebPInitYUV444ConvertersSSE41(void);
 
 
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
 #define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
-                   uint8_t* dst, int len);                                     \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
-                      uint8_t* dst, int len) {                                 \
+extern void CALL_C(const uint8_t* WEBP_RESTRICT y,                             \
+                   const uint8_t* WEBP_RESTRICT u,                             \
+                   const uint8_t* WEBP_RESTRICT v,                             \
+                   uint8_t* WEBP_RESTRICT dst, int len);                       \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   int i;                                                                       \
   int i;                                                                       \
   const int max_len = len & ~31;                                               \
   const int max_len = len & ~31;                                               \
   for (i = 0; i < max_len; i += 32) {                                          \
   for (i = 0; i < max_len; i += 32) {                                          \

+ 30 - 18
thirdparty/libwebp/src/dsp/yuv.c

@@ -20,9 +20,10 @@
 // Plain-C version
 // Plain-C version
 
 
 #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
 #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
-static void FUNC_NAME(const uint8_t* y,                                        \
-                      const uint8_t* u, const uint8_t* v,                      \
-                      uint8_t* dst, int len) {                                 \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   const uint8_t* const end = dst + (len & ~1) * (XSTEP);                       \
   const uint8_t* const end = dst + (len & ~1) * (XSTEP);                       \
   while (dst != end) {                                                         \
   while (dst != end) {                                                         \
     FUNC(y[0], u[0], v[0], dst);                                               \
     FUNC(y[0], u[0], v[0], dst);                                               \
@@ -49,9 +50,10 @@ ROW_FUNC(YuvToRgb565Row,   VP8YuvToRgb565, 2)
 #undef ROW_FUNC
 #undef ROW_FUNC
 
 
 // Main call for processing a plane with a WebPSamplerRowFunc function:
 // Main call for processing a plane with a WebPSamplerRowFunc function:
-void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
-                             const uint8_t* u, const uint8_t* v, int uv_stride,
-                             uint8_t* dst, int dst_stride,
+void WebPSamplerProcessPlane(const uint8_t* WEBP_RESTRICT y, int y_stride,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v, int uv_stride,
+                             uint8_t* WEBP_RESTRICT dst, int dst_stride,
                              int width, int height, WebPSamplerRowFunc func) {
                              int width, int height, WebPSamplerRowFunc func) {
   int j;
   int j;
   for (j = 0; j < height; ++j) {
   for (j = 0; j < height; ++j) {
@@ -117,7 +119,8 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // ARGB -> YUV converters
 // ARGB -> YUV converters
 
 
-static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* WEBP_RESTRICT argb,
+                             uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i < width; ++i) {
   for (i = 0; i < width; ++i) {
     const uint32_t p = argb[i];
     const uint32_t p = argb[i];
@@ -126,7 +129,8 @@ static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+void WebPConvertARGBToUV_C(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                            int src_width, int do_store) {
                            int src_width, int do_store) {
   // No rounding. Last pixel is dealt with separately.
   // No rounding. Last pixel is dealt with separately.
   const int uv_width = src_width >> 1;
   const int uv_width = src_width >> 1;
@@ -169,22 +173,25 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 
 
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 
 
-static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* WEBP_RESTRICT rgb,
+                              uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i < width; ++i, rgb += 3) {
   for (i = 0; i < width; ++i, rgb += 3) {
     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
   }
   }
 }
 }
 
 
-static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* WEBP_RESTRICT bgr,
+                              uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i < width; ++i, bgr += 3) {
   for (i = 0; i < width; ++i, bgr += 3) {
     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
   }
   }
 }
 }
 
 
-void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
-                             uint8_t* u, uint8_t* v, int width) {
+void WebPConvertRGBA32ToUV_C(const uint16_t* WEBP_RESTRICT rgb,
+                             uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
+                             int width) {
   int i;
   int i;
   for (i = 0; i < width; i += 1, rgb += 4) {
   for (i = 0; i < width; i += 1, rgb += 4) {
     const int r = rgb[0], g = rgb[1], b = rgb[2];
     const int r = rgb[0], g = rgb[1], b = rgb[2];
@@ -195,13 +202,18 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 
 
-void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
-void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
-void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
-                              uint8_t* u, uint8_t* v, int width);
+void (*WebPConvertRGB24ToY)(const uint8_t* WEBP_RESTRICT rgb,
+                            uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertBGR24ToY)(const uint8_t* WEBP_RESTRICT bgr,
+                            uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertRGBA32ToUV)(const uint16_t* WEBP_RESTRICT rgb,
+                              uint8_t* WEBP_RESTRICT u,
+                              uint8_t* WEBP_RESTRICT v, int width);
 
 
-void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
-void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+void (*WebPConvertARGBToY)(const uint32_t* WEBP_RESTRICT argb,
+                           uint8_t* WEBP_RESTRICT y, int width);
+void (*WebPConvertARGBToUV)(const uint32_t* WEBP_RESTRICT argb,
+                            uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                             int src_width, int do_store);
                             int src_width, int do_store);
 
 
 extern void WebPInitConvertARGBToYUVSSE2(void);
 extern void WebPInitConvertARGBToYUVSSE2(void);

+ 41 - 23
thirdparty/libwebp/src/dsp/yuv.h

@@ -11,15 +11,15 @@
 //
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
 // More information at: https://en.wikipedia.org/wiki/YCbCr
 // More information at: https://en.wikipedia.org/wiki/YCbCr
-// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
-// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
-// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// Y = 0.2568 * R + 0.5041 * G + 0.0979 * B + 16
+// U = -0.1482 * R - 0.2910 * G + 0.4392 * B + 128
+// V = 0.4392 * R - 0.3678 * G - 0.0714 * B + 128
 // We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 // We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 //
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
-//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
-//   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
+//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.392 * (U-128)
+//   B = 1.164 * (Y-16)                   + 2.017 * (U-128)
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 //
 //
 // The fixed-point implementation used here is:
 // The fixed-point implementation used here is:
@@ -149,20 +149,34 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 #if defined(WEBP_USE_SSE2)
 #if defined(WEBP_USE_SSE2)
 
 
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst);
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst);
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst);
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst);
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+                           const uint8_t* WEBP_RESTRICT u,
+                           const uint8_t* WEBP_RESTRICT v,
+                           uint8_t* WEBP_RESTRICT dst);
 
 
 #endif    // WEBP_USE_SSE2
 #endif    // WEBP_USE_SSE2
 
 
@@ -172,10 +186,14 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 #if defined(WEBP_USE_SSE41)
 #if defined(WEBP_USE_SSE41)
 
 
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
+void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst);
 
 
 #endif    // WEBP_USE_SSE41
 #endif    // WEBP_USE_SSE41
 
 

+ 4 - 3
thirdparty/libwebp/src/dsp/yuv_mips32.c

@@ -22,9 +22,10 @@
 // simple point-sampling
 // simple point-sampling
 
 
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
-static void FUNC_NAME(const uint8_t* y,                                        \
-                      const uint8_t* u, const uint8_t* v,                      \
-                      uint8_t* dst, int len) {                                 \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   int i, r, g, b;                                                              \
   int i, r, g, b;                                                              \
   int temp0, temp1, temp2, temp3, temp4;                                       \
   int temp0, temp1, temp2, temp3, temp4;                                       \
   for (i = 0; i < (len >> 1); i++) {                                           \
   for (i = 0; i < (len >> 1); i++) {                                           \

+ 4 - 3
thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c

@@ -69,9 +69,10 @@
   : "memory", "hi", "lo"                                                       \
   : "memory", "hi", "lo"                                                       \
 
 
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
 #define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A)                                 \
-static void FUNC_NAME(const uint8_t* y,                                        \
-                      const uint8_t* u, const uint8_t* v,                      \
-                      uint8_t* dst, int len) {                                 \
+static void FUNC_NAME(const uint8_t* WEBP_RESTRICT y,                          \
+                      const uint8_t* WEBP_RESTRICT u,                          \
+                      const uint8_t* WEBP_RESTRICT v,                          \
+                      uint8_t* WEBP_RESTRICT dst, int len) {                   \
   int i;                                                                       \
   int i;                                                                       \
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;             \
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;             \
   const int t_con_1 = 26149;                                                   \
   const int t_con_1 = 26149;                                                   \

+ 12 - 6
thirdparty/libwebp/src/dsp/yuv_neon.c

@@ -46,7 +46,8 @@ static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
   return vqmovn_u16(Y2);
   return vqmovn_u16(Y2);
 }
 }
 
 
-static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_NEON(const uint8_t* WEBP_RESTRICT rgb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
   for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
     const uint8x8x3_t RGB = vld3_u8(rgb);
     const uint8x8x3_t RGB = vld3_u8(rgb);
@@ -58,7 +59,8 @@ static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_NEON(const uint8_t* WEBP_RESTRICT bgr,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
   for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
     const uint8x8x3_t BGR = vld3_u8(bgr);
     const uint8x8x3_t BGR = vld3_u8(bgr);
@@ -70,7 +72,8 @@ static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_NEON(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT y, int width) {
   int i;
   int i;
   for (i = 0; i + 8 <= width; i += 8) {
   for (i = 0; i + 8 <= width; i += 8) {
     const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
     const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
@@ -114,8 +117,9 @@ static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
   MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
   MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
 } while (0)
 } while (0)
 
 
-static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_NEON(const uint16_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v, int width) {
   int i;
   int i;
   for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
   for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
     const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
     const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
@@ -131,7 +135,9 @@ static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
   }
   }
 }
 }
 
 
-static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_NEON(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT u,
+                                 uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store) {
                                  int src_width, int do_store) {
   int i;
   int i;
   for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
   for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {

+ 76 - 54
thirdparty/libwebp/src/dsp/yuv_sse2.c

@@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
 }
 }
 
 
 // Convert 32 samples of YUV444 to R/G/B
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE2(const uint8_t* const y,
-                             const uint8_t* const u,
-                             const uint8_t* const v,
+static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
+                             const uint8_t* WEBP_RESTRICT const u,
+                             const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
                              __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
@@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE2(const uint8_t* const y,
 }
 }
 
 
 // Convert 32 samples of YUV420 to R/G/B
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE2(const uint8_t* const y,
-                             const uint8_t* const u,
-                             const uint8_t* const v,
+static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
+                             const uint8_t* WEBP_RESTRICT const u,
+                             const uint8_t* WEBP_RESTRICT const v,
                              __m128i* const R, __m128i* const G,
                              __m128i* const R, __m128i* const G,
                              __m128i* const B) {
                              __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
   const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
@@ -108,7 +108,7 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            const __m128i* const B,
                                            const __m128i* const A,
                                            const __m128i* const A,
-                                           uint8_t* const dst) {
+                                           uint8_t* WEBP_RESTRICT const dst) {
   const __m128i rb = _mm_packus_epi16(*R, *B);
   const __m128i rb = _mm_packus_epi16(*R, *B);
   const __m128i ga = _mm_packus_epi16(*G, *A);
   const __m128i ga = _mm_packus_epi16(*G, *A);
   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@@ -120,11 +120,9 @@ static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
 }
 }
 
 
 // Pack R/G/B/A results into 16b output.
 // Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
-                                              const __m128i* const G,
-                                              const __m128i* const B,
-                                              const __m128i* const A,
-                                              uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore4444_SSE2(
+     const __m128i* const R, const __m128i* const G, const __m128i* const B,
+     const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
 #if (WEBP_SWAP_16BIT_CSP == 0)
 #if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rg0 = _mm_packus_epi16(*R, *G);
   const __m128i rg0 = _mm_packus_epi16(*R, *G);
   const __m128i ba0 = _mm_packus_epi16(*B, *A);
   const __m128i ba0 = _mm_packus_epi16(*B, *A);
@@ -145,7 +143,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
                                              const __m128i* const G,
                                              const __m128i* const G,
                                              const __m128i* const B,
                                              const __m128i* const B,
-                                             uint8_t* const dst) {
+                                             uint8_t* WEBP_RESTRICT const dst) {
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
@@ -170,7 +168,7 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
                                          __m128i* const in2, __m128i* const in3,
                                          __m128i* const in2, __m128i* const in3,
                                          __m128i* const in4, __m128i* const in5,
                                          __m128i* const in4, __m128i* const in5,
-                                         uint8_t* const rgb) {
+                                         uint8_t* WEBP_RESTRICT const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
   // To pack, we will keep taking one every two 8b integer and move it
@@ -193,8 +191,10 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
 }
 
 
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -204,8 +204,10 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
   }
 }
 }
 
 
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -215,8 +217,10 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
   }
 }
 }
 
 
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
   for (n = 0; n < 32; n += 8, dst += 32) {
@@ -226,8 +230,10 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
   }
 }
 }
 
 
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
   for (n = 0; n < 32; n += 8, dst += 16) {
@@ -237,8 +243,10 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
   }
   }
 }
 }
 
 
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
+                           const uint8_t* WEBP_RESTRICT u,
+                           const uint8_t* WEBP_RESTRICT v,
+                           uint8_t* WEBP_RESTRICT dst) {
   int n;
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
     __m128i R, G, B;
@@ -247,8 +255,10 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
   }
 }
 }
 
 
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
 
@@ -269,8 +279,10 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
   PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
 }
 
 
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
+                        const uint8_t* WEBP_RESTRICT u,
+                        const uint8_t* WEBP_RESTRICT v,
+                        uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
 
@@ -294,9 +306,10 @@ void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
 // Arbitrary-length row conversion functions
 
 
-static void YuvToRgbaRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -316,9 +329,10 @@ static void YuvToRgbaRow_SSE2(const uint8_t* y,
   }
   }
 }
 }
 
 
-static void YuvToBgraRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -338,9 +352,10 @@ static void YuvToBgraRow_SSE2(const uint8_t* y,
   }
   }
 }
 }
 
 
-static void YuvToArgbRow_SSE2(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
@@ -360,9 +375,10 @@ static void YuvToArgbRow_SSE2(const uint8_t* y,
   }
   }
 }
 }
 
 
-static void YuvToRgbRow_SSE2(const uint8_t* y,
-                             const uint8_t* u, const uint8_t* v,
-                             uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -397,9 +413,10 @@ static void YuvToRgbRow_SSE2(const uint8_t* y,
   }
   }
 }
 }
 
 
-static void YuvToBgrRow_SSE2(const uint8_t* y,
-                             const uint8_t* u, const uint8_t* v,
-                             uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
+                             const uint8_t* WEBP_RESTRICT u,
+                             const uint8_t* WEBP_RESTRICT v,
+                             uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -471,7 +488,7 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
   __m128i tmp[6];
   __m128i tmp[6];
   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -488,8 +505,8 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
 }
 }
 
 
 // Convert 8 packed ARGB to r[], g[], b[]
 // Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
-                                                 __m128i* const rgb /*in[6]*/) {
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
   __m128i a1 = LOAD_16(argb + 4);
@@ -562,7 +579,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
 #undef MK_CST_16
 #undef MK_CST_16
 #undef TRANSFORM
 #undef TRANSFORM
 
 
-static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   const int max_width = width & ~31;
   int i;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@@ -596,7 +614,8 @@ static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   const int max_width = width & ~31;
   int i;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@@ -630,7 +649,8 @@ static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~15;
   const int max_width = width & ~15;
   int i;
   int i;
   for (i = 0; i < max_width; i += 16) {
   for (i = 0; i < max_width; i += 16) {
@@ -658,8 +678,9 @@ static void HorizontalAddPack_SSE2(const __m128i* const A,
   *out = _mm_packs_epi32(C, D);
   *out = _mm_packs_epi32(C, D);
 }
 }
 
 
-static void ConvertARGBToUV_SSE2(const uint32_t* argb,
-                                 uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT u,
+                                 uint8_t* WEBP_RESTRICT v,
                                  int src_width, int do_store) {
                                  int src_width, int do_store) {
   const int max_width = src_width & ~31;
   const int max_width = src_width & ~31;
   int i;
   int i;
@@ -695,7 +716,7 @@ static void ConvertARGBToUV_SSE2(const uint32_t* argb,
 
 
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
     __m128i* const r, __m128i* const g, __m128i* const b) {
     __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@@ -715,8 +736,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
   *b = _mm_unpacklo_epi64(B1, B3);
   *b = _mm_unpacklo_epi64(B1, B3);
 }
 }
 
 
-static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
+                                   uint8_t* WEBP_RESTRICT u,
+                                   uint8_t* WEBP_RESTRICT v, int width) {
   const int max_width = width & ~15;
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {
   while (rgb < last_rgb) {

+ 38 - 27
thirdparty/libwebp/src/dsp/yuv_sse41.c

@@ -82,9 +82,9 @@ static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
 }
 }
 
 
 // Convert 32 samples of YUV444 to R/G/B
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB_SSE41(const uint8_t* const y,
-                              const uint8_t* const u,
-                              const uint8_t* const v,
+static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
+                              const uint8_t* WEBP_RESTRICT const u,
+                              const uint8_t* WEBP_RESTRICT const v,
                               __m128i* const R, __m128i* const G,
                               __m128i* const R, __m128i* const G,
                               __m128i* const B) {
                               __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
@@ -93,9 +93,9 @@ static void YUV444ToRGB_SSE41(const uint8_t* const y,
 }
 }
 
 
 // Convert 32 samples of YUV420 to R/G/B
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB_SSE41(const uint8_t* const y,
-                              const uint8_t* const u,
-                              const uint8_t* const v,
+static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
+                              const uint8_t* WEBP_RESTRICT const u,
+                              const uint8_t* WEBP_RESTRICT const v,
                               __m128i* const R, __m128i* const G,
                               __m128i* const R, __m128i* const G,
                               __m128i* const B) {
                               __m128i* const B) {
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
   const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
@@ -109,7 +109,7 @@ static void YUV420ToRGB_SSE41(const uint8_t* const y,
 static WEBP_INLINE void PlanarTo24b_SSE41(
 static WEBP_INLINE void PlanarTo24b_SSE41(
     __m128i* const in0, __m128i* const in1, __m128i* const in2,
     __m128i* const in0, __m128i* const in1, __m128i* const in2,
     __m128i* const in3, __m128i* const in4, __m128i* const in5,
     __m128i* const in3, __m128i* const in4, __m128i* const in5,
-    uint8_t* const rgb) {
+    uint8_t* WEBP_RESTRICT const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
   // To pack, we will keep taking one every two 8b integer and move it
@@ -132,8 +132,10 @@ static WEBP_INLINE void PlanarTo24b_SSE41(
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
 }
 
 
-void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
 
@@ -154,8 +156,10 @@ void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
   PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
 }
 
 
-void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
+                         const uint8_t* WEBP_RESTRICT u,
+                         const uint8_t* WEBP_RESTRICT v,
+                         uint8_t* WEBP_RESTRICT dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
 
@@ -179,9 +183,10 @@ void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 //-----------------------------------------------------------------------------
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
 // Arbitrary-length row conversion functions
 
 
-static void YuvToRgbRow_SSE41(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -216,9 +221,10 @@ static void YuvToRgbRow_SSE41(const uint8_t* y,
   }
   }
 }
 }
 
 
-static void YuvToBgrRow_SSE41(const uint8_t* y,
-                              const uint8_t* u, const uint8_t* v,
-                              uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
+                              const uint8_t* WEBP_RESTRICT u,
+                              const uint8_t* WEBP_RESTRICT v,
+                              uint8_t* WEBP_RESTRICT dst, int len) {
   int n;
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
@@ -290,7 +296,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
 // Similar to PlanarTo24bHelper(), but in reverse order.
 static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
 static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
-    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
   const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
   const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
@@ -334,7 +340,7 @@ static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
 
 
 // Convert 8 packed ARGB to r[], g[], b[]
 // Convert 8 packed ARGB to r[], g[], b[]
 static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
 static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
-    const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
+    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
   __m128i a1 = LOAD_16(argb + 4);
@@ -407,7 +413,8 @@ static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
 #undef MK_CST_16
 #undef MK_CST_16
 #undef TRANSFORM
 #undef TRANSFORM
 
 
-static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
+                                  uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   const int max_width = width & ~31;
   int i;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
@@ -441,7 +448,8 @@ static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
+                                  uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~31;
   const int max_width = width & ~31;
   int i;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
@@ -475,7 +483,8 @@ static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
   }
   }
 }
 }
 
 
-static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                 uint8_t* WEBP_RESTRICT y, int width) {
   const int max_width = width & ~15;
   const int max_width = width & ~15;
   int i;
   int i;
   for (i = 0; i < max_width; i += 16) {
   for (i = 0; i < max_width; i += 16) {
@@ -503,8 +512,9 @@ static void HorizontalAddPack_SSE41(const __m128i* const A,
   *out = _mm_packs_epi32(C, D);
   *out = _mm_packs_epi32(C, D);
 }
 }
 
 
-static void ConvertARGBToUV_SSE41(const uint32_t* argb,
-                                  uint8_t* u, uint8_t* v,
+static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
+                                  uint8_t* WEBP_RESTRICT u,
+                                  uint8_t* WEBP_RESTRICT v,
                                   int src_width, int do_store) {
                                   int src_width, int do_store) {
   const int max_width = src_width & ~31;
   const int max_width = src_width & ~31;
   int i;
   int i;
@@ -540,7 +550,7 @@ static void ConvertARGBToUV_SSE41(const uint32_t* argb,
 
 
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
-    const uint16_t* const rgbx,
+    const uint16_t* WEBP_RESTRICT const rgbx,
     __m128i* const r, __m128i* const g, __m128i* const b) {
     __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
@@ -570,8 +580,9 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
   *b = _mm_unpackhi_epi64(B1, B3);
   *b = _mm_unpackhi_epi64(B1, B3);
 }
 }
 
 
-static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
-                                    uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
+                                    uint8_t* WEBP_RESTRICT u,
+                                    uint8_t* WEBP_RESTRICT v, int width) {
   const int max_width = width & ~15;
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {
   while (rgb < last_rgb) {

+ 1 - 0
thirdparty/libwebp/src/enc/alpha_enc.c

@@ -276,6 +276,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
       stats->lossless_features = best.stats.lossless_features;
       stats->lossless_features = best.stats.lossless_features;
       stats->histogram_bits = best.stats.histogram_bits;
       stats->histogram_bits = best.stats.histogram_bits;
       stats->transform_bits = best.stats.transform_bits;
       stats->transform_bits = best.stats.transform_bits;
+      stats->cross_color_transform_bits = best.stats.cross_color_transform_bits;
       stats->cache_bits = best.stats.cache_bits;
       stats->cache_bits = best.stats.cache_bits;
       stats->palette_size = best.stats.palette_size;
       stats->palette_size = best.stats.palette_size;
       stats->lossless_size = best.stats.lossless_size;
       stats->lossless_size = best.stats.lossless_size;

+ 51 - 47
thirdparty/libwebp/src/enc/backward_references_cost_enc.c

@@ -15,7 +15,7 @@
 //
 //
 
 
 #include <assert.h>
 #include <assert.h>
-#include <float.h>
+#include <string.h>
 
 
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/lossless_common.h"
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/backward_references_enc.h"
@@ -31,15 +31,15 @@ extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                       const PixOrCopy v);
                                       const PixOrCopy v);
 
 
 typedef struct {
 typedef struct {
-  float alpha_[VALUES_IN_BYTE];
-  float red_[VALUES_IN_BYTE];
-  float blue_[VALUES_IN_BYTE];
-  float distance_[NUM_DISTANCE_CODES];
-  float* literal_;
+  uint32_t alpha_[VALUES_IN_BYTE];
+  uint32_t red_[VALUES_IN_BYTE];
+  uint32_t blue_[VALUES_IN_BYTE];
+  uint32_t distance_[NUM_DISTANCE_CODES];
+  uint32_t* literal_;
 } CostModel;
 } CostModel;
 
 
 static void ConvertPopulationCountTableToBitEstimates(
 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], float output[]) {
+    int num_symbols, const uint32_t population_counts[], uint32_t output[]) {
   uint32_t sum = 0;
   uint32_t sum = 0;
   int nonzeros = 0;
   int nonzeros = 0;
   int i;
   int i;
@@ -52,7 +52,7 @@ static void ConvertPopulationCountTableToBitEstimates(
   if (nonzeros <= 1) {
   if (nonzeros <= 1) {
     memset(output, 0, num_symbols * sizeof(*output));
     memset(output, 0, num_symbols * sizeof(*output));
   } else {
   } else {
-    const float logsum = VP8LFastLog2(sum);
+    const uint32_t logsum = VP8LFastLog2(sum);
     for (i = 0; i < num_symbols; ++i) {
     for (i = 0; i < num_symbols; ++i) {
       output[i] = logsum - VP8LFastLog2(population_counts[i]);
       output[i] = logsum - VP8LFastLog2(population_counts[i]);
     }
     }
@@ -93,47 +93,47 @@ static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
   return ok;
   return ok;
 }
 }
 
 
-static WEBP_INLINE float GetLiteralCost(const CostModel* const m, uint32_t v) {
-  return m->alpha_[v >> 24] +
-         m->red_[(v >> 16) & 0xff] +
-         m->literal_[(v >> 8) & 0xff] +
-         m->blue_[v & 0xff];
+static WEBP_INLINE int64_t GetLiteralCost(const CostModel* const m,
+                                          uint32_t v) {
+  return (int64_t)m->alpha_[v >> 24] + m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] + m->blue_[v & 0xff];
 }
 }
 
 
-static WEBP_INLINE float GetCacheCost(const CostModel* const m, uint32_t idx) {
+static WEBP_INLINE int64_t GetCacheCost(const CostModel* const m,
+                                        uint32_t idx) {
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
   const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return m->literal_[literal_idx];
+  return (int64_t)m->literal_[literal_idx];
 }
 }
 
 
-static WEBP_INLINE float GetLengthCost(const CostModel* const m,
-                                       uint32_t length) {
+static WEBP_INLINE int64_t GetLengthCost(const CostModel* const m,
+                                         uint32_t length) {
   int code, extra_bits;
   int code, extra_bits;
   VP8LPrefixEncodeBits(length, &code, &extra_bits);
   VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+  return (int64_t)m->literal_[VALUES_IN_BYTE + code] +
+         ((int64_t)extra_bits << LOG_2_PRECISION_BITS);
 }
 }
 
 
-static WEBP_INLINE float GetDistanceCost(const CostModel* const m,
-                                         uint32_t distance) {
+static WEBP_INLINE int64_t GetDistanceCost(const CostModel* const m,
+                                           uint32_t distance) {
   int code, extra_bits;
   int code, extra_bits;
   VP8LPrefixEncodeBits(distance, &code, &extra_bits);
   VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
+  return (int64_t)m->distance_[code] +
+         ((int64_t)extra_bits << LOG_2_PRECISION_BITS);
 }
 }
 
 
 static WEBP_INLINE void AddSingleLiteralWithCostModel(
 static WEBP_INLINE void AddSingleLiteralWithCostModel(
     const uint32_t* const argb, VP8LColorCache* const hashers,
     const uint32_t* const argb, VP8LColorCache* const hashers,
     const CostModel* const cost_model, int idx, int use_color_cache,
     const CostModel* const cost_model, int idx, int use_color_cache,
-    float prev_cost, float* const cost, uint16_t* const dist_array) {
-  float cost_val = prev_cost;
+    int64_t prev_cost, int64_t* const cost, uint16_t* const dist_array) {
+  int64_t cost_val = prev_cost;
   const uint32_t color = argb[idx];
   const uint32_t color = argb[idx];
   const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
   const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
   if (ix >= 0) {
   if (ix >= 0) {
     // use_color_cache is true and hashers contains color
     // use_color_cache is true and hashers contains color
-    const float mul0 = 0.68f;
-    cost_val += GetCacheCost(cost_model, ix) * mul0;
+    cost_val += DivRound(GetCacheCost(cost_model, ix) * 68, 100);
   } else {
   } else {
-    const float mul1 = 0.82f;
     if (use_color_cache) VP8LColorCacheInsert(hashers, color);
     if (use_color_cache) VP8LColorCacheInsert(hashers, color);
-    cost_val += GetLiteralCost(cost_model, color) * mul1;
+    cost_val += DivRound(GetLiteralCost(cost_model, color) * 82, 100);
   }
   }
   if (cost[idx] > cost_val) {
   if (cost[idx] > cost_val) {
     cost[idx] = cost_val;
     cost[idx] = cost_val;
@@ -163,7 +163,7 @@ static WEBP_INLINE void AddSingleLiteralWithCostModel(
 // therefore no overlapping intervals.
 // therefore no overlapping intervals.
 typedef struct CostInterval CostInterval;
 typedef struct CostInterval CostInterval;
 struct CostInterval {
 struct CostInterval {
-  float cost_;
+  int64_t cost_;
   int start_;
   int start_;
   int end_;
   int end_;
   int index_;
   int index_;
@@ -173,7 +173,7 @@ struct CostInterval {
 
 
 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 // The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
 typedef struct {
 typedef struct {
-  float cost_;
+  int64_t cost_;
   int start_;
   int start_;
   int end_;       // Exclusive.
   int end_;       // Exclusive.
 } CostCacheInterval;
 } CostCacheInterval;
@@ -188,8 +188,9 @@ typedef struct {
   int count_;  // The number of stored intervals.
   int count_;  // The number of stored intervals.
   CostCacheInterval* cache_intervals_;
   CostCacheInterval* cache_intervals_;
   size_t cache_intervals_size_;
   size_t cache_intervals_size_;
-  float cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
-  float* costs_;
+  // Contains the GetLengthCost(cost_model, k).
+  int64_t cost_cache_[MAX_LENGTH];
+  int64_t* costs_;
   uint16_t* dist_array_;
   uint16_t* dist_array_;
   // Most of the time, we only need few intervals -> use a free-list, to avoid
   // Most of the time, we only need few intervals -> use a free-list, to avoid
   // fragmentation with small allocs in most common cases.
   // fragmentation with small allocs in most common cases.
@@ -298,7 +299,7 @@ static int CostManagerInit(CostManager* const manager,
     cur->end_ = 1;
     cur->end_ = 1;
     cur->cost_ = manager->cost_cache_[0];
     cur->cost_ = manager->cost_cache_[0];
     for (i = 1; i < cost_cache_size; ++i) {
     for (i = 1; i < cost_cache_size; ++i) {
-      const float cost_val = manager->cost_cache_[i];
+      const int64_t cost_val = manager->cost_cache_[i];
       if (cost_val != cur->cost_) {
       if (cost_val != cur->cost_) {
         ++cur;
         ++cur;
         // Initialize an interval.
         // Initialize an interval.
@@ -311,13 +312,15 @@ static int CostManagerInit(CostManager* const manager,
            manager->cache_intervals_size_);
            manager->cache_intervals_size_);
   }
   }
 
 
-  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  manager->costs_ =
+      (int64_t*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
   if (manager->costs_ == NULL) {
   if (manager->costs_ == NULL) {
     CostManagerClear(manager);
     CostManagerClear(manager);
     return 0;
     return 0;
   }
   }
-  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = FLT_MAX;
+  // Set the initial costs_ to INT64_MAX for every pixel as we will keep the
+  // minimum.
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = WEBP_INT64_MAX;
 
 
   return 1;
   return 1;
 }
 }
@@ -325,7 +328,7 @@ static int CostManagerInit(CostManager* const manager,
 // Given the cost and the position that define an interval, update the cost at
 // Given the cost and the position that define an interval, update the cost at
 // pixel 'i' if it is smaller than the previously computed value.
 // pixel 'i' if it is smaller than the previously computed value.
 static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
 static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
-                                   int position, float cost) {
+                                   int position, int64_t cost) {
   const int k = i - position;
   const int k = i - position;
   assert(k >= 0 && k < MAX_LENGTH);
   assert(k >= 0 && k < MAX_LENGTH);
 
 
@@ -339,7 +342,7 @@ static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
 // all the pixels between 'start' and 'end' excluded.
 // all the pixels between 'start' and 'end' excluded.
 static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
 static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
                                               int start, int end, int position,
                                               int start, int end, int position,
-                                              float cost) {
+                                              int64_t cost) {
   int i;
   int i;
   for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
   for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
 }
 }
@@ -424,7 +427,7 @@ static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
 // interval_in as a hint. The intervals are sorted by start_ value.
 // interval_in as a hint. The intervals are sorted by start_ value.
 static WEBP_INLINE void InsertInterval(CostManager* const manager,
 static WEBP_INLINE void InsertInterval(CostManager* const manager,
                                        CostInterval* const interval_in,
                                        CostInterval* const interval_in,
-                                       float cost, int position, int start,
+                                       int64_t cost, int position, int start,
                                        int end) {
                                        int end) {
   CostInterval* interval_new;
   CostInterval* interval_new;
 
 
@@ -463,7 +466,7 @@ static WEBP_INLINE void InsertInterval(CostManager* const manager,
 // If handling the interval or one of its subintervals becomes to heavy, its
 // If handling the interval or one of its subintervals becomes to heavy, its
 // contribution is added to the costs right away.
 // contribution is added to the costs right away.
 static WEBP_INLINE void PushInterval(CostManager* const manager,
 static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     float distance_cost, int position,
+                                     int64_t distance_cost, int position,
                                      int len) {
                                      int len) {
   size_t i;
   size_t i;
   CostInterval* interval = manager->head_;
   CostInterval* interval = manager->head_;
@@ -478,7 +481,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     int j;
     int j;
     for (j = position; j < position + len; ++j) {
     for (j = position; j < position + len; ++j) {
       const int k = j - position;
       const int k = j - position;
-      float cost_tmp;
+      int64_t cost_tmp;
       assert(k >= 0 && k < MAX_LENGTH);
       assert(k >= 0 && k < MAX_LENGTH);
       cost_tmp = distance_cost + manager->cost_cache_[k];
       cost_tmp = distance_cost + manager->cost_cache_[k];
 
 
@@ -498,7 +501,7 @@ static WEBP_INLINE void PushInterval(CostManager* const manager,
     const int end = position + (cost_cache_intervals[i].end_ > len
     const int end = position + (cost_cache_intervals[i].end_ > len
                                  ? len
                                  ? len
                                  : cost_cache_intervals[i].end_);
                                  : cost_cache_intervals[i].end_);
-    const float cost = distance_cost + cost_cache_intervals[i].cost_;
+    const int64_t cost = distance_cost + cost_cache_intervals[i].cost_;
 
 
     for (; interval != NULL && interval->start_ < end;
     for (; interval != NULL && interval->start_ < end;
          interval = interval_next) {
          interval = interval_next) {
@@ -576,7 +579,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int pix_count = xsize * ysize;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   const int use_color_cache = (cache_bits > 0);
   const size_t literal_array_size =
   const size_t literal_array_size =
-      sizeof(float) * (VP8LHistogramNumCodes(cache_bits));
+      sizeof(*((CostModel*)NULL)->literal_) * VP8LHistogramNumCodes(cache_bits);
   const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
   const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
   CostModel* const cost_model =
   CostModel* const cost_model =
       (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
       (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
@@ -584,13 +587,13 @@ static int BackwardReferencesHashChainDistanceOnly(
   CostManager* cost_manager =
   CostManager* cost_manager =
       (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
       (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
   int offset_prev = -1, len_prev = -1;
   int offset_prev = -1, len_prev = -1;
-  float offset_cost = -1.f;
+  int64_t offset_cost = -1;
   int first_offset_is_constant = -1;  // initialized with 'impossible' value
   int first_offset_is_constant = -1;  // initialized with 'impossible' value
   int reach = 0;
   int reach = 0;
 
 
   if (cost_model == NULL || cost_manager == NULL) goto Error;
   if (cost_model == NULL || cost_manager == NULL) goto Error;
 
 
-  cost_model->literal_ = (float*)(cost_model + 1);
+  cost_model->literal_ = (uint32_t*)(cost_model + 1);
   if (use_color_cache) {
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
     if (!cc_init) goto Error;
@@ -608,11 +611,12 @@ static int BackwardReferencesHashChainDistanceOnly(
   // non-processed locations from this point.
   // non-processed locations from this point.
   dist_array[0] = 0;
   dist_array[0] = 0;
   // Add first pixel as literal.
   // Add first pixel as literal.
-  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
-                                0.f, cost_manager->costs_, dist_array);
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, /*idx=*/0,
+                                use_color_cache, /*prev_cost=*/0,
+                                cost_manager->costs_, dist_array);
 
 
   for (i = 1; i < pix_count; ++i) {
   for (i = 1; i < pix_count; ++i) {
-    const float prev_cost = cost_manager->costs_[i - 1];
+    const int64_t prev_cost = cost_manager->costs_[i - 1];
     int offset, len;
     int offset, len;
     VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
     VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
 
 

+ 5 - 9
thirdparty/libwebp/src/enc/backward_references_enc.c

@@ -13,8 +13,6 @@
 #include "src/enc/backward_references_enc.h"
 #include "src/enc/backward_references_enc.h"
 
 
 #include <assert.h>
 #include <assert.h>
-#include <float.h>
-#include <math.h>
 
 
 #include "src/dsp/dsp.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless.h"
@@ -27,8 +25,6 @@
 
 
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 
 
-#define MAX_ENTROPY    (1e30f)
-
 // 1M window (4M bytes) minus 120 special codes for short distances.
 // 1M window (4M bytes) minus 120 special codes for short distances.
 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
 
 
@@ -758,7 +754,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
                                   int* const best_cache_bits) {
                                   int* const best_cache_bits) {
   int i;
   int i;
   const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
   const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
-  float entropy_min = MAX_ENTROPY;
+  uint64_t entropy_min = WEBP_UINT64_MAX;
   int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
   int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
   VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
@@ -843,7 +839,7 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
   }
   }
 
 
   for (i = 0; i <= cache_bits_max; ++i) {
   for (i = 0; i <= cache_bits_max; ++i) {
-    const float entropy = VP8LHistogramEstimateBits(histos[i]);
+    const uint64_t entropy = VP8LHistogramEstimateBits(histos[i]);
     if (i == 0 || entropy < entropy_min) {
     if (i == 0 || entropy < entropy_min) {
       entropy_min = entropy;
       entropy_min = entropy;
       *best_cache_bits = i;
       *best_cache_bits = i;
@@ -920,7 +916,7 @@ static int GetBackwardReferences(int width, int height,
   int i, lz77_type;
   int i, lz77_type;
   // Index 0 is for a color cache, index 1 for no cache (if needed).
   // Index 0 is for a color cache, index 1 for no cache (if needed).
   int lz77_types_best[2] = {0, 0};
   int lz77_types_best[2] = {0, 0};
-  float bit_costs_best[2] = {FLT_MAX, FLT_MAX};
+  uint64_t bit_costs_best[2] = {WEBP_UINT64_MAX, WEBP_UINT64_MAX};
   VP8LHashChain hash_chain_box;
   VP8LHashChain hash_chain_box;
   VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
   VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
   int status = 0;
   int status = 0;
@@ -932,7 +928,7 @@ static int GetBackwardReferences(int width, int height,
   for (lz77_type = 1; lz77_types_to_try;
   for (lz77_type = 1; lz77_types_to_try;
        lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
        lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
     int res = 0;
     int res = 0;
-    float bit_cost = 0.f;
+    uint64_t bit_cost = 0u;
     if ((lz77_types_to_try & lz77_type) == 0) continue;
     if ((lz77_types_to_try & lz77_type) == 0) continue;
     switch (lz77_type) {
     switch (lz77_type) {
       case kLZ77RLE:
       case kLZ77RLE:
@@ -1006,7 +1002,7 @@ static int GetBackwardReferences(int width, int height,
       const VP8LHashChain* const hash_chain_tmp =
       const VP8LHashChain* const hash_chain_tmp =
           (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
           (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
       const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
       const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
-      float bit_cost_trace;
+      uint64_t bit_cost_trace;
       if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
       if (!VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
                                                 hash_chain_tmp, &refs[i],
                                                 hash_chain_tmp, &refs[i],
                                                 refs_tmp)) {
                                                 refs_tmp)) {

+ 0 - 4
thirdparty/libwebp/src/enc/config_enc.c

@@ -55,7 +55,6 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->thread_level = 0;
   config->thread_level = 0;
   config->low_memory = 0;
   config->low_memory = 0;
   config->near_lossless = 100;
   config->near_lossless = 100;
-  config->use_delta_palette = 0;
   config->use_sharp_yuv = 0;
   config->use_sharp_yuv = 0;
 
 
   // TODO(skal): tune.
   // TODO(skal): tune.
@@ -125,9 +124,6 @@ int WebPValidateConfig(const WebPConfig* config) {
   if (config->thread_level < 0 || config->thread_level > 1) return 0;
   if (config->thread_level < 0 || config->thread_level > 1) return 0;
   if (config->low_memory < 0 || config->low_memory > 1) return 0;
   if (config->low_memory < 0 || config->low_memory > 1) return 0;
   if (config->exact < 0 || config->exact > 1) return 0;
   if (config->exact < 0 || config->exact > 1) return 0;
-  if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
-    return 0;
-  }
   if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
   if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
 
 
   return 1;
   return 1;

+ 1 - 1
thirdparty/libwebp/src/enc/cost_enc.c

@@ -19,7 +19,7 @@
 // For each given level, the following table gives the pattern of contexts to
 // For each given level, the following table gives the pattern of contexts to
 // use for coding it (in [][0]) as well as the bit value to use for each
 // use for coding it (in [][0]) as well as the bit value to use for each
 // context (in [][1]).
 // context (in [][1]).
-const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
+static const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
                   {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
                   {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
   {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
   {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
   {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
   {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},

+ 0 - 1
thirdparty/libwebp/src/enc/cost_enc.h

@@ -61,7 +61,6 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
 }
 }
 
 
 // Level cost calculations
 // Level cost calculations
-extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
 void VP8CalculateLevelCosts(VP8EncProba* const proba);
 void VP8CalculateLevelCosts(VP8EncProba* const proba);
 static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
 static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
   return VP8LevelFixedCosts[level]
   return VP8LevelFixedCosts[level]

+ 209 - 178
thirdparty/libwebp/src/enc/histogram_enc.c

@@ -13,8 +13,7 @@
 #include "src/webp/config.h"
 #include "src/webp/config.h"
 #endif
 #endif
 
 
-#include <float.h>
-#include <math.h>
+#include <string.h>
 
 
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/lossless_common.h"
@@ -23,8 +22,6 @@
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"
 #include "src/utils/utils.h"
 
 
-#define MAX_BIT_COST FLT_MAX
-
 // Number of partitions for the three dominant (literal, red and blue) symbol
 // Number of partitions for the three dominant (literal, red and blue) symbol
 // costs.
 // costs.
 #define NUM_PARTITIONS 4
 #define NUM_PARTITIONS 4
@@ -33,10 +30,18 @@
 // Maximum number of histograms allowed in greedy combining algorithm.
 // Maximum number of histograms allowed in greedy combining algorithm.
 #define MAX_HISTO_GREEDY 100
 #define MAX_HISTO_GREEDY 100
 
 
+// Return the size of the histogram for a given cache_bits.
+static int GetHistogramSize(int cache_bits) {
+  const int literal_size = VP8LHistogramNumCodes(cache_bits);
+  const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+  assert(total_size <= (size_t)0x7fffffff);
+  return (int)total_size;
+}
+
 static void HistogramClear(VP8LHistogram* const p) {
 static void HistogramClear(VP8LHistogram* const p) {
   uint32_t* const literal = p->literal_;
   uint32_t* const literal = p->literal_;
   const int cache_bits = p->palette_code_bits_;
   const int cache_bits = p->palette_code_bits_;
-  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const int histo_size = GetHistogramSize(cache_bits);
   memset(p, 0, histo_size);
   memset(p, 0, histo_size);
   p->palette_code_bits_ = cache_bits;
   p->palette_code_bits_ = cache_bits;
   p->literal_ = literal;
   p->literal_ = literal;
@@ -54,20 +59,13 @@ static void HistogramCopy(const VP8LHistogram* const src,
   uint32_t* const dst_literal = dst->literal_;
   uint32_t* const dst_literal = dst->literal_;
   const int dst_cache_bits = dst->palette_code_bits_;
   const int dst_cache_bits = dst->palette_code_bits_;
   const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
   const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
-  const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+  const int histo_size = GetHistogramSize(dst_cache_bits);
   assert(src->palette_code_bits_ == dst_cache_bits);
   assert(src->palette_code_bits_ == dst_cache_bits);
   memcpy(dst, src, histo_size);
   memcpy(dst, src, histo_size);
   dst->literal_ = dst_literal;
   dst->literal_ = dst_literal;
   memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
   memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
 }
 }
 
 
-int VP8LGetHistogramSize(int cache_bits) {
-  const int literal_size = VP8LHistogramNumCodes(cache_bits);
-  const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
-  assert(total_size <= (size_t)0x7fffffff);
-  return (int)total_size;
-}
-
 void VP8LFreeHistogram(VP8LHistogram* const histo) {
 void VP8LFreeHistogram(VP8LHistogram* const histo) {
   WebPSafeFree(histo);
   WebPSafeFree(histo);
 }
 }
@@ -102,17 +100,17 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
     HistogramClear(p);
     HistogramClear(p);
   } else {
   } else {
     p->trivial_symbol_ = 0;
     p->trivial_symbol_ = 0;
-    p->bit_cost_ = 0.;
-    p->literal_cost_ = 0.;
-    p->red_cost_ = 0.;
-    p->blue_cost_ = 0.;
+    p->bit_cost_ = 0;
+    p->literal_cost_ = 0;
+    p->red_cost_ = 0;
+    p->blue_cost_ = 0;
     memset(p->is_used_, 0, sizeof(p->is_used_));
     memset(p->is_used_, 0, sizeof(p->is_used_));
   }
   }
 }
 }
 
 
 VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
 VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
   VP8LHistogram* histo = NULL;
   VP8LHistogram* histo = NULL;
-  const int total_size = VP8LGetHistogramSize(cache_bits);
+  const int total_size = GetHistogramSize(cache_bits);
   uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
   if (memory == NULL) return NULL;
   histo = (VP8LHistogram*)memory;
   histo = (VP8LHistogram*)memory;
@@ -126,7 +124,7 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
 static void HistogramSetResetPointers(VP8LHistogramSet* const set,
 static void HistogramSetResetPointers(VP8LHistogramSet* const set,
                                       int cache_bits) {
                                       int cache_bits) {
   int i;
   int i;
-  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const int histo_size = GetHistogramSize(cache_bits);
   uint8_t* memory = (uint8_t*) (set->histograms);
   uint8_t* memory = (uint8_t*) (set->histograms);
   memory += set->max_size * sizeof(*set->histograms);
   memory += set->max_size * sizeof(*set->histograms);
   for (i = 0; i < set->max_size; ++i) {
   for (i = 0; i < set->max_size; ++i) {
@@ -140,7 +138,7 @@ static void HistogramSetResetPointers(VP8LHistogramSet* const set,
 
 
 // Returns the total size of the VP8LHistogramSet.
 // Returns the total size of the VP8LHistogramSet.
 static size_t HistogramSetTotalSize(int size, int cache_bits) {
 static size_t HistogramSetTotalSize(int size, int cache_bits) {
-  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const int histo_size = GetHistogramSize(cache_bits);
   return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
   return (sizeof(VP8LHistogramSet) + size * (sizeof(VP8LHistogram*) +
           histo_size + WEBP_ALIGN_CST));
           histo_size + WEBP_ALIGN_CST));
 }
 }
@@ -230,8 +228,8 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
 // Entropy-related functions.
 // Entropy-related functions.
 
 
-static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
-  float mix;
+static WEBP_INLINE uint64_t BitsEntropyRefine(const VP8LBitEntropy* entropy) {
+  uint64_t mix;
   if (entropy->nonzeros < 5) {
   if (entropy->nonzeros < 5) {
     if (entropy->nonzeros <= 1) {
     if (entropy->nonzeros <= 1) {
       return 0;
       return 0;
@@ -240,67 +238,72 @@ static WEBP_INLINE float BitsEntropyRefine(const VP8LBitEntropy* entropy) {
     // Let's mix in a bit of entropy to favor good clustering when
     // Let's mix in a bit of entropy to favor good clustering when
     // distributions of these are combined.
     // distributions of these are combined.
     if (entropy->nonzeros == 2) {
     if (entropy->nonzeros == 2) {
-      return 0.99f * entropy->sum + 0.01f * entropy->entropy;
+      return DivRound(99 * ((uint64_t)entropy->sum << LOG_2_PRECISION_BITS) +
+                          entropy->entropy,
+                      100);
     }
     }
     // No matter what the entropy says, we cannot be better than min_limit
     // No matter what the entropy says, we cannot be better than min_limit
     // with Huffman coding. I am mixing a bit of entropy into the
     // with Huffman coding. I am mixing a bit of entropy into the
     // min_limit since it produces much better (~0.5 %) compression results
     // min_limit since it produces much better (~0.5 %) compression results
     // perhaps because of better entropy clustering.
     // perhaps because of better entropy clustering.
     if (entropy->nonzeros == 3) {
     if (entropy->nonzeros == 3) {
-      mix = 0.95f;
+      mix = 950;
     } else {
     } else {
-      mix = 0.7f;  // nonzeros == 4.
+      mix = 700;  // nonzeros == 4.
     }
     }
   } else {
   } else {
-    mix = 0.627f;
+    mix = 627;
   }
   }
 
 
   {
   {
-    float min_limit = 2.f * entropy->sum - entropy->max_val;
-    min_limit = mix * min_limit + (1.f - mix) * entropy->entropy;
+    uint64_t min_limit = (uint64_t)(2 * entropy->sum - entropy->max_val)
+                         << LOG_2_PRECISION_BITS;
+    min_limit =
+        DivRound(mix * min_limit + (1000 - mix) * entropy->entropy, 1000);
     return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
     return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
   }
   }
 }
 }
 
 
-float VP8LBitsEntropy(const uint32_t* const array, int n) {
+uint64_t VP8LBitsEntropy(const uint32_t* const array, int n) {
   VP8LBitEntropy entropy;
   VP8LBitEntropy entropy;
   VP8LBitsEntropyUnrefined(array, n, &entropy);
   VP8LBitsEntropyUnrefined(array, n, &entropy);
 
 
   return BitsEntropyRefine(&entropy);
   return BitsEntropyRefine(&entropy);
 }
 }
 
 
-static float InitialHuffmanCost(void) {
+static uint64_t InitialHuffmanCost(void) {
   // Small bias because Huffman code length is typically not stored in
   // Small bias because Huffman code length is typically not stored in
   // full length.
   // full length.
-  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
-  static const float kSmallBias = 9.1f;
-  return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+  static const uint64_t kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+  // Subtract a bias of 9.1.
+  return (kHuffmanCodeOfHuffmanCodeSize << LOG_2_PRECISION_BITS) -
+         DivRound(91ll << LOG_2_PRECISION_BITS, 10);
 }
 }
 
 
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
-static float FinalHuffmanCost(const VP8LStreaks* const stats) {
-  // The constants in this function are experimental and got rounded from
+static uint64_t FinalHuffmanCost(const VP8LStreaks* const stats) {
+  // The constants in this function are empirical and got rounded from
   // their original values in 1/8 when switched to 1/1024.
   // their original values in 1/8 when switched to 1/1024.
-  float retval = InitialHuffmanCost();
+  uint64_t retval = InitialHuffmanCost();
   // Second coefficient: Many zeros in the histogram are covered efficiently
   // Second coefficient: Many zeros in the histogram are covered efficiently
   // by a run-length encode. Originally 2/8.
   // by a run-length encode. Originally 2/8.
-  retval += stats->counts[0] * 1.5625f + 0.234375f * stats->streaks[0][1];
+  uint32_t retval_extra = stats->counts[0] * 1600 + 240 * stats->streaks[0][1];
   // Second coefficient: Constant values are encoded less efficiently, but still
   // Second coefficient: Constant values are encoded less efficiently, but still
   // RLE'ed. Originally 6/8.
   // RLE'ed. Originally 6/8.
-  retval += stats->counts[1] * 2.578125f + 0.703125f * stats->streaks[1][1];
+  retval_extra += stats->counts[1] * 2640 + 720 * stats->streaks[1][1];
   // 0s are usually encoded more efficiently than non-0s.
   // 0s are usually encoded more efficiently than non-0s.
   // Originally 15/8.
   // Originally 15/8.
-  retval += 1.796875f * stats->streaks[0][0];
+  retval_extra += 1840 * stats->streaks[0][0];
   // Originally 26/8.
   // Originally 26/8.
-  retval += 3.28125f * stats->streaks[1][0];
-  return retval;
+  retval_extra += 3360 * stats->streaks[1][0];
+  return retval + ((uint64_t)retval_extra << (LOG_2_PRECISION_BITS - 10));
 }
 }
 
 
 // Get the symbol entropy for the distribution 'population'.
 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
-static float PopulationCost(const uint32_t* const population, int length,
-                            uint32_t* const trivial_sym,
-                            uint8_t* const is_used) {
+static uint64_t PopulationCost(const uint32_t* const population, int length,
+                               uint32_t* const trivial_sym,
+                               uint8_t* const is_used) {
   VP8LBitEntropy bit_entropy;
   VP8LBitEntropy bit_entropy;
   VP8LStreaks stats;
   VP8LStreaks stats;
   VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
   VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@@ -316,10 +319,11 @@ static float PopulationCost(const uint32_t* const population, int length,
 
 
 // trivial_at_end is 1 if the two histograms only have one element that is
 // trivial_at_end is 1 if the two histograms only have one element that is
 // non-zero: both the zero-th one, or both the last one.
 // non-zero: both the zero-th one, or both the last one.
-static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
-                                            const uint32_t* const Y, int length,
-                                            int is_X_used, int is_Y_used,
-                                            int trivial_at_end) {
+static WEBP_INLINE uint64_t GetCombinedEntropy(const uint32_t* const X,
+                                               const uint32_t* const Y,
+                                               int length, int is_X_used,
+                                               int is_Y_used,
+                                               int trivial_at_end) {
   VP8LStreaks stats;
   VP8LStreaks stats;
   if (trivial_at_end) {
   if (trivial_at_end) {
     // This configuration is due to palettization that transforms an indexed
     // This configuration is due to palettization that transforms an indexed
@@ -357,7 +361,7 @@ static WEBP_INLINE float GetCombinedEntropy(const uint32_t* const X,
 }
 }
 
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
 // Estimates the Entropy + Huffman + other block overhead size cost.
-float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
+uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p) {
   return PopulationCost(p->literal_,
   return PopulationCost(p->literal_,
                         VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
                         VP8LHistogramNumCodes(p->palette_code_bits_), NULL,
                         &p->is_used_[0]) +
                         &p->is_used_[0]) +
@@ -366,27 +370,42 @@ float VP8LHistogramEstimateBits(VP8LHistogram* const p) {
          PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
          PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3]) +
          PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
          PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL,
                         &p->is_used_[4]) +
                         &p->is_used_[4]) +
-         (float)VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
-                              NUM_LENGTH_CODES) +
-         (float)VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
+         ((uint64_t)(VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES,
+                                   NUM_LENGTH_CODES) +
+                     VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES))
+          << LOG_2_PRECISION_BITS);
 }
 }
 
 
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
 // Various histogram combine/cost-eval functions
 // Various histogram combine/cost-eval functions
 
 
-static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
-                                       const VP8LHistogram* const b,
-                                       float cost_threshold, float* cost) {
+// Set a + b in b, saturating at WEBP_INT64_MAX.
+static WEBP_INLINE void SaturateAdd(uint64_t a, int64_t* b) {
+  if (*b < 0 || (int64_t)a <= WEBP_INT64_MAX - *b) {
+    *b += (int64_t)a;
+  } else {
+    *b = WEBP_INT64_MAX;
+  }
+}
+
+// Returns 1 if the cost of the combined histogram is less than the threshold.
+// Otherwise returns 0 and the cost is invalid due to early bail-out.
+WEBP_NODISCARD static int GetCombinedHistogramEntropy(
+    const VP8LHistogram* const a, const VP8LHistogram* const b,
+    int64_t cost_threshold_in, uint64_t* cost) {
   const int palette_code_bits = a->palette_code_bits_;
   const int palette_code_bits = a->palette_code_bits_;
   int trivial_at_end = 0;
   int trivial_at_end = 0;
+  const uint64_t cost_threshold = (uint64_t)cost_threshold_in;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
-  *cost += GetCombinedEntropy(a->literal_, b->literal_,
-                              VP8LHistogramNumCodes(palette_code_bits),
-                              a->is_used_[0], b->is_used_[0], 0);
-  *cost += (float)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
-                                        b->literal_ + NUM_LITERAL_CODES,
-                                        NUM_LENGTH_CODES);
-  if (*cost > cost_threshold) return 0;
+  if (cost_threshold_in <= 0) return 0;
+  *cost = GetCombinedEntropy(a->literal_, b->literal_,
+                             VP8LHistogramNumCodes(palette_code_bits),
+                             a->is_used_[0], b->is_used_[0], 0);
+  *cost += (uint64_t)VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+                                           b->literal_ + NUM_LITERAL_CODES,
+                                           NUM_LENGTH_CODES)
+           << LOG_2_PRECISION_BITS;
+  if (*cost >= cost_threshold) return 0;
 
 
   if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
   if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
       a->trivial_symbol_ == b->trivial_symbol_) {
       a->trivial_symbol_ == b->trivial_symbol_) {
@@ -401,27 +420,24 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
     }
     }
   }
   }
 
 
-  *cost +=
-      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
-                         b->is_used_[1], trivial_at_end);
-  if (*cost > cost_threshold) return 0;
+  *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES,
+                              a->is_used_[1], b->is_used_[1], trivial_at_end);
+  if (*cost >= cost_threshold) return 0;
 
 
-  *cost +=
-      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
-                         b->is_used_[2], trivial_at_end);
-  if (*cost > cost_threshold) return 0;
+  *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES,
+                              a->is_used_[2], b->is_used_[2], trivial_at_end);
+  if (*cost >= cost_threshold) return 0;
 
 
-  *cost +=
-      GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
-                         a->is_used_[3], b->is_used_[3], trivial_at_end);
-  if (*cost > cost_threshold) return 0;
+  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+                              a->is_used_[3], b->is_used_[3], trivial_at_end);
+  if (*cost >= cost_threshold) return 0;
 
 
-  *cost +=
-      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
-                         a->is_used_[4], b->is_used_[4], 0);
-  *cost += (float)VP8LExtraCostCombined(a->distance_, b->distance_,
-                                        NUM_DISTANCE_CODES);
-  if (*cost > cost_threshold) return 0;
+  *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
+                              a->is_used_[4], b->is_used_[4], 0);
+  *cost += (uint64_t)VP8LExtraCostCombined(a->distance_, b->distance_,
+                                           NUM_DISTANCE_CODES)
+           << LOG_2_PRECISION_BITS;
+  if (*cost >= cost_threshold) return 0;
 
 
   return 1;
   return 1;
 }
 }
@@ -441,33 +457,39 @@ static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
 // Since the previous score passed is 'cost_threshold', we only need to compare
 // Since the previous score passed is 'cost_threshold', we only need to compare
 // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
 // the partial cost against 'cost_threshold + C(a) + C(b)' to possibly bail-out
 // early.
 // early.
-static float HistogramAddEval(const VP8LHistogram* const a,
-                              const VP8LHistogram* const b,
-                              VP8LHistogram* const out, float cost_threshold) {
-  float cost = 0;
-  const float sum_cost = a->bit_cost_ + b->bit_cost_;
-  cost_threshold += sum_cost;
-
-  if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
-    HistogramAdd(a, b, out);
-    out->bit_cost_ = cost;
-    out->palette_code_bits_ = a->palette_code_bits_;
-  }
-
-  return cost - sum_cost;
+// Returns 1 if the cost is less than the threshold.
+// Otherwise returns 0 and the cost is invalid due to early bail-out.
+WEBP_NODISCARD static int HistogramAddEval(const VP8LHistogram* const a,
+                                           const VP8LHistogram* const b,
+                                           VP8LHistogram* const out,
+                                           int64_t cost_threshold) {
+  uint64_t cost;
+  const uint64_t sum_cost = a->bit_cost_ + b->bit_cost_;
+  SaturateAdd(sum_cost, &cost_threshold);
+  if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
+
+  HistogramAdd(a, b, out);
+  out->bit_cost_ = cost;
+  out->palette_code_bits_ = a->palette_code_bits_;
+  return 1;
 }
 }
 
 
 // Same as HistogramAddEval(), except that the resulting histogram
 // Same as HistogramAddEval(), except that the resulting histogram
 // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
 // is not stored. Only the cost C(a+b) - C(a) is evaluated. We omit
 // the term C(b) which is constant over all the evaluations.
 // the term C(b) which is constant over all the evaluations.
-static float HistogramAddThresh(const VP8LHistogram* const a,
-                                const VP8LHistogram* const b,
-                                float cost_threshold) {
-  float cost;
+// Returns 1 if the cost is less than the threshold.
+// Otherwise returns 0 and the cost is invalid due to early bail-out.
+WEBP_NODISCARD static int HistogramAddThresh(const VP8LHistogram* const a,
+                                             const VP8LHistogram* const b,
+                                             int64_t cost_threshold,
+                                             int64_t* cost_out) {
+  uint64_t cost;
   assert(a != NULL && b != NULL);
   assert(a != NULL && b != NULL);
-  cost = -a->bit_cost_;
-  GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
-  return cost;
+  SaturateAdd(a->bit_cost_, &cost_threshold);
+  if (!GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) return 0;
+
+  *cost_out = (int64_t)cost - (int64_t)a->bit_cost_;
+  return 1;
 }
 }
 
 
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
@@ -475,21 +497,21 @@ static float HistogramAddThresh(const VP8LHistogram* const a,
 // The structure to keep track of cost range for the three dominant entropy
 // The structure to keep track of cost range for the three dominant entropy
 // symbols.
 // symbols.
 typedef struct {
 typedef struct {
-  float literal_max_;
-  float literal_min_;
-  float red_max_;
-  float red_min_;
-  float blue_max_;
-  float blue_min_;
+  uint64_t literal_max_;
+  uint64_t literal_min_;
+  uint64_t red_max_;
+  uint64_t red_min_;
+  uint64_t blue_max_;
+  uint64_t blue_min_;
 } DominantCostRange;
 } DominantCostRange;
 
 
 static void DominantCostRangeInit(DominantCostRange* const c) {
 static void DominantCostRangeInit(DominantCostRange* const c) {
-  c->literal_max_ = 0.;
-  c->literal_min_ = MAX_BIT_COST;
-  c->red_max_ = 0.;
-  c->red_min_ = MAX_BIT_COST;
-  c->blue_max_ = 0.;
-  c->blue_min_ = MAX_BIT_COST;
+  c->literal_max_ = 0;
+  c->literal_min_ = WEBP_UINT64_MAX;
+  c->red_max_ = 0;
+  c->red_min_ = WEBP_UINT64_MAX;
+  c->blue_max_ = 0;
+  c->blue_min_ = WEBP_UINT64_MAX;
 }
 }
 
 
 static void UpdateDominantCostRange(
 static void UpdateDominantCostRange(
@@ -504,15 +526,18 @@ static void UpdateDominantCostRange(
 
 
 static void UpdateHistogramCost(VP8LHistogram* const h) {
 static void UpdateHistogramCost(VP8LHistogram* const h) {
   uint32_t alpha_sym, red_sym, blue_sym;
   uint32_t alpha_sym, red_sym, blue_sym;
-  const float alpha_cost =
+  const uint64_t alpha_cost =
       PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
       PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym, &h->is_used_[3]);
-  const float distance_cost =
+  const uint64_t distance_cost =
       PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
       PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
-      (float)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+      ((uint64_t)VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES)
+       << LOG_2_PRECISION_BITS);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
   h->literal_cost_ =
   h->literal_cost_ =
       PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
       PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
-      (float)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+      ((uint64_t)VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
+                               NUM_LENGTH_CODES)
+       << LOG_2_PRECISION_BITS);
   h->red_cost_ =
   h->red_cost_ =
       PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
       PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
   h->blue_cost_ =
   h->blue_cost_ =
@@ -527,10 +552,10 @@ static void UpdateHistogramCost(VP8LHistogram* const h) {
   }
   }
 }
 }
 
 
-static int GetBinIdForEntropy(float min, float max, float val) {
-  const float range = max - min;
-  if (range > 0.) {
-    const float delta = val - min;
+static int GetBinIdForEntropy(uint64_t min, uint64_t max, uint64_t val) {
+  const uint64_t range = max - min;
+  if (range > 0) {
+    const uint64_t delta = val - min;
     return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
     return (int)((NUM_PARTITIONS - 1e-6) * delta / range);
   } else {
   } else {
     return 0;
     return 0;
@@ -576,11 +601,11 @@ static void HistogramBuild(
 }
 }
 
 
 // Copies the histograms and computes its bit_cost.
 // Copies the histograms and computes its bit_cost.
-static const uint16_t kInvalidHistogramSymbol = (uint16_t)(-1);
+static const uint32_t kInvalidHistogramSymbol = (uint32_t)(-1);
 static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
 static void HistogramCopyAndAnalyze(VP8LHistogramSet* const orig_histo,
                                     VP8LHistogramSet* const image_histo,
                                     VP8LHistogramSet* const image_histo,
                                     int* const num_used,
                                     int* const num_used,
-                                    uint16_t* const histogram_symbols) {
+                                    uint32_t* const histogram_symbols) {
   int i, cluster_id;
   int i, cluster_id;
   int num_used_orig = *num_used;
   int num_used_orig = *num_used;
   VP8LHistogram** const orig_histograms = orig_histo->histograms;
   VP8LHistogram** const orig_histograms = orig_histo->histograms;
@@ -639,11 +664,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
 
 
 // Merges some histograms with same bin_id together if it's advantageous.
 // Merges some histograms with same bin_id together if it's advantageous.
 // Sets the remaining histograms to NULL.
 // Sets the remaining histograms to NULL.
+// 'combine_cost_factor' has to be divided by 100.
 static void HistogramCombineEntropyBin(
 static void HistogramCombineEntropyBin(
     VP8LHistogramSet* const image_histo, int* num_used,
     VP8LHistogramSet* const image_histo, int* num_used,
-    const uint16_t* const clusters, uint16_t* const cluster_mappings,
+    const uint32_t* const clusters, uint16_t* const cluster_mappings,
     VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
     VP8LHistogram* cur_combo, const uint16_t* const bin_map, int num_bins,
-    float combine_cost_factor, int low_effort) {
+    int32_t combine_cost_factor, int low_effort) {
   VP8LHistogram** const histograms = image_histo->histograms;
   VP8LHistogram** const histograms = image_histo->histograms;
   int idx;
   int idx;
   struct {
   struct {
@@ -673,11 +699,11 @@ static void HistogramCombineEntropyBin(
       cluster_mappings[clusters[idx]] = clusters[first];
       cluster_mappings[clusters[idx]] = clusters[first];
     } else {
     } else {
       // try to merge #idx into #first (both share the same bin_id)
       // try to merge #idx into #first (both share the same bin_id)
-      const float bit_cost = histograms[idx]->bit_cost_;
-      const float bit_cost_thresh = -bit_cost * combine_cost_factor;
-      const float curr_cost_diff = HistogramAddEval(
-          histograms[first], histograms[idx], cur_combo, bit_cost_thresh);
-      if (curr_cost_diff < bit_cost_thresh) {
+      const uint64_t bit_cost = histograms[idx]->bit_cost_;
+      const int64_t bit_cost_thresh =
+          -DivRound((int64_t)bit_cost * combine_cost_factor, 100);
+      if (HistogramAddEval(histograms[first], histograms[idx], cur_combo,
+                           bit_cost_thresh)) {
         // Try to merge two histograms only if the combo is a trivial one or
         // Try to merge two histograms only if the combo is a trivial one or
         // the two candidate histograms are already non-trivial.
         // the two candidate histograms are already non-trivial.
         // For some images, 'try_combine' turns out to be false for a lot of
         // For some images, 'try_combine' turns out to be false for a lot of
@@ -724,8 +750,8 @@ static uint32_t MyRand(uint32_t* const seed) {
 typedef struct {
 typedef struct {
   int idx1;
   int idx1;
   int idx2;
   int idx2;
-  float cost_diff;
-  float cost_combo;
+  int64_t cost_diff;
+  uint64_t cost_combo;
 } HistogramPair;
 } HistogramPair;
 
 
 typedef struct {
 typedef struct {
@@ -765,7 +791,7 @@ static void HistoQueuePopPair(HistoQueue* const histo_queue,
 // Check whether a pair in the queue should be updated as head or not.
 // Check whether a pair in the queue should be updated as head or not.
 static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
 static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
                                  HistogramPair* const pair) {
                                  HistogramPair* const pair) {
-  assert(pair->cost_diff < 0.);
+  assert(pair->cost_diff < 0);
   assert(pair >= histo_queue->queue &&
   assert(pair >= histo_queue->queue &&
          pair < (histo_queue->queue + histo_queue->size));
          pair < (histo_queue->queue + histo_queue->size));
   assert(histo_queue->size > 0);
   assert(histo_queue->size > 0);
@@ -778,29 +804,35 @@ static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
 }
 }
 
 
 // Update the cost diff and combo of a pair of histograms. This needs to be
 // Update the cost diff and combo of a pair of histograms. This needs to be
-// called when the the histograms have been merged with a third one.
-static void HistoQueueUpdatePair(const VP8LHistogram* const h1,
-                                 const VP8LHistogram* const h2, float threshold,
-                                 HistogramPair* const pair) {
-  const float sum_cost = h1->bit_cost_ + h2->bit_cost_;
-  pair->cost_combo = 0.;
-  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair->cost_combo);
-  pair->cost_diff = pair->cost_combo - sum_cost;
+// called when the histograms have been merged with a third one.
+// Returns 1 if the cost diff is less than the threshold.
+// Otherwise returns 0 and the cost is invalid due to early bail-out.
+WEBP_NODISCARD static int HistoQueueUpdatePair(const VP8LHistogram* const h1,
+                                               const VP8LHistogram* const h2,
+                                               int64_t cost_threshold,
+                                               HistogramPair* const pair) {
+  const int64_t sum_cost = h1->bit_cost_ + h2->bit_cost_;
+  SaturateAdd(sum_cost, &cost_threshold);
+  if (!GetCombinedHistogramEntropy(h1, h2, cost_threshold, &pair->cost_combo)) {
+    return 0;
+  }
+  pair->cost_diff = (int64_t)pair->cost_combo - sum_cost;
+  return 1;
 }
 }
 
 
 // Create a pair from indices "idx1" and "idx2" provided its cost
 // Create a pair from indices "idx1" and "idx2" provided its cost
 // is inferior to "threshold", a negative entropy.
 // is inferior to "threshold", a negative entropy.
-// It returns the cost of the pair, or 0. if it superior to threshold.
-static float HistoQueuePush(HistoQueue* const histo_queue,
-                            VP8LHistogram** const histograms, int idx1,
-                            int idx2, float threshold) {
+// It returns the cost of the pair, or 0 if it superior to threshold.
+static int64_t HistoQueuePush(HistoQueue* const histo_queue,
+                              VP8LHistogram** const histograms, int idx1,
+                              int idx2, int64_t threshold) {
   const VP8LHistogram* h1;
   const VP8LHistogram* h1;
   const VP8LHistogram* h2;
   const VP8LHistogram* h2;
   HistogramPair pair;
   HistogramPair pair;
 
 
   // Stop here if the queue is full.
   // Stop here if the queue is full.
-  if (histo_queue->size == histo_queue->max_size) return 0.;
-  assert(threshold <= 0.);
+  if (histo_queue->size == histo_queue->max_size) return 0;
+  assert(threshold <= 0);
   if (idx1 > idx2) {
   if (idx1 > idx2) {
     const int tmp = idx2;
     const int tmp = idx2;
     idx2 = idx1;
     idx2 = idx1;
@@ -811,10 +843,8 @@ static float HistoQueuePush(HistoQueue* const histo_queue,
   h1 = histograms[idx1];
   h1 = histograms[idx1];
   h2 = histograms[idx2];
   h2 = histograms[idx2];
 
 
-  HistoQueueUpdatePair(h1, h2, threshold, &pair);
-
   // Do not even consider the pair if it does not improve the entropy.
   // Do not even consider the pair if it does not improve the entropy.
-  if (pair.cost_diff >= threshold) return 0.;
+  if (!HistoQueueUpdatePair(h1, h2, threshold, &pair)) return 0;
 
 
   histo_queue->queue[histo_queue->size++] = pair;
   histo_queue->queue[histo_queue->size++] = pair;
   HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
   HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
@@ -851,7 +881,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
     for (j = i + 1; j < image_histo_size; ++j) {
     for (j = i + 1; j < image_histo_size; ++j) {
       // Initialize queue.
       // Initialize queue.
       if (image_histo->histograms[j] == NULL) continue;
       if (image_histo->histograms[j] == NULL) continue;
-      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
+      HistoQueuePush(&histo_queue, histograms, i, j, 0);
     }
     }
   }
   }
 
 
@@ -879,7 +909,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
     // Push new pairs formed with combined histogram to the queue.
     // Push new pairs formed with combined histogram to the queue.
     for (i = 0; i < image_histo->size; ++i) {
     for (i = 0; i < image_histo->size; ++i) {
       if (i == idx1 || image_histo->histograms[i] == NULL) continue;
       if (i == idx1 || image_histo->histograms[i] == NULL) continue;
-      HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0.);
+      HistoQueuePush(&histo_queue, image_histo->histograms, idx1, i, 0);
     }
     }
   }
   }
 
 
@@ -937,8 +967,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
            ++tries_with_no_success < num_tries_no_success;
            ++tries_with_no_success < num_tries_no_success;
        ++iter) {
        ++iter) {
     int* mapping_index;
     int* mapping_index;
-    float best_cost =
-        (histo_queue.size == 0) ? 0.f : histo_queue.queue[0].cost_diff;
+    int64_t best_cost =
+        (histo_queue.size == 0) ? 0 : histo_queue.queue[0].cost_diff;
     int best_idx1 = -1, best_idx2 = 1;
     int best_idx1 = -1, best_idx2 = 1;
     const uint32_t rand_range = (*num_used - 1) * (*num_used);
     const uint32_t rand_range = (*num_used - 1) * (*num_used);
     // (*num_used) / 2 was chosen empirically. Less means faster but worse
     // (*num_used) / 2 was chosen empirically. Less means faster but worse
@@ -947,7 +977,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
 
 
     // Pick random samples.
     // Pick random samples.
     for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
     for (j = 0; *num_used >= 2 && j < num_tries; ++j) {
-      float curr_cost;
+      int64_t curr_cost;
       // Choose two different histograms at random and try to combine them.
       // Choose two different histograms at random and try to combine them.
       const uint32_t tmp = MyRand(&seed) % rand_range;
       const uint32_t tmp = MyRand(&seed) % rand_range;
       uint32_t idx1 = tmp / (*num_used - 1);
       uint32_t idx1 = tmp / (*num_used - 1);
@@ -1012,8 +1042,8 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
       }
       }
       if (do_eval) {
       if (do_eval) {
         // Re-evaluate the cost of an updated pair.
         // Re-evaluate the cost of an updated pair.
-        HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0., p);
-        if (p->cost_diff >= 0.) {
+        if (!HistoQueueUpdatePair(histograms[p->idx1], histograms[p->idx2], 0,
+                                  p)) {
           HistoQueuePopPair(&histo_queue, p);
           HistoQueuePopPair(&histo_queue, p);
           continue;
           continue;
         }
         }
@@ -1040,7 +1070,7 @@ static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
 // Note: we assume that out[]->bit_cost_ is already up-to-date.
 // Note: we assume that out[]->bit_cost_ is already up-to-date.
 static void HistogramRemap(const VP8LHistogramSet* const in,
 static void HistogramRemap(const VP8LHistogramSet* const in,
                            VP8LHistogramSet* const out,
                            VP8LHistogramSet* const out,
-                           uint16_t* const symbols) {
+                           uint32_t* const symbols) {
   int i;
   int i;
   VP8LHistogram** const in_histo = in->histograms;
   VP8LHistogram** const in_histo = in->histograms;
   VP8LHistogram** const out_histo = out->histograms;
   VP8LHistogram** const out_histo = out->histograms;
@@ -1049,7 +1079,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   if (out_size > 1) {
   if (out_size > 1) {
     for (i = 0; i < in_size; ++i) {
     for (i = 0; i < in_size; ++i) {
       int best_out = 0;
       int best_out = 0;
-      float best_bits = MAX_BIT_COST;
+      int64_t best_bits = WEBP_INT64_MAX;
       int k;
       int k;
       if (in_histo[i] == NULL) {
       if (in_histo[i] == NULL) {
         // Arbitrarily set to the previous value if unused to help future LZ77.
         // Arbitrarily set to the previous value if unused to help future LZ77.
@@ -1057,9 +1087,9 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
         continue;
         continue;
       }
       }
       for (k = 0; k < out_size; ++k) {
       for (k = 0; k < out_size; ++k) {
-        float cur_bits;
-        cur_bits = HistogramAddThresh(out_histo[k], in_histo[i], best_bits);
-        if (k == 0 || cur_bits < best_bits) {
+        int64_t cur_bits;
+        if (HistogramAddThresh(out_histo[k], in_histo[i], best_bits,
+                               &cur_bits)) {
           best_bits = cur_bits;
           best_bits = cur_bits;
           best_out = k;
           best_out = k;
         }
         }
@@ -1085,13 +1115,13 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   }
   }
 }
 }
 
 
-static float GetCombineCostFactor(int histo_size, int quality) {
-  float combine_cost_factor = 0.16f;
+static int32_t GetCombineCostFactor(int histo_size, int quality) {
+  int32_t combine_cost_factor = 16;
   if (quality < 90) {
   if (quality < 90) {
-    if (histo_size > 256) combine_cost_factor /= 2.f;
-    if (histo_size > 512) combine_cost_factor /= 2.f;
-    if (histo_size > 1024) combine_cost_factor /= 2.f;
-    if (quality <= 50) combine_cost_factor /= 2.f;
+    if (histo_size > 256) combine_cost_factor /= 2;
+    if (histo_size > 512) combine_cost_factor /= 2;
+    if (histo_size > 1024) combine_cost_factor /= 2;
+    if (quality <= 50) combine_cost_factor /= 2;
   }
   }
   return combine_cost_factor;
   return combine_cost_factor;
 }
 }
@@ -1101,10 +1131,10 @@ static float GetCombineCostFactor(int histo_size, int quality) {
 // assign the smallest possible clusters values.
 // assign the smallest possible clusters values.
 static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
 static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
                                      uint16_t* const cluster_mappings,
                                      uint16_t* const cluster_mappings,
-                                     int num_clusters,
+                                     uint32_t num_clusters,
                                      uint16_t* const cluster_mappings_tmp,
                                      uint16_t* const cluster_mappings_tmp,
-                                     uint16_t* const symbols) {
-  int i, cluster_max;
+                                     uint32_t* const symbols) {
+  uint32_t i, cluster_max;
   int do_continue = 1;
   int do_continue = 1;
   // First, assign the lowest cluster to each pixel.
   // First, assign the lowest cluster to each pixel.
   while (do_continue) {
   while (do_continue) {
@@ -1128,7 +1158,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
          set->max_size * sizeof(*cluster_mappings_tmp));
          set->max_size * sizeof(*cluster_mappings_tmp));
   assert(cluster_mappings[0] == 0);
   assert(cluster_mappings[0] == 0);
   // Re-map the ids.
   // Re-map the ids.
-  for (i = 0; i < set->max_size; ++i) {
+  for (i = 0; i < (uint32_t)set->max_size; ++i) {
     int cluster;
     int cluster;
     if (symbols[i] == kInvalidHistogramSymbol) continue;
     if (symbols[i] == kInvalidHistogramSymbol) continue;
     cluster = cluster_mappings[symbols[i]];
     cluster = cluster_mappings[symbols[i]];
@@ -1142,7 +1172,7 @@ static void OptimizeHistogramSymbols(const VP8LHistogramSet* const set,
 
 
   // Make sure all cluster values are used.
   // Make sure all cluster values are used.
   cluster_max = 0;
   cluster_max = 0;
-  for (i = 0; i < set->max_size; ++i) {
+  for (i = 0; i < (uint32_t)set->max_size; ++i) {
     if (symbols[i] == kInvalidHistogramSymbol) continue;
     if (symbols[i] == kInvalidHistogramSymbol) continue;
     if (symbols[i] <= cluster_max) continue;
     if (symbols[i] <= cluster_max) continue;
     ++cluster_max;
     ++cluster_max;
@@ -1165,7 +1195,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int low_effort, int histogram_bits, int cache_bits,
                              int low_effort, int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols,
+                             uint32_t* const histogram_symbols,
                              const WebPPicture* const pic, int percent_range,
                              const WebPPicture* const pic, int percent_range,
                              int* const percent) {
                              int* const percent) {
   const int histo_xsize =
   const int histo_xsize =
@@ -1181,7 +1211,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
   const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
   int entropy_combine;
   int entropy_combine;
   uint16_t* const map_tmp =
   uint16_t* const map_tmp =
-      WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
+      (uint16_t*)WebPSafeMalloc(2 * image_histo_raw_size, sizeof(*map_tmp));
   uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
   uint16_t* const cluster_mappings = map_tmp + image_histo_raw_size;
   int num_used = image_histo_raw_size;
   int num_used = image_histo_raw_size;
   if (orig_histo == NULL || map_tmp == NULL) {
   if (orig_histo == NULL || map_tmp == NULL) {
@@ -1201,7 +1231,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
 
   if (entropy_combine) {
   if (entropy_combine) {
     uint16_t* const bin_map = map_tmp;
     uint16_t* const bin_map = map_tmp;
-    const float combine_cost_factor =
+    const int32_t combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
         GetCombineCostFactor(image_histo_raw_size, quality);
     const uint32_t num_clusters = num_used;
     const uint32_t num_clusters = num_used;
 
 
@@ -1217,9 +1247,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Don't combine the histograms using stochastic and greedy heuristics for
   // Don't combine the histograms using stochastic and greedy heuristics for
   // low-effort compression mode.
   // low-effort compression mode.
   if (!low_effort || !entropy_combine) {
   if (!low_effort || !entropy_combine) {
-    const float x = quality / 100.f;
     // cubic ramp between 1 and MAX_HISTO_GREEDY:
     // cubic ramp between 1 and MAX_HISTO_GREEDY:
-    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
+    const int threshold_size =
+        (int)(1 + DivRound(quality * quality * quality * (MAX_HISTO_GREEDY - 1),
+                           100 * 100 * 100));
     int do_greedy;
     int do_greedy;
     if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
     if (!HistogramCombineStochastic(image_histo, &num_used, threshold_size,
                                     &do_greedy)) {
                                     &do_greedy)) {

+ 7 - 10
thirdparty/libwebp/src/enc/histogram_enc.h

@@ -40,10 +40,10 @@ typedef struct {
   int palette_code_bits_;
   int palette_code_bits_;
   uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
   uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
                              // literal symbols are single valued.
                              // literal symbols are single valued.
-  float bit_cost_;           // cached value of bit cost.
-  float literal_cost_;       // Cached values of dominant entropy costs:
-  float red_cost_;           // literal, red & blue.
-  float blue_cost_;
+  uint64_t bit_cost_;        // cached value of bit cost.
+  uint64_t literal_cost_;    // Cached values of dominant entropy costs:
+  uint64_t red_cost_;        // literal, red & blue.
+  uint64_t blue_cost_;
   uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
   uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
 } VP8LHistogram;
 } VP8LHistogram;
 
 
@@ -64,9 +64,6 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
                          const VP8LBackwardRefs* const refs,
                          const VP8LBackwardRefs* const refs,
                          int palette_code_bits);
                          int palette_code_bits);
 
 
-// Return the size of the histogram for a given cache_bits.
-int VP8LGetHistogramSize(int cache_bits);
-
 // Set the palette_code_bits and reset the stats.
 // Set the palette_code_bits and reset the stats.
 // If init_arrays is true, the arrays are also filled with 0's.
 // If init_arrays is true, the arrays are also filled with 0's.
 void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
 void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits,
@@ -112,16 +109,16 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int low_effort, int histogram_bits, int cache_bits,
                              int low_effort, int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogramSet* const image_histo,
                              VP8LHistogram* const tmp_histo,
                              VP8LHistogram* const tmp_histo,
-                             uint16_t* const histogram_symbols,
+                             uint32_t* const histogram_symbols,
                              const WebPPicture* const pic, int percent_range,
                              const WebPPicture* const pic, int percent_range,
                              int* const percent);
                              int* const percent);
 
 
 // Returns the entropy for the symbols in the input array.
 // Returns the entropy for the symbols in the input array.
-float VP8LBitsEntropy(const uint32_t* const array, int n);
+uint64_t VP8LBitsEntropy(const uint32_t* const array, int n);
 
 
 // Estimate how many bits the combined entropy of literals and distance
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
 // approximately maps to.
-float VP8LHistogramEstimateBits(VP8LHistogram* const p);
+uint64_t VP8LHistogramEstimateBits(VP8LHistogram* const p);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 12 - 1
thirdparty/libwebp/src/enc/iterator_enc.c

@@ -13,6 +13,7 @@
 
 
 #include <string.h>
 #include <string.h>
 
 
+#include "src/dsp/cpu.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8i_enc.h"
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
@@ -54,7 +55,8 @@ void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
   InitLeft(it);
   InitLeft(it);
 }
 }
 
 
-void VP8IteratorReset(VP8EncIterator* const it) {
+// restart a scan
+static void VP8IteratorReset(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   VP8Encoder* const enc = it->enc_;
   VP8IteratorSetRow(it, 0);
   VP8IteratorSetRow(it, 0);
   VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
   VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
@@ -424,6 +426,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
       it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
       it->i4_boundary_[17 + i] = it->i4_boundary_[17 + 15];
     }
     }
   }
   }
+#if WEBP_AARCH64 && BPS == 32 && defined(WEBP_MSAN)
+  // Intra4Preds_NEON() reads 3 uninitialized bytes from i4_boundary_ when top
+  // is positioned at offset 29 (VP8TopLeftI4[3]). The values are not used
+  // meaningfully, but due to limitations in MemorySanitizer related to
+  // modeling of tbl instructions, a warning will be issued. This can be
+  // removed if MSan is updated to support the instructions. See
+  // https://issues.webmproject.org/372109644.
+  memset(it->i4_boundary_ + sizeof(it->i4_boundary_) - 3, 0xaa, 3);
+#endif
   VP8IteratorNzToBytes(it);  // import the non-zero context
   VP8IteratorNzToBytes(it);  // import the non-zero context
 }
 }
 
 

+ 475 - 156
thirdparty/libwebp/src/enc/predictor_enc.c

@@ -14,53 +14,75 @@
 //          Urvang Joshi ([email protected])
 //          Urvang Joshi ([email protected])
 //          Vincent Rabaud ([email protected])
 //          Vincent Rabaud ([email protected])
 
 
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
 #include "src/dsp/lossless_common.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/enc/vp8li_enc.h"
 #include "src/enc/vp8li_enc.h"
+#include "src/utils/utils.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 
-#define MAX_DIFF_COST (1e30f)
-
-static const float kSpatialPredictorBias = 15.f;
+#define HISTO_SIZE (4 * 256)
+static const int64_t kSpatialPredictorBias = 15ll << LOG_2_PRECISION_BITS;
 static const int kPredLowEffort = 11;
 static const int kPredLowEffort = 11;
 static const uint32_t kMaskAlpha = 0xff000000;
 static const uint32_t kMaskAlpha = 0xff000000;
+static const int kNumPredModes = 14;
 
 
 // Mostly used to reduce code size + readability
 // Mostly used to reduce code size + readability
 static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
 static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 // Methods to calculate Entropy (Shannon).
 // Methods to calculate Entropy (Shannon).
 
 
-static float PredictionCostSpatial(const int counts[256], int weight_0,
-                                   float exp_val) {
+// Compute a bias for prediction entropy using a global heuristic to favor
+// values closer to 0. Hence the final negative sign.
+// 'exp_val' has a scaling factor of 1/100.
+static int64_t PredictionCostBias(const uint32_t counts[256], uint64_t weight_0,
+                                  uint64_t exp_val) {
   const int significant_symbols = 256 >> 4;
   const int significant_symbols = 256 >> 4;
-  const float exp_decay_factor = 0.6f;
-  float bits = (float)weight_0 * counts[0];
+  const uint64_t exp_decay_factor = 6;  // has a scaling factor of 1/10
+  uint64_t bits = (weight_0 * counts[0]) << LOG_2_PRECISION_BITS;
   int i;
   int i;
+  exp_val <<= LOG_2_PRECISION_BITS;
   for (i = 1; i < significant_symbols; ++i) {
   for (i = 1; i < significant_symbols; ++i) {
-    bits += exp_val * (counts[i] + counts[256 - i]);
-    exp_val *= exp_decay_factor;
+    bits += DivRound(exp_val * (counts[i] + counts[256 - i]), 100);
+    exp_val = DivRound(exp_decay_factor * exp_val, 10);
   }
   }
-  return (float)(-0.1 * bits);
+  return -DivRound((int64_t)bits, 10);
 }
 }
 
 
-static float PredictionCostSpatialHistogram(const int accumulated[4][256],
-                                            const int tile[4][256]) {
+static int64_t PredictionCostSpatialHistogram(
+    const uint32_t accumulated[HISTO_SIZE], const uint32_t tile[HISTO_SIZE],
+    int mode, int left_mode, int above_mode) {
   int i;
   int i;
-  float retval = 0.f;
+  int64_t retval = 0;
   for (i = 0; i < 4; ++i) {
   for (i = 0; i < 4; ++i) {
-    const float kExpValue = 0.94f;
-    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
-    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+    const uint64_t kExpValue = 94;
+    retval += PredictionCostBias(&tile[i * 256], 1, kExpValue);
+    // Compute the new cost if 'tile' is added to 'accumulate' but also add the
+    // cost of the current histogram to guide the spatial predictor selection.
+    // Basically, favor low entropy, locally and globally.
+    retval += (int64_t)VP8LCombinedShannonEntropy(&tile[i * 256],
+                                                  &accumulated[i * 256]);
   }
   }
-  return (float)retval;
+  // Favor keeping the areas locally similar.
+  if (mode == left_mode) retval -= kSpatialPredictorBias;
+  if (mode == above_mode) retval -= kSpatialPredictorBias;
+  return retval;
 }
 }
 
 
-static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
-  ++histo_argb[0][argb >> 24];
-  ++histo_argb[1][(argb >> 16) & 0xff];
-  ++histo_argb[2][(argb >> 8) & 0xff];
-  ++histo_argb[3][argb & 0xff];
+static WEBP_INLINE void UpdateHisto(uint32_t histo_argb[HISTO_SIZE],
+                                    uint32_t argb) {
+  ++histo_argb[0 * 256 + (argb >> 24)];
+  ++histo_argb[1 * 256 + ((argb >> 16) & 0xff)];
+  ++histo_argb[2 * 256 + ((argb >> 8) & 0xff)];
+  ++histo_argb[3 * 256 + (argb & 0xff)];
 }
 }
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
@@ -91,8 +113,6 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
 }
 }
 
 
 #if (WEBP_NEAR_LOSSLESS == 1)
 #if (WEBP_NEAR_LOSSLESS == 1)
-static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
-
 static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
 static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
   const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
   const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
   const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
   const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
@@ -291,23 +311,80 @@ static WEBP_INLINE void GetResidual(
   }
   }
 }
 }
 
 
-// Returns best predictor and updates the accumulated histogram.
+// Accessors to residual histograms.
+static WEBP_INLINE uint32_t* GetHistoArgb(uint32_t* const all_histos,
+                                          int subsampling_index, int mode) {
+  return &all_histos[(subsampling_index * kNumPredModes + mode) * HISTO_SIZE];
+}
+
+static WEBP_INLINE const uint32_t* GetHistoArgbConst(
+    const uint32_t* const all_histos, int subsampling_index, int mode) {
+  return &all_histos[subsampling_index * kNumPredModes * HISTO_SIZE +
+                     mode * HISTO_SIZE];
+}
+
+// Accessors to accumulated residual histogram.
+static WEBP_INLINE uint32_t* GetAccumulatedHisto(uint32_t* all_accumulated,
+                                                 int subsampling_index) {
+  return &all_accumulated[subsampling_index * HISTO_SIZE];
+}
+
+// Find and store the best predictor for a tile at subsampling
+// 'subsampling_index'.
+static void GetBestPredictorForTile(const uint32_t* const all_argb,
+                                    int subsampling_index, int tile_x,
+                                    int tile_y, int tiles_per_row,
+                                    uint32_t* all_accumulated_argb,
+                                    uint32_t** const all_modes,
+                                    uint32_t* const all_pred_histos) {
+  uint32_t* const accumulated_argb =
+      GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
+  uint32_t* const modes = all_modes[subsampling_index];
+  uint32_t* const pred_histos =
+      &all_pred_histos[subsampling_index * kNumPredModes];
+  // Prediction modes of the left and above neighbor tiles.
+  const int left_mode =
+      (tile_x > 0) ? (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff
+                   : 0xff;
+  const int above_mode =
+      (tile_y > 0) ? (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff
+                   : 0xff;
+  int mode;
+  int64_t best_diff = WEBP_INT64_MAX;
+  uint32_t best_mode = 0;
+  const uint32_t* best_histo =
+      GetHistoArgbConst(all_argb, /*subsampling_index=*/0, best_mode);
+  for (mode = 0; mode < kNumPredModes; ++mode) {
+    const uint32_t* const histo_argb =
+        GetHistoArgbConst(all_argb, subsampling_index, mode);
+    const int64_t cur_diff = PredictionCostSpatialHistogram(
+        accumulated_argb, histo_argb, mode, left_mode, above_mode);
+
+    if (cur_diff < best_diff) {
+      best_histo = histo_argb;
+      best_diff = cur_diff;
+      best_mode = mode;
+    }
+  }
+  // Update the accumulated histogram.
+  VP8LAddVectorEq(best_histo, accumulated_argb, HISTO_SIZE);
+  modes[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (best_mode << 8);
+  ++pred_histos[best_mode];
+}
+
+// Computes the residuals for the different predictors.
 // If max_quantization > 1, assumes that near lossless processing will be
 // If max_quantization > 1, assumes that near lossless processing will be
 // applied, quantizing residuals to multiples of quantization levels up to
 // applied, quantizing residuals to multiples of quantization levels up to
 // max_quantization (the actual quantization level depends on smoothness near
 // max_quantization (the actual quantization level depends on smoothness near
 // the given pixel).
 // the given pixel).
-static int GetBestPredictorForTile(int width, int height,
-                                   int tile_x, int tile_y, int bits,
-                                   int accumulated[4][256],
-                                   uint32_t* const argb_scratch,
-                                   const uint32_t* const argb,
-                                   int max_quantization,
-                                   int exact, int used_subtract_green,
-                                   const uint32_t* const modes) {
-  const int kNumPredModes = 14;
-  const int start_x = tile_x << bits;
-  const int start_y = tile_y << bits;
-  const int tile_size = 1 << bits;
+static void ComputeResidualsForTile(
+    int width, int height, int tile_x, int tile_y, int min_bits,
+    uint32_t update_up_to_index, uint32_t* const all_argb,
+    uint32_t* const argb_scratch, const uint32_t* const argb,
+    int max_quantization, int exact, int used_subtract_green) {
+  const int start_x = tile_x << min_bits;
+  const int start_y = tile_y << min_bits;
+  const int tile_size = 1 << min_bits;
   const int max_y = GetMin(tile_size, height - start_y);
   const int max_y = GetMin(tile_size, height - start_y);
   const int max_x = GetMin(tile_size, width - start_x);
   const int max_x = GetMin(tile_size, width - start_x);
   // Whether there exist columns just outside the tile.
   // Whether there exist columns just outside the tile.
@@ -318,35 +395,20 @@ static int GetBestPredictorForTile(int width, int height,
 #if (WEBP_NEAR_LOSSLESS == 1)
 #if (WEBP_NEAR_LOSSLESS == 1)
   const int context_width = max_x + have_left + (max_x < width - start_x);
   const int context_width = max_x + have_left + (max_x < width - start_x);
 #endif
 #endif
-  const int tiles_per_row = VP8LSubSampleSize(width, bits);
-  // Prediction modes of the left and above neighbor tiles.
-  const int left_mode = (tile_x > 0) ?
-      (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
-  const int above_mode = (tile_y > 0) ?
-      (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
   // The width of upper_row and current_row is one pixel larger than image width
   // The width of upper_row and current_row is one pixel larger than image width
   // to allow the top right pixel to point to the leftmost pixel of the next row
   // to allow the top right pixel to point to the leftmost pixel of the next row
   // when at the right edge.
   // when at the right edge.
   uint32_t* upper_row = argb_scratch;
   uint32_t* upper_row = argb_scratch;
   uint32_t* current_row = upper_row + width + 1;
   uint32_t* current_row = upper_row + width + 1;
   uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
   uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
-  float best_diff = MAX_DIFF_COST;
-  int best_mode = 0;
   int mode;
   int mode;
-  int histo_stack_1[4][256];
-  int histo_stack_2[4][256];
   // Need pointers to be able to swap arrays.
   // Need pointers to be able to swap arrays.
-  int (*histo_argb)[256] = histo_stack_1;
-  int (*best_histo)[256] = histo_stack_2;
-  int i, j;
   uint32_t residuals[1 << MAX_TRANSFORM_BITS];
   uint32_t residuals[1 << MAX_TRANSFORM_BITS];
-  assert(bits <= MAX_TRANSFORM_BITS);
   assert(max_x <= (1 << MAX_TRANSFORM_BITS));
   assert(max_x <= (1 << MAX_TRANSFORM_BITS));
-
   for (mode = 0; mode < kNumPredModes; ++mode) {
   for (mode = 0; mode < kNumPredModes; ++mode) {
-    float cur_diff;
     int relative_y;
     int relative_y;
-    memset(histo_argb, 0, sizeof(histo_stack_1));
+    uint32_t* const histo_argb =
+        GetHistoArgb(all_argb, /*subsampling_index=*/0, mode);
     if (start_y > 0) {
     if (start_y > 0) {
       // Read the row above the tile which will become the first upper_row.
       // Read the row above the tile which will become the first upper_row.
       // Include a pixel to the left if it exists; include a pixel to the right
       // Include a pixel to the left if it exists; include a pixel to the right
@@ -382,41 +444,31 @@ static int GetBestPredictorForTile(int width, int height,
       for (relative_x = 0; relative_x < max_x; ++relative_x) {
       for (relative_x = 0; relative_x < max_x; ++relative_x) {
         UpdateHisto(histo_argb, residuals[relative_x]);
         UpdateHisto(histo_argb, residuals[relative_x]);
       }
       }
-    }
-    cur_diff = PredictionCostSpatialHistogram(
-        (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
-    // Favor keeping the areas locally similar.
-    if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
-    if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
-
-    if (cur_diff < best_diff) {
-      int (*tmp)[256] = histo_argb;
-      histo_argb = best_histo;
-      best_histo = tmp;
-      best_diff = cur_diff;
-      best_mode = mode;
-    }
-  }
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 256; j++) {
-      accumulated[i][j] += best_histo[i][j];
+      if (update_up_to_index > 0) {
+        uint32_t subsampling_index;
+        for (subsampling_index = 1; subsampling_index <= update_up_to_index;
+             ++subsampling_index) {
+          uint32_t* const super_histo =
+              GetHistoArgb(all_argb, subsampling_index, mode);
+          for (relative_x = 0; relative_x < max_x; ++relative_x) {
+            UpdateHisto(super_histo, residuals[relative_x]);
+          }
+        }
+      }
     }
     }
   }
   }
-
-  return best_mode;
 }
 }
 
 
 // Converts pixels of the image to residuals with respect to predictions.
 // Converts pixels of the image to residuals with respect to predictions.
 // If max_quantization > 1, applies near lossless processing, quantizing
 // If max_quantization > 1, applies near lossless processing, quantizing
 // residuals to multiples of quantization levels up to max_quantization
 // residuals to multiples of quantization levels up to max_quantization
 // (the actual quantization level depends on smoothness near the given pixel).
 // (the actual quantization level depends on smoothness near the given pixel).
-static void CopyImageWithPrediction(int width, int height,
-                                    int bits, uint32_t* const modes,
+static void CopyImageWithPrediction(int width, int height, int bits,
+                                    const uint32_t* const modes,
                                     uint32_t* const argb_scratch,
                                     uint32_t* const argb_scratch,
-                                    uint32_t* const argb,
-                                    int low_effort, int max_quantization,
-                                    int exact, int used_subtract_green) {
+                                    uint32_t* const argb, int low_effort,
+                                    int max_quantization, int exact,
+                                    int used_subtract_green) {
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   // The width of upper_row and current_row is one pixel larger than image width
   // The width of upper_row and current_row is one pixel larger than image width
   // to allow the top right pixel to point to the leftmost pixel of the next row
   // to allow the top right pixel to point to the leftmost pixel of the next row
@@ -469,47 +521,307 @@ static void CopyImageWithPrediction(int width, int height,
   }
   }
 }
 }
 
 
+// Checks whether 'image' can be subsampled by finding the biggest power of 2
+// squares (defined by 'best_bits') of uniform value it is made out of.
+void VP8LOptimizeSampling(uint32_t* const image, int full_width,
+                          int full_height, int bits, int max_bits,
+                          int* best_bits_out) {
+  int width = VP8LSubSampleSize(full_width, bits);
+  int height = VP8LSubSampleSize(full_height, bits);
+  int old_width, x, y, square_size;
+  int best_bits = bits;
+  *best_bits_out = bits;
+  // Check rows first.
+  while (best_bits < max_bits) {
+    const int new_square_size = 1 << (best_bits + 1 - bits);
+    int is_good = 1;
+    square_size = 1 << (best_bits - bits);
+    for (y = 0; y + square_size < height; y += new_square_size) {
+      // Check the first lines of consecutive line groups.
+      if (memcmp(&image[y * width], &image[(y + square_size) * width],
+                 width * sizeof(*image)) != 0) {
+        is_good = 0;
+        break;
+      }
+    }
+    if (is_good) {
+      ++best_bits;
+    } else {
+      break;
+    }
+  }
+  if (best_bits == bits) return;
+
+  // Check columns.
+  while (best_bits > bits) {
+    int is_good = 1;
+    square_size = 1 << (best_bits - bits);
+    for (y = 0; is_good && y < height; ++y) {
+      for (x = 0; is_good && x < width; x += square_size) {
+        int i;
+        for (i = x + 1; i < GetMin(x + square_size, width); ++i) {
+          if (image[y * width + i] != image[y * width + x]) {
+            is_good = 0;
+            break;
+          }
+        }
+      }
+    }
+    if (is_good) {
+      break;
+    }
+    --best_bits;
+  }
+  if (best_bits == bits) return;
+
+  // Subsample the image.
+  old_width = width;
+  square_size = 1 << (best_bits - bits);
+  width = VP8LSubSampleSize(full_width, best_bits);
+  height = VP8LSubSampleSize(full_height, best_bits);
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      image[y * width + x] = image[square_size * (y * old_width + x)];
+    }
+  }
+  *best_bits_out = best_bits;
+}
+
+// Computes the best predictor image.
+// Finds the best predictors per tile. Once done, finds the best predictor image
+// sampling.
+// best_bits is set to 0 in case of error.
+// The following requires some glossary:
+// - a tile is a square of side 2^min_bits pixels.
+// - a super-tile of a tile is a square of side 2^bits pixels with bits in
+// [min_bits+1, max_bits].
+// - the max-tile of a tile is the square of 2^max_bits pixels containing it.
+//   If this max-tile crosses the border of an image, it is cropped.
+// - tile, super-tiles and max_tile are aligned on powers of 2 in the original
+//   image.
+// - coordinates for tile, super-tile, max-tile are respectively named
+//   tile_x, super_tile_x, max_tile_x at their bit scale.
+// - in the max-tile, a tile has local coordinates (local_tile_x, local_tile_y).
+// The tiles are processed in the following zigzag order to complete the
+// super-tiles as soon as possible:
+//   1  2|  5  6
+//   3  4|  7  8
+// --------------
+//   9 10| 13 14
+//  11 12| 15 16
+// When computing the residuals for a tile, the histogram of the above
+// super-tile is updated. If this super-tile is finished, its histogram is used
+// to update the histogram of the next super-tile and so on up to the max-tile.
+static void GetBestPredictorsAndSubSampling(
+    int width, int height, const int min_bits, const int max_bits,
+    uint32_t* const argb_scratch, const uint32_t* const argb,
+    int max_quantization, int exact, int used_subtract_green,
+    const WebPPicture* const pic, int percent_range, int* const percent,
+    uint32_t** const all_modes, int* best_bits, uint32_t** best_mode) {
+  const uint32_t tiles_per_row = VP8LSubSampleSize(width, min_bits);
+  const uint32_t tiles_per_col = VP8LSubSampleSize(height, min_bits);
+  int64_t best_cost;
+  uint32_t subsampling_index;
+  const uint32_t max_subsampling_index = max_bits - min_bits;
+  // Compute the needed memory size for residual histograms, accumulated
+  // residual histograms and predictor histograms.
+  const int num_argb = (max_subsampling_index + 1) * kNumPredModes * HISTO_SIZE;
+  const int num_accumulated_rgb = (max_subsampling_index + 1) * HISTO_SIZE;
+  const int num_predictors = (max_subsampling_index + 1) * kNumPredModes;
+  uint32_t* const raw_data = (uint32_t*)WebPSafeCalloc(
+      num_argb + num_accumulated_rgb + num_predictors, sizeof(uint32_t));
+  uint32_t* const all_argb = raw_data;
+  uint32_t* const all_accumulated_argb = all_argb + num_argb;
+  uint32_t* const all_pred_histos = all_accumulated_argb + num_accumulated_rgb;
+  const int max_tile_size = 1 << max_subsampling_index;  // in tile size
+  int percent_start = *percent;
+  // When using the residuals of a tile for its super-tiles, you can either:
+  // - use each residual to update the histogram of the super-tile, with a cost
+  //   of 4 * (1<<n)^2 increment operations (4 for the number of channels, and
+  //   (1<<n)^2 for the number of pixels in the tile)
+  // - use the histogram of the tile to update the histogram of the super-tile,
+  //   with a cost of HISTO_SIZE (1024)
+  // The first method is therefore faster until n==4. 'update_up_to_index'
+  // defines the maximum subsampling_index for which the residuals should be
+  // individually added to the super-tile histogram.
+  const uint32_t update_up_to_index =
+      GetMax(GetMin(4, max_bits), min_bits) - min_bits;
+  // Coordinates in the max-tile in tile units.
+  uint32_t local_tile_x = 0, local_tile_y = 0;
+  uint32_t max_tile_x = 0, max_tile_y = 0;
+  uint32_t tile_x = 0, tile_y = 0;
+
+  *best_bits = 0;
+  *best_mode = NULL;
+  if (raw_data == NULL) return;
+
+  while (tile_y < tiles_per_col) {
+    ComputeResidualsForTile(width, height, tile_x, tile_y, min_bits,
+                            update_up_to_index, all_argb, argb_scratch, argb,
+                            max_quantization, exact, used_subtract_green);
+
+    // Update all the super-tiles that are complete.
+    subsampling_index = 0;
+    while (1) {
+      const uint32_t super_tile_x = tile_x >> subsampling_index;
+      const uint32_t super_tile_y = tile_y >> subsampling_index;
+      const uint32_t super_tiles_per_row =
+          VP8LSubSampleSize(width, min_bits + subsampling_index);
+      GetBestPredictorForTile(all_argb, subsampling_index, super_tile_x,
+                              super_tile_y, super_tiles_per_row,
+                              all_accumulated_argb, all_modes, all_pred_histos);
+      if (subsampling_index == max_subsampling_index) break;
+
+      // Update the following super-tile histogram if it has not been updated
+      // yet.
+      ++subsampling_index;
+      if (subsampling_index > update_up_to_index &&
+          subsampling_index <= max_subsampling_index) {
+        VP8LAddVectorEq(
+            GetHistoArgbConst(all_argb, subsampling_index - 1, /*mode=*/0),
+            GetHistoArgb(all_argb, subsampling_index, /*mode=*/0),
+            HISTO_SIZE * kNumPredModes);
+      }
+      // Check whether the super-tile is not complete (if the smallest tile
+      // is not at the end of a line/column or at the beginning of a super-tile
+      // of size (1 << subsampling_index)).
+      if (!((tile_x == (tiles_per_row - 1) ||
+             (local_tile_x + 1) % (1 << subsampling_index) == 0) &&
+            (tile_y == (tiles_per_col - 1) ||
+             (local_tile_y + 1) % (1 << subsampling_index) == 0))) {
+        --subsampling_index;
+        // subsampling_index now is the index of the last finished super-tile.
+        break;
+      }
+    }
+    // Reset all the histograms belonging to finished tiles.
+    memset(all_argb, 0,
+           HISTO_SIZE * kNumPredModes * (subsampling_index + 1) *
+               sizeof(*all_argb));
+
+    if (subsampling_index == max_subsampling_index) {
+      // If a new max-tile is started.
+      if (tile_x == (tiles_per_row - 1)) {
+        max_tile_x = 0;
+        ++max_tile_y;
+      } else {
+        ++max_tile_x;
+      }
+      local_tile_x = 0;
+      local_tile_y = 0;
+    } else {
+      // Proceed with the Z traversal.
+      uint32_t coord_x = local_tile_x >> subsampling_index;
+      uint32_t coord_y = local_tile_y >> subsampling_index;
+      if (tile_x == (tiles_per_row - 1) && coord_x % 2 == 0) {
+        ++coord_y;
+      } else {
+        if (coord_x % 2 == 0) {
+          ++coord_x;
+        } else {
+          // Z traversal.
+          ++coord_y;
+          --coord_x;
+        }
+      }
+      local_tile_x = coord_x << subsampling_index;
+      local_tile_y = coord_y << subsampling_index;
+    }
+    tile_x = max_tile_x * max_tile_size + local_tile_x;
+    tile_y = max_tile_y * max_tile_size + local_tile_y;
+
+    if (tile_x == 0 &&
+        !WebPReportProgress(
+            pic, percent_start + percent_range * tile_y / tiles_per_col,
+            percent)) {
+      WebPSafeFree(raw_data);
+      return;
+    }
+  }
+
+  // Figure out the best sampling.
+  best_cost = WEBP_INT64_MAX;
+  for (subsampling_index = 0; subsampling_index <= max_subsampling_index;
+       ++subsampling_index) {
+    int plane;
+    const uint32_t* const accumulated =
+        GetAccumulatedHisto(all_accumulated_argb, subsampling_index);
+    int64_t cost = VP8LShannonEntropy(
+        &all_pred_histos[subsampling_index * kNumPredModes], kNumPredModes);
+    for (plane = 0; plane < 4; ++plane) {
+      cost += VP8LShannonEntropy(&accumulated[plane * 256], 256);
+    }
+    if (cost < best_cost) {
+      best_cost = cost;
+      *best_bits = min_bits + subsampling_index;
+      *best_mode = all_modes[subsampling_index];
+    }
+  }
+
+  WebPSafeFree(raw_data);
+
+  VP8LOptimizeSampling(*best_mode, width, height, *best_bits,
+                       MAX_TRANSFORM_BITS, best_bits);
+}
+
 // Finds the best predictor for each tile, and converts the image to residuals
 // Finds the best predictor for each tile, and converts the image to residuals
 // with respect to predictions. If near_lossless_quality < 100, applies
 // with respect to predictions. If near_lossless_quality < 100, applies
 // near lossless processing, shaving off more bits of residuals for lower
 // near lossless processing, shaving off more bits of residuals for lower
 // qualities.
 // qualities.
-int VP8LResidualImage(int width, int height, int bits, int low_effort,
-                      uint32_t* const argb, uint32_t* const argb_scratch,
-                      uint32_t* const image, int near_lossless_quality,
-                      int exact, int used_subtract_green,
-                      const WebPPicture* const pic, int percent_range,
-                      int* const percent) {
-  const int tiles_per_row = VP8LSubSampleSize(width, bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
+                      int low_effort, uint32_t* const argb,
+                      uint32_t* const argb_scratch, uint32_t* const image,
+                      int near_lossless_quality, int exact,
+                      int used_subtract_green, const WebPPicture* const pic,
+                      int percent_range, int* const percent,
+                      int* const best_bits) {
   int percent_start = *percent;
   int percent_start = *percent;
-  int tile_y;
-  int histo[4][256];
   const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
   const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
   if (low_effort) {
   if (low_effort) {
+    const int tiles_per_row = VP8LSubSampleSize(width, max_bits);
+    const int tiles_per_col = VP8LSubSampleSize(height, max_bits);
     int i;
     int i;
     for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
     for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
       image[i] = ARGB_BLACK | (kPredLowEffort << 8);
       image[i] = ARGB_BLACK | (kPredLowEffort << 8);
     }
     }
+    *best_bits = max_bits;
   } else {
   } else {
-    memset(histo, 0, sizeof(histo));
-    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-      int tile_x;
-      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-        const int pred = GetBestPredictorForTile(
-            width, height, tile_x, tile_y, bits, histo, argb_scratch, argb,
-            max_quantization, exact, used_subtract_green, image);
-        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
-      }
-
-      if (!WebPReportProgress(
-              pic, percent_start + percent_range * tile_y / tiles_per_col,
-              percent)) {
-        return 0;
-      }
+    // Allocate data to try all samplings from min_bits to max_bits.
+    int bits;
+    uint32_t sum_num_pixels = 0u;
+    uint32_t *modes_raw, *best_mode;
+    uint32_t* modes[MAX_TRANSFORM_BITS + 1];
+    uint32_t num_pixels[MAX_TRANSFORM_BITS + 1];
+    for (bits = min_bits; bits <= max_bits; ++bits) {
+      const int tiles_per_row = VP8LSubSampleSize(width, bits);
+      const int tiles_per_col = VP8LSubSampleSize(height, bits);
+      num_pixels[bits] = tiles_per_row * tiles_per_col;
+      sum_num_pixels += num_pixels[bits];
     }
     }
+    modes_raw = (uint32_t*)WebPSafeMalloc(sum_num_pixels, sizeof(*modes_raw));
+    if (modes_raw == NULL) return 0;
+    // Have modes point to the right global memory modes_raw.
+    modes[min_bits] = modes_raw;
+    for (bits = min_bits + 1; bits <= max_bits; ++bits) {
+      modes[bits] = modes[bits - 1] + num_pixels[bits - 1];
+    }
+    // Find the best sampling.
+    GetBestPredictorsAndSubSampling(
+        width, height, min_bits, max_bits, argb_scratch, argb, max_quantization,
+        exact, used_subtract_green, pic, percent_range, percent,
+        &modes[min_bits], best_bits, &best_mode);
+    if (*best_bits == 0) {
+      WebPSafeFree(modes_raw);
+      return 0;
+    }
+    // Keep the best predictor image.
+    memcpy(image, best_mode,
+           VP8LSubSampleSize(width, *best_bits) *
+               VP8LSubSampleSize(height, *best_bits) * sizeof(*image));
+    WebPSafeFree(modes_raw);
   }
   }
 
 
-  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+  CopyImageWithPrediction(width, height, *best_bits, image, argb_scratch, argb,
                           low_effort, max_quantization, exact,
                           low_effort, max_quantization, exact,
                           used_subtract_green);
                           used_subtract_green);
   return WebPReportProgress(pic, percent_start + percent_range, percent);
   return WebPReportProgress(pic, percent_start + percent_range, percent);
@@ -539,48 +851,51 @@ static WEBP_INLINE uint32_t MultipliersToColorCode(
          m->green_to_red_;
          m->green_to_red_;
 }
 }
 
 
-static float PredictionCostCrossColor(const int accumulated[256],
-                                      const int counts[256]) {
+static int64_t PredictionCostCrossColor(const uint32_t accumulated[256],
+                                        const uint32_t counts[256]) {
   // Favor low entropy, locally and globally.
   // Favor low entropy, locally and globally.
   // Favor small absolute values for PredictionCostSpatial
   // Favor small absolute values for PredictionCostSpatial
-  static const float kExpValue = 2.4f;
-  return VP8LCombinedShannonEntropy(counts, accumulated) +
-         PredictionCostSpatial(counts, 3, kExpValue);
+  static const uint64_t kExpValue = 240;
+  return (int64_t)VP8LCombinedShannonEntropy(counts, accumulated) +
+         PredictionCostBias(counts, 3, kExpValue);
 }
 }
 
 
-static float GetPredictionCostCrossColorRed(
+static int64_t GetPredictionCostCrossColorRed(
     const uint32_t* argb, int stride, int tile_width, int tile_height,
     const uint32_t* argb, int stride, int tile_width, int tile_height,
     VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
     VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
-    const int accumulated_red_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
+    const uint32_t accumulated_red_histo[256]) {
+  uint32_t histo[256] = { 0 };
+  int64_t cur_diff;
 
 
   VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
   VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
                                 green_to_red, histo);
                                 green_to_red, histo);
 
 
   cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
   cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
   if ((uint8_t)green_to_red == prev_x.green_to_red_) {
   if ((uint8_t)green_to_red == prev_x.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if ((uint8_t)green_to_red == prev_y.green_to_red_) {
   if ((uint8_t)green_to_red == prev_y.green_to_red_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if (green_to_red == 0) {
   if (green_to_red == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   return cur_diff;
   return cur_diff;
 }
 }
 
 
-static void GetBestGreenToRed(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+static void GetBestGreenToRed(const uint32_t* argb, int stride, int tile_width,
+                              int tile_height, VP8LMultipliers prev_x,
+                              VP8LMultipliers prev_y, int quality,
+                              const uint32_t accumulated_red_histo[256],
+                              VP8LMultipliers* const best_tx) {
   const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
   const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
   int green_to_red_best = 0;
   int green_to_red_best = 0;
   int iter, offset;
   int iter, offset;
-  float best_diff = GetPredictionCostCrossColorRed(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_red_best, accumulated_red_histo);
+  int64_t best_diff = GetPredictionCostCrossColorRed(
+      argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_red_best,
+      accumulated_red_histo);
   for (iter = 0; iter < kMaxIters; ++iter) {
   for (iter = 0; iter < kMaxIters; ++iter) {
     // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
     // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
     // one in color computation. Having initial delta here as 1 is sufficient
     // one in color computation. Having initial delta here as 1 is sufficient
@@ -589,7 +904,7 @@ static void GetBestGreenToRed(
     // Try a negative and a positive delta from the best known value.
     // Try a negative and a positive delta from the best known value.
     for (offset = -delta; offset <= delta; offset += 2 * delta) {
     for (offset = -delta; offset <= delta; offset += 2 * delta) {
       const int green_to_red_cur = offset + green_to_red_best;
       const int green_to_red_cur = offset + green_to_red_best;
-      const float cur_diff = GetPredictionCostCrossColorRed(
+      const int64_t cur_diff = GetPredictionCostCrossColorRed(
           argb, stride, tile_width, tile_height, prev_x, prev_y,
           argb, stride, tile_width, tile_height, prev_x, prev_y,
           green_to_red_cur, accumulated_red_histo);
           green_to_red_cur, accumulated_red_histo);
       if (cur_diff < best_diff) {
       if (cur_diff < best_diff) {
@@ -601,45 +916,50 @@ static void GetBestGreenToRed(
   best_tx->green_to_red_ = (green_to_red_best & 0xff);
   best_tx->green_to_red_ = (green_to_red_best & 0xff);
 }
 }
 
 
-static float GetPredictionCostCrossColorBlue(
+static int64_t GetPredictionCostCrossColorBlue(
     const uint32_t* argb, int stride, int tile_width, int tile_height,
     const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y,
-    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
-  int histo[256] = { 0 };
-  float cur_diff;
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_blue,
+    int red_to_blue, const uint32_t accumulated_blue_histo[256]) {
+  uint32_t histo[256] = { 0 };
+  int64_t cur_diff;
 
 
   VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
   VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
                                  green_to_blue, red_to_blue, histo);
                                  green_to_blue, red_to_blue, histo);
 
 
   cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
   cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
   if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
   if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
   if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
   if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
   if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
-    cur_diff -= 3;  // favor keeping the areas locally similar
+    // favor keeping the areas locally similar
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if (green_to_blue == 0) {
   if (green_to_blue == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   if (red_to_blue == 0) {
   if (red_to_blue == 0) {
-    cur_diff -= 3;
+    cur_diff -= 3ll << LOG_2_PRECISION_BITS;
   }
   }
   return cur_diff;
   return cur_diff;
 }
 }
 
 
 #define kGreenRedToBlueNumAxis 8
 #define kGreenRedToBlueNumAxis 8
 #define kGreenRedToBlueMaxIters 7
 #define kGreenRedToBlueMaxIters 7
-static void GetBestGreenRedToBlue(
-    const uint32_t* argb, int stride, int tile_width, int tile_height,
-    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
-    const int accumulated_blue_histo[256],
-    VP8LMultipliers* const best_tx) {
+static void GetBestGreenRedToBlue(const uint32_t* argb, int stride,
+                                  int tile_width, int tile_height,
+                                  VP8LMultipliers prev_x,
+                                  VP8LMultipliers prev_y, int quality,
+                                  const uint32_t accumulated_blue_histo[256],
+                                  VP8LMultipliers* const best_tx) {
   const int8_t offset[kGreenRedToBlueNumAxis][2] =
   const int8_t offset[kGreenRedToBlueNumAxis][2] =
       {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
       {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
   const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
   const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
@@ -649,9 +969,9 @@ static void GetBestGreenRedToBlue(
   int red_to_blue_best = 0;
   int red_to_blue_best = 0;
   int iter;
   int iter;
   // Initial value at origin:
   // Initial value at origin:
-  float best_diff = GetPredictionCostCrossColorBlue(
-      argb, stride, tile_width, tile_height, prev_x, prev_y,
-      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+  int64_t best_diff = GetPredictionCostCrossColorBlue(
+      argb, stride, tile_width, tile_height, prev_x, prev_y, green_to_blue_best,
+      red_to_blue_best, accumulated_blue_histo);
   for (iter = 0; iter < iters; ++iter) {
   for (iter = 0; iter < iters; ++iter) {
     const int delta = delta_lut[iter];
     const int delta = delta_lut[iter];
     int axis;
     int axis;
@@ -659,7 +979,7 @@ static void GetBestGreenRedToBlue(
       const int green_to_blue_cur =
       const int green_to_blue_cur =
           offset[axis][0] * delta + green_to_blue_best;
           offset[axis][0] * delta + green_to_blue_best;
       const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
       const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
-      const float cur_diff = GetPredictionCostCrossColorBlue(
+      const int64_t cur_diff = GetPredictionCostCrossColorBlue(
           argb, stride, tile_width, tile_height, prev_x, prev_y,
           argb, stride, tile_width, tile_height, prev_x, prev_y,
           green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
           green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
       if (cur_diff < best_diff) {
       if (cur_diff < best_diff) {
@@ -684,13 +1004,10 @@ static void GetBestGreenRedToBlue(
 #undef kGreenRedToBlueNumAxis
 #undef kGreenRedToBlueNumAxis
 
 
 static VP8LMultipliers GetBestColorTransformForTile(
 static VP8LMultipliers GetBestColorTransformForTile(
-    int tile_x, int tile_y, int bits,
-    VP8LMultipliers prev_x,
-    VP8LMultipliers prev_y,
-    int quality, int xsize, int ysize,
-    const int accumulated_red_histo[256],
-    const int accumulated_blue_histo[256],
-    const uint32_t* const argb) {
+    int tile_x, int tile_y, int bits, VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y, int quality, int xsize, int ysize,
+    const uint32_t accumulated_red_histo[256],
+    const uint32_t accumulated_blue_histo[256], const uint32_t* const argb) {
   const int max_tile_size = 1 << bits;
   const int max_tile_size = 1 << bits;
   const int tile_y_offset = tile_y * max_tile_size;
   const int tile_y_offset = tile_y * max_tile_size;
   const int tile_x_offset = tile_x * max_tile_size;
   const int tile_x_offset = tile_x * max_tile_size;
@@ -728,13 +1045,13 @@ static void CopyTileWithColorTransform(int xsize, int ysize,
 int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
 int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image,
                             uint32_t* const argb, uint32_t* image,
                             const WebPPicture* const pic, int percent_range,
                             const WebPPicture* const pic, int percent_range,
-                            int* const percent) {
+                            int* const percent, int* const best_bits) {
   const int max_tile_size = 1 << bits;
   const int max_tile_size = 1 << bits;
   const int tile_xsize = VP8LSubSampleSize(width, bits);
   const int tile_xsize = VP8LSubSampleSize(width, bits);
   const int tile_ysize = VP8LSubSampleSize(height, bits);
   const int tile_ysize = VP8LSubSampleSize(height, bits);
   int percent_start = *percent;
   int percent_start = *percent;
-  int accumulated_red_histo[256] = { 0 };
-  int accumulated_blue_histo[256] = { 0 };
+  uint32_t accumulated_red_histo[256] = { 0 };
+  uint32_t accumulated_blue_histo[256] = { 0 };
   int tile_x, tile_y;
   int tile_x, tile_y;
   VP8LMultipliers prev_x, prev_y;
   VP8LMultipliers prev_x, prev_y;
   MultipliersClear(&prev_y);
   MultipliersClear(&prev_y);
@@ -788,5 +1105,7 @@ int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
       return 0;
       return 0;
     }
     }
   }
   }
+  VP8LOptimizeSampling(image, width, height, bits, MAX_TRANSFORM_BITS,
+                       best_bits);
   return 1;
   return 1;
 }
 }

+ 7 - 5
thirdparty/libwebp/src/enc/quant_enc.c

@@ -462,7 +462,7 @@ const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 
 
 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
-const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
+static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
 };
 };
 
 
@@ -478,7 +478,9 @@ void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
   VP8EncPredChroma8(it->yuv_p_, left, top);
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 }
 
 
-void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
+// Form all the ten Intra4x4 predictions in the yuv_p_ cache
+// for the 4x4 block it->i4_
+static void MakeIntra4Preds(const VP8EncIterator* const it) {
   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
 }
 }
 
 
@@ -1102,7 +1104,7 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
 
 
     InitScore(&rd_i4);
     InitScore(&rd_i4);
-    VP8MakeIntra4Preds(it);
+    MakeIntra4Preds(it);
     for (mode = 0; mode < NUM_BMODES; ++mode) {
     for (mode = 0; mode < NUM_BMODES; ++mode) {
       VP8ModeScore rd_tmp;
       VP8ModeScore rd_tmp;
       int16_t tmp_levels[16];
       int16_t tmp_levels[16];
@@ -1237,7 +1239,7 @@ static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
-      VP8MakeIntra4Preds(it);
+      MakeIntra4Preds(it);
       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                               src, dst, mode) << it->i4_;
                               src, dst, mode) << it->i4_;
     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
@@ -1305,7 +1307,7 @@ static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
 
 
-      VP8MakeIntra4Preds(it);
+      MakeIntra4Preds(it);
       for (mode = 0; mode < NUM_BMODES; ++mode) {
       for (mode = 0; mode < NUM_BMODES; ++mode) {
         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT

+ 6 - 7
thirdparty/libwebp/src/enc/vp8i_enc.h

@@ -16,6 +16,7 @@
 
 
 #include <string.h>     // for memcpy()
 #include <string.h>     // for memcpy()
 #include "src/dec/common_dec.h"
 #include "src/dec/common_dec.h"
+#include "src/dsp/cpu.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/bit_writer_utils.h"
 #include "src/utils/bit_writer_utils.h"
 #include "src/utils/thread_utils.h"
 #include "src/utils/thread_utils.h"
@@ -31,7 +32,7 @@ extern "C" {
 
 
 // version numbers
 // version numbers
 #define ENC_MAJ_VERSION 1
 #define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 4
+#define ENC_MIN_VERSION 5
 #define ENC_REV_VERSION 0
 #define ENC_REV_VERSION 0
 
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
@@ -78,7 +79,6 @@ typedef enum {   // Rate-distortion optimization levels
 extern const uint16_t VP8Scan[16];
 extern const uint16_t VP8Scan[16];
 extern const uint16_t VP8UVModeOffsets[4];
 extern const uint16_t VP8UVModeOffsets[4];
 extern const uint16_t VP8I16ModeOffsets[4];
 extern const uint16_t VP8I16ModeOffsets[4];
-extern const uint16_t VP8I4ModeOffsets[NUM_BMODES];
 
 
 // Layout of prediction blocks
 // Layout of prediction blocks
 // intra 16x16
 // intra 16x16
@@ -234,7 +234,11 @@ typedef struct {
   VP8BitWriter* bw_;               // current bit-writer
   VP8BitWriter* bw_;               // current bit-writer
   uint8_t*      preds_;            // intra mode predictors (4x4 blocks)
   uint8_t*      preds_;            // intra mode predictors (4x4 blocks)
   uint32_t*     nz_;               // non-zero pattern
   uint32_t*     nz_;               // non-zero pattern
+#if WEBP_AARCH64 && BPS == 32
+  uint8_t       i4_boundary_[40];  // 32+8 boundary samples needed by intra4x4
+#else
   uint8_t       i4_boundary_[37];  // 32+5 boundary samples needed by intra4x4
   uint8_t       i4_boundary_[37];  // 32+5 boundary samples needed by intra4x4
+#endif
   uint8_t*      i4_top_;           // pointer to the current top boundary sample
   uint8_t*      i4_top_;           // pointer to the current top boundary sample
   int           i4_;               // current intra4x4 mode being tested
   int           i4_;               // current intra4x4 mode being tested
   int           top_nz_[9];        // top-non-zero context.
   int           top_nz_[9];        // top-non-zero context.
@@ -267,8 +271,6 @@ typedef struct {
   // in iterator.c
   // in iterator.c
 // must be called first
 // must be called first
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan
-void VP8IteratorReset(VP8EncIterator* const it);
 // reset iterator position to row 'y'
 // reset iterator position to row 'y'
 void VP8IteratorSetRow(VP8EncIterator* const it, int y);
 void VP8IteratorSetRow(VP8EncIterator* const it, int y);
 // set count down (=number of iterations to go)
 // set count down (=number of iterations to go)
@@ -444,9 +446,6 @@ extern const uint8_t VP8Cat6[];
 void VP8MakeLuma16Preds(const VP8EncIterator* const it);
 void VP8MakeLuma16Preds(const VP8EncIterator* const it);
 // Form all the four Chroma8x8 predictions in the yuv_p_ cache
 // Form all the four Chroma8x8 predictions in the yuv_p_ cache
 void VP8MakeChroma8Preds(const VP8EncIterator* const it);
 void VP8MakeChroma8Preds(const VP8EncIterator* const it);
-// Form all the ten Intra4x4 predictions in the yuv_p_ cache
-// for the 4x4 block it->i4_
-void VP8MakeIntra4Preds(const VP8EncIterator* const it);
 // Rate calculation
 // Rate calculation
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);
 int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]);

+ 162 - 134
thirdparty/libwebp/src/enc/vp8l_enc.c

@@ -30,6 +30,10 @@
 
 
 // Maximum number of histogram images (sub-blocks).
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 #define MAX_HUFF_IMAGE_SIZE       2600
+#define MAX_HUFFMAN_BITS (MIN_HUFFMAN_BITS + (1 << NUM_HUFFMAN_BITS) - 1)
+// Empirical value for which it becomes too computationally expensive to
+// compute the best predictor image.
+#define MAX_PREDICTOR_IMAGE_SIZE (1 << 14)
 
 
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
 // Palette
 // Palette
@@ -140,8 +144,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
       curr_row += argb_stride;
       curr_row += argb_stride;
     }
     }
     {
     {
-      float entropy_comp[kHistoTotal];
-      float entropy[kNumEntropyIx];
+      uint64_t entropy_comp[kHistoTotal];
+      uint64_t entropy[kNumEntropyIx];
       int k;
       int k;
       int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int j;
       int j;
@@ -179,19 +183,19 @@ static int AnalyzeEntropy(const uint32_t* argb,
       // When including transforms, there is an overhead in bits from
       // When including transforms, there is an overhead in bits from
       // storing them. This overhead is small but matters for small images.
       // storing them. This overhead is small but matters for small images.
       // For spatial, there are 14 transformations.
       // For spatial, there are 14 transformations.
-      entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+      entropy[kSpatial] += (uint64_t)VP8LSubSampleSize(width, transform_bits) *
                            VP8LSubSampleSize(height, transform_bits) *
                            VP8LSubSampleSize(height, transform_bits) *
                            VP8LFastLog2(14);
                            VP8LFastLog2(14);
       // For color transforms: 24 as only 3 channels are considered in a
       // For color transforms: 24 as only 3 channels are considered in a
       // ColorTransformElement.
       // ColorTransformElement.
-      entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
-                                   VP8LSubSampleSize(height, transform_bits) *
-                                   VP8LFastLog2(24);
+      entropy[kSpatialSubGreen] +=
+          (uint64_t)VP8LSubSampleSize(width, transform_bits) *
+          VP8LSubSampleSize(height, transform_bits) * VP8LFastLog2(24);
       // For palettes, add the cost of storing the palette.
       // For palettes, add the cost of storing the palette.
       // We empirically estimate the cost of a compressed entry as 8 bits.
       // We empirically estimate the cost of a compressed entry as 8 bits.
       // The palette is differential-coded when compressed hence a much
       // The palette is differential-coded when compressed hence a much
       // lower cost than sizeof(uint32_t)*8.
       // lower cost than sizeof(uint32_t)*8.
-      entropy[kPalette] += palette_size * 8;
+      entropy[kPalette] += (palette_size * 8ull) << LOG_2_PRECISION_BITS;
 
 
       *min_entropy_ix = kDirect;
       *min_entropy_ix = kDirect;
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
@@ -231,17 +235,33 @@ static int AnalyzeEntropy(const uint32_t* argb,
   }
   }
 }
 }
 
 
+// Clamp histogram and transform bits.
+static int ClampBits(int width, int height, int bits, int min_bits,
+                     int max_bits, int image_size_max) {
+  int image_size;
+  bits = (bits < min_bits) ? min_bits : (bits > max_bits) ? max_bits : bits;
+  image_size = VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
+  while (bits < max_bits && image_size > image_size_max) {
+    ++bits;
+    image_size =
+        VP8LSubSampleSize(width, bits) * VP8LSubSampleSize(height, bits);
+  }
+  // In case the bits reduce the image too much, choose the smallest value
+  // setting the histogram image size to 1.
+  while (bits > min_bits && image_size == 1) {
+    image_size = VP8LSubSampleSize(width, bits - 1) *
+                 VP8LSubSampleSize(height, bits - 1);
+    if (image_size != 1) break;
+    --bits;
+  }
+  return bits;
+}
+
 static int GetHistoBits(int method, int use_palette, int width, int height) {
 static int GetHistoBits(int method, int use_palette, int width, int height) {
   // Make tile size a function of encoding method (Range: 0 to 6).
   // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
-  while (1) {
-    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                VP8LSubSampleSize(height, histo_bits);
-    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
-    ++histo_bits;
-  }
-  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
-         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+  const int histo_bits = (use_palette ? 9 : 7) - method;
+  return ClampBits(width, height, histo_bits, MIN_HUFFMAN_BITS,
+                   MAX_HUFFMAN_BITS, MAX_HUFF_IMAGE_SIZE);
 }
 }
 
 
 static int GetTransformBits(int method, int histo_bits) {
 static int GetTransformBits(int method, int histo_bits) {
@@ -280,7 +300,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   const int method = config->method;
   const int method = config->method;
   const int low_effort = (config->method == 0);
   const int low_effort = (config->method == 0);
   int i;
   int i;
-  int use_palette;
+  int use_palette, transform_bits;
   int n_lz77s;
   int n_lz77s;
   // If set to 0, analyze the cache with the computed cache value. If 1, also
   // If set to 0, analyze the cache with the computed cache value. If 1, also
   // analyze with no-cache.
   // analyze with no-cache.
@@ -297,7 +317,9 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
   // Empirical bit sizes.
   // Empirical bit sizes.
   enc->histo_bits_ = GetHistoBits(method, use_palette,
   enc->histo_bits_ = GetHistoBits(method, use_palette,
                                   pic->width, pic->height);
                                   pic->width, pic->height);
-  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+  transform_bits = GetTransformBits(method, enc->histo_bits_);
+  enc->predictor_transform_bits_ = transform_bits;
+  enc->cross_color_transform_bits_ = transform_bits;
 
 
   if (low_effort) {
   if (low_effort) {
     // AnalyzeEntropy is somewhat slow.
     // AnalyzeEntropy is somewhat slow.
@@ -311,8 +333,8 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
     // Try out multiple LZ77 on images with few colors.
     // Try out multiple LZ77 on images with few colors.
     n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
     n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
     if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
     if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
-                        enc->palette_size_, enc->transform_bits_,
-                        &min_entropy_ix, red_and_blue_always_zero)) {
+                        enc->palette_size_, transform_bits, &min_entropy_ix,
+                        red_and_blue_always_zero)) {
       return 0;
       return 0;
     }
     }
     if (method == 6 && config->quality == 100) {
     if (method == 6 && config->quality == 100) {
@@ -661,11 +683,12 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
   VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
   VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
 }
 }
 
 
-static int StoreImageToBitMask(
-    VP8LBitWriter* const bw, int width, int histo_bits,
-    const VP8LBackwardRefs* const refs,
-    const uint16_t* histogram_symbols,
-    const HuffmanTreeCode* const huffman_codes, const WebPPicture* const pic) {
+static int StoreImageToBitMask(VP8LBitWriter* const bw, int width,
+                               int histo_bits,
+                               const VP8LBackwardRefs* const refs,
+                               const uint32_t* histogram_symbols,
+                               const HuffmanTreeCode* const huffman_codes,
+                               const WebPPicture* const pic) {
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
   const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   // x and y trace the position in the image.
   // x and y trace the position in the image.
@@ -673,7 +696,7 @@ static int StoreImageToBitMask(
   int y = 0;
   int y = 0;
   int tile_x = x & tile_mask;
   int tile_x = x & tile_mask;
   int tile_y = y & tile_mask;
   int tile_y = y & tile_mask;
-  int histogram_ix = histogram_symbols[0];
+  int histogram_ix = (histogram_symbols[0] >> 8) & 0xffff;
   const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
   const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   while (VP8LRefsCursorOk(&c)) {
   while (VP8LRefsCursorOk(&c)) {
@@ -681,8 +704,10 @@ static int StoreImageToBitMask(
     if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
     if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
       tile_x = x & tile_mask;
       tile_x = x & tile_mask;
       tile_y = y & tile_mask;
       tile_y = y & tile_mask;
-      histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
-                                       (x >> histo_bits)];
+      histogram_ix = (histogram_symbols[(y >> histo_bits) * histo_xsize +
+                                        (x >> histo_bits)] >>
+                      8) &
+                     0xffff;
       codes = huffman_codes + 5 * histogram_ix;
       codes = huffman_codes + 5 * histogram_ix;
     }
     }
     if (PixOrCopyIsLiteral(v)) {
     if (PixOrCopyIsLiteral(v)) {
@@ -738,7 +763,7 @@ static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
   VP8LBackwardRefs* refs;
   VP8LBackwardRefs* refs;
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
   HuffmanTreeCode huffman_codes[5] = {{0, NULL, NULL}};
-  const uint16_t histogram_symbols[1] = {0};  // only one tree, one symbol
+  const uint32_t histogram_symbols[1] = {0};  // only one tree, one symbol
   int cache_bits = 0;
   int cache_bits = 0;
   VP8LHistogramSet* histogram_image = NULL;
   VP8LHistogramSet* histogram_image = NULL;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
@@ -821,32 +846,32 @@ static int EncodeImageInternal(
     VP8LBitWriter* const bw, const uint32_t* const argb,
     VP8LBitWriter* const bw, const uint32_t* const argb,
     VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
     VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[4], int width,
     int height, int quality, int low_effort, const CrunchConfig* const config,
     int height, int quality, int low_effort, const CrunchConfig* const config,
-    int* cache_bits, int histogram_bits, size_t init_byte_position,
+    int* cache_bits, int histogram_bits_in, size_t init_byte_position,
     int* const hdr_size, int* const data_size, const WebPPicture* const pic,
     int* const hdr_size, int* const data_size, const WebPPicture* const pic,
     int percent_range, int* const percent) {
     int percent_range, int* const percent) {
   const uint32_t histogram_image_xysize =
   const uint32_t histogram_image_xysize =
-      VP8LSubSampleSize(width, histogram_bits) *
-      VP8LSubSampleSize(height, histogram_bits);
+      VP8LSubSampleSize(width, histogram_bits_in) *
+      VP8LSubSampleSize(height, histogram_bits_in);
   int remaining_percent = percent_range;
   int remaining_percent = percent_range;
   int percent_start = *percent;
   int percent_start = *percent;
   VP8LHistogramSet* histogram_image = NULL;
   VP8LHistogramSet* histogram_image = NULL;
   VP8LHistogram* tmp_histo = NULL;
   VP8LHistogram* tmp_histo = NULL;
-  int histogram_image_size = 0;
+  uint32_t i, histogram_image_size = 0;
   size_t bit_array_size = 0;
   size_t bit_array_size = 0;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
       3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
       3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
-  uint16_t* const histogram_symbols = (uint16_t*)WebPSafeMalloc(
-      histogram_image_xysize, sizeof(*histogram_symbols));
+  uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
+      histogram_image_xysize, sizeof(*histogram_argb));
   int sub_configs_idx;
   int sub_configs_idx;
   int cache_bits_init, write_histogram_image;
   int cache_bits_init, write_histogram_image;
   VP8LBitWriter bw_init = *bw, bw_best;
   VP8LBitWriter bw_init = *bw, bw_best;
   int hdr_size_tmp;
   int hdr_size_tmp;
   VP8LHashChain hash_chain_histogram;  // histogram image hash chain
   VP8LHashChain hash_chain_histogram;  // histogram image hash chain
   size_t bw_size_best = ~(size_t)0;
   size_t bw_size_best = ~(size_t)0;
-  assert(histogram_bits >= MIN_HUFFMAN_BITS);
-  assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  assert(histogram_bits_in >= MIN_HUFFMAN_BITS);
+  assert(histogram_bits_in <= MAX_HUFFMAN_BITS);
   assert(hdr_size != NULL);
   assert(hdr_size != NULL);
   assert(data_size != NULL);
   assert(data_size != NULL);
 
 
@@ -857,7 +882,7 @@ static int EncodeImageInternal(
   }
   }
 
 
   // Make sure we can allocate the different objects.
   // Make sure we can allocate the different objects.
-  if (huff_tree == NULL || histogram_symbols == NULL ||
+  if (huff_tree == NULL || histogram_argb == NULL ||
       !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
       !VP8LHashChainInit(&hash_chain_histogram, histogram_image_xysize)) {
     WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
     goto Error;
     goto Error;
@@ -899,6 +924,7 @@ static int EncodeImageInternal(
 
 
     for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
     for (i_cache = 0; i_cache < (sub_config->do_no_cache_ ? 2 : 1); ++i_cache) {
       const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
       const int cache_bits_tmp = (i_cache == 0) ? cache_bits_best : 0;
+      int histogram_bits = histogram_bits_in;
       // Speed-up: no need to study the no-cache case if it was already studied
       // Speed-up: no need to study the no-cache case if it was already studied
       // in i_cache == 0.
       // in i_cache == 0.
       if (i_cache == 1 && cache_bits_best == 0) break;
       if (i_cache == 1 && cache_bits_best == 0) break;
@@ -920,7 +946,7 @@ static int EncodeImageInternal(
       if (!VP8LGetHistoImageSymbols(
       if (!VP8LGetHistoImageSymbols(
               width, height, &refs_array[i_cache], quality, low_effort,
               width, height, &refs_array[i_cache], quality, low_effort,
               histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
               histogram_bits, cache_bits_tmp, histogram_image, tmp_histo,
-              histogram_symbols, pic, i_percent_range, percent)) {
+              histogram_argb, pic, i_percent_range, percent)) {
         goto Error;
         goto Error;
       }
       }
       // Create Huffman bit lengths and codes for each histogram image.
       // Create Huffman bit lengths and codes for each histogram image.
@@ -953,26 +979,19 @@ static int EncodeImageInternal(
       }
       }
 
 
       // Huffman image + meta huffman.
       // Huffman image + meta huffman.
+      histogram_image_size = 0;
+      for (i = 0; i < histogram_image_xysize; ++i) {
+        if (histogram_argb[i] >= histogram_image_size) {
+          histogram_image_size = histogram_argb[i] + 1;
+        }
+        histogram_argb[i] <<= 8;
+      }
+
       write_histogram_image = (histogram_image_size > 1);
       write_histogram_image = (histogram_image_size > 1);
       VP8LPutBits(bw, write_histogram_image, 1);
       VP8LPutBits(bw, write_histogram_image, 1);
       if (write_histogram_image) {
       if (write_histogram_image) {
-        uint32_t* const histogram_argb = (uint32_t*)WebPSafeMalloc(
-            histogram_image_xysize, sizeof(*histogram_argb));
-        int max_index = 0;
-        uint32_t i;
-        if (histogram_argb == NULL) {
-          WebPEncodingSetError(pic, VP8_ENC_ERROR_OUT_OF_MEMORY);
-          goto Error;
-        }
-        for (i = 0; i < histogram_image_xysize; ++i) {
-          const int symbol_index = histogram_symbols[i] & 0xffff;
-          histogram_argb[i] = (symbol_index << 8);
-          if (symbol_index >= max_index) {
-            max_index = symbol_index + 1;
-          }
-        }
-        histogram_image_size = max_index;
-
+        VP8LOptimizeSampling(histogram_argb, width, height, histogram_bits_in,
+                             MAX_HUFFMAN_BITS, &histogram_bits);
         VP8LPutBits(bw, histogram_bits - 2, 3);
         VP8LPutBits(bw, histogram_bits - 2, 3);
         i_percent_range = i_remaining_percent / 2;
         i_percent_range = i_remaining_percent / 2;
         i_remaining_percent -= i_percent_range;
         i_remaining_percent -= i_percent_range;
@@ -981,15 +1000,12 @@ static int EncodeImageInternal(
                 VP8LSubSampleSize(width, histogram_bits),
                 VP8LSubSampleSize(width, histogram_bits),
                 VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
                 VP8LSubSampleSize(height, histogram_bits), quality, low_effort,
                 pic, i_percent_range, percent)) {
                 pic, i_percent_range, percent)) {
-          WebPSafeFree(histogram_argb);
           goto Error;
           goto Error;
         }
         }
-        WebPSafeFree(histogram_argb);
       }
       }
 
 
       // Store Huffman codes.
       // Store Huffman codes.
       {
       {
-        int i;
         int max_tokens = 0;
         int max_tokens = 0;
         // Find maximum number of symbols for the huffman tree-set.
         // Find maximum number of symbols for the huffman tree-set.
         for (i = 0; i < 5 * histogram_image_size; ++i) {
         for (i = 0; i < 5 * histogram_image_size; ++i) {
@@ -1012,7 +1028,7 @@ static int EncodeImageInternal(
       // Store actual literals.
       // Store actual literals.
       hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
       hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
       if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
       if (!StoreImageToBitMask(bw, width, histogram_bits, &refs_array[i_cache],
-                               histogram_symbols, huffman_codes, pic)) {
+                               histogram_argb, huffman_codes, pic)) {
         goto Error;
         goto Error;
       }
       }
       // Keep track of the smallest image so far.
       // Keep track of the smallest image so far.
@@ -1049,7 +1065,7 @@ static int EncodeImageInternal(
     WebPSafeFree(huffman_codes->codes);
     WebPSafeFree(huffman_codes->codes);
     WebPSafeFree(huffman_codes);
     WebPSafeFree(huffman_codes);
   }
   }
-  WebPSafeFree(histogram_symbols);
+  WebPSafeFree(histogram_argb);
   VP8LBitWriterWipeOut(&bw_best);
   VP8LBitWriterWipeOut(&bw_best);
   return (pic->error_code == VP8_ENC_OK);
   return (pic->error_code == VP8_ENC_OK);
 }
 }
@@ -1064,54 +1080,60 @@ static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
   VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
   VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
 }
 }
 
 
-static int ApplyPredictFilter(const VP8LEncoder* const enc, int width,
-                              int height, int quality, int low_effort,
+static int ApplyPredictFilter(VP8LEncoder* const enc, int width, int height,
+                              int quality, int low_effort,
                               int used_subtract_green, VP8LBitWriter* const bw,
                               int used_subtract_green, VP8LBitWriter* const bw,
                               int percent_range, int* const percent) {
                               int percent_range, int* const percent) {
-  const int pred_bits = enc->transform_bits_;
-  const int transform_width = VP8LSubSampleSize(width, pred_bits);
-  const int transform_height = VP8LSubSampleSize(height, pred_bits);
-  // we disable near-lossless quantization if palette is used.
+  int best_bits;
   const int near_lossless_strength =
   const int near_lossless_strength =
       enc->use_palette_ ? 100 : enc->config_->near_lossless;
       enc->use_palette_ ? 100 : enc->config_->near_lossless;
-
-  if (!VP8LResidualImage(
-          width, height, pred_bits, low_effort, enc->argb_, enc->argb_scratch_,
-          enc->transform_data_, near_lossless_strength, enc->config_->exact,
-          used_subtract_green, enc->pic_, percent_range / 2, percent)) {
+  const int max_bits = ClampBits(width, height, enc->predictor_transform_bits_,
+                                 MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS,
+                                 MAX_PREDICTOR_IMAGE_SIZE);
+  const int min_bits = ClampBits(
+      width, height,
+      max_bits - 2 * (enc->config_->method > 4 ? enc->config_->method - 4 : 0),
+      MIN_TRANSFORM_BITS, MAX_TRANSFORM_BITS, MAX_PREDICTOR_IMAGE_SIZE);
+
+  if (!VP8LResidualImage(width, height, min_bits, max_bits, low_effort,
+                         enc->argb_, enc->argb_scratch_, enc->transform_data_,
+                         near_lossless_strength, enc->config_->exact,
+                         used_subtract_green, enc->pic_, percent_range / 2,
+                         percent, &best_bits)) {
     return 0;
     return 0;
   }
   }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
-  assert(pred_bits >= 2);
-  VP8LPutBits(bw, pred_bits - 2, 3);
+  assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
+  VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
+  enc->predictor_transform_bits_ = best_bits;
   return EncodeImageNoHuffman(
   return EncodeImageNoHuffman(
-      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
+      VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
       quality, low_effort, enc->pic_, percent_range - percent_range / 2,
       quality, low_effort, enc->pic_, percent_range - percent_range / 2,
       percent);
       percent);
 }
 }
 
 
-static int ApplyCrossColorFilter(const VP8LEncoder* const enc, int width,
-                                 int height, int quality, int low_effort,
+static int ApplyCrossColorFilter(VP8LEncoder* const enc, int width, int height,
+                                 int quality, int low_effort,
                                  VP8LBitWriter* const bw, int percent_range,
                                  VP8LBitWriter* const bw, int percent_range,
                                  int* const percent) {
                                  int* const percent) {
-  const int ccolor_transform_bits = enc->transform_bits_;
-  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
-  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
+  const int min_bits = enc->cross_color_transform_bits_;
+  int best_bits;
 
 
-  if (!VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
-                               enc->argb_, enc->transform_data_, enc->pic_,
-                               percent_range / 2, percent)) {
+  if (!VP8LColorSpaceTransform(width, height, min_bits, quality, enc->argb_,
+                               enc->transform_data_, enc->pic_,
+                               percent_range / 2, percent, &best_bits)) {
     return 0;
     return 0;
   }
   }
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
-  assert(ccolor_transform_bits >= 2);
-  VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
+  assert(best_bits >= MIN_TRANSFORM_BITS && best_bits <= MAX_TRANSFORM_BITS);
+  VP8LPutBits(bw, best_bits - MIN_TRANSFORM_BITS, NUM_TRANSFORM_BITS);
+  enc->cross_color_transform_bits_ = best_bits;
   return EncodeImageNoHuffman(
   return EncodeImageNoHuffman(
-      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0], transform_width, transform_height,
+      bw, enc->transform_data_, &enc->hash_chain_, &enc->refs_[0],
+      VP8LSubSampleSize(width, best_bits), VP8LSubSampleSize(height, best_bits),
       quality, low_effort, enc->pic_, percent_range - percent_range / 2,
       quality, low_effort, enc->pic_, percent_range - percent_range / 2,
       percent);
       percent);
 }
 }
@@ -1199,8 +1221,8 @@ static int AllocateTransformBuffer(VP8LEncoder* const enc, int width,
                         : 0;
                         : 0;
   const uint64_t transform_data_size =
   const uint64_t transform_data_size =
       (enc->use_predict_ || enc->use_cross_color_)
       (enc->use_predict_ || enc->use_cross_color_)
-          ? (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
-                VP8LSubSampleSize(height, enc->transform_bits_)
+          ? (uint64_t)VP8LSubSampleSize(width, MIN_TRANSFORM_BITS) *
+                VP8LSubSampleSize(height, MIN_TRANSFORM_BITS)
           : 0;
           : 0;
   const uint64_t max_alignment_in_words =
   const uint64_t max_alignment_in_words =
       (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
       (WEBP_ALIGN_CST + sizeof(uint32_t) - 1) / sizeof(uint32_t);
@@ -1374,13 +1396,11 @@ static int ApplyPalette(const uint32_t* src, uint32_t src_stride, uint32_t* dst,
 #undef APPLY_PALETTE_GREEDY_MAX
 #undef APPLY_PALETTE_GREEDY_MAX
 
 
 // Note: Expects "enc->palette_" to be set properly.
 // Note: Expects "enc->palette_" to be set properly.
-static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
+static int MapImageFromPalette(VP8LEncoder* const enc) {
   const WebPPicture* const pic = enc->pic_;
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int width = pic->width;
   const int height = pic->height;
   const int height = pic->height;
   const uint32_t* const palette = enc->palette_;
   const uint32_t* const palette = enc->palette_;
-  const uint32_t* src = in_place ? enc->argb_ : pic->argb;
-  const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
   const int palette_size = enc->palette_size_;
   const int palette_size = enc->palette_size_;
   int xbits;
   int xbits;
 
 
@@ -1395,9 +1415,9 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
   if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
   if (!AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height)) {
     return 0;
     return 0;
   }
   }
-  if (!ApplyPalette(src, src_stride,
-                     enc->argb_, enc->current_width_,
-                     palette, palette_size, width, height, xbits, pic)) {
+  if (!ApplyPalette(pic->argb, pic->argb_stride, enc->argb_,
+                    enc->current_width_, palette, palette_size, width, height,
+                    xbits, pic)) {
     return 0;
     return 0;
   }
   }
   enc->argb_content_ = kEncoderPalette;
   enc->argb_content_ = kEncoderPalette;
@@ -1405,24 +1425,31 @@ static int MapImageFromPalette(VP8LEncoder* const enc, int in_place) {
 }
 }
 
 
 // Save palette_[] to bitstream.
 // Save palette_[] to bitstream.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
-                                       VP8LEncoder* const enc,
-                                       int percent_range, int* const percent) {
+static int EncodePalette(VP8LBitWriter* const bw, int low_effort,
+                         VP8LEncoder* const enc, int percent_range,
+                         int* const percent) {
   int i;
   int i;
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
   const int palette_size = enc->palette_size_;
   const int palette_size = enc->palette_size_;
   const uint32_t* const palette = enc->palette_;
   const uint32_t* const palette = enc->palette_;
+  // If the last element is 0, do not store it and count on automatic palette
+  // 0-filling. This can only happen if there is no pixel packing, hence if
+  // there are strictly more than 16 colors (after 0 is removed).
+  const uint32_t encoded_palette_size =
+      (enc->palette_[palette_size - 1] == 0 && palette_size > 17)
+          ? palette_size - 1
+          : palette_size;
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
   VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
   assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
   assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
-  VP8LPutBits(bw, palette_size - 1, 8);
-  for (i = palette_size - 1; i >= 1; --i) {
+  VP8LPutBits(bw, encoded_palette_size - 1, 8);
+  for (i = encoded_palette_size - 1; i >= 1; --i) {
     tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
     tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
   }
   tmp_palette[0] = palette[0];
   tmp_palette[0] = palette[0];
-  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
-                              &enc->refs_[0], palette_size, 1, /*quality=*/20,
-                              low_effort, enc->pic_, percent_range, percent);
+  return EncodeImageNoHuffman(
+      bw, tmp_palette, &enc->hash_chain_, &enc->refs_[0], encoded_palette_size,
+      1, /*quality=*/20, low_effort, enc->pic_, percent_range, percent);
 }
 }
 
 
 // -----------------------------------------------------------------------------
 // -----------------------------------------------------------------------------
@@ -1493,7 +1520,6 @@ static int EncodeStreamHook(void* input, void* data2) {
 #endif
 #endif
   int hdr_size = 0;
   int hdr_size = 0;
   int data_size = 0;
   int data_size = 0;
-  int use_delta_palette = 0;
   int idx;
   int idx;
   size_t best_size = ~(size_t)0;
   size_t best_size = ~(size_t)0;
   VP8LBitWriter bw_init = *bw, bw_best;
   VP8LBitWriter bw_init = *bw, bw_best;
@@ -1558,45 +1584,43 @@ static int EncodeStreamHook(void* input, void* data2) {
         goto Error;
         goto Error;
       }
       }
       remaining_percent -= percent_range;
       remaining_percent -= percent_range;
-      if (!MapImageFromPalette(enc, use_delta_palette)) goto Error;
+      if (!MapImageFromPalette(enc)) goto Error;
       // If using a color cache, do not have it bigger than the number of
       // If using a color cache, do not have it bigger than the number of
       // colors.
       // colors.
       if (enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
       if (enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
         enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
         enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
       }
       }
     }
     }
-    if (!use_delta_palette) {
-      // In case image is not packed.
-      if (enc->argb_content_ != kEncoderNearLossless &&
-          enc->argb_content_ != kEncoderPalette) {
-        if (!MakeInputImageCopy(enc)) goto Error;
-      }
+    // In case image is not packed.
+    if (enc->argb_content_ != kEncoderNearLossless &&
+        enc->argb_content_ != kEncoderPalette) {
+      if (!MakeInputImageCopy(enc)) goto Error;
+    }
 
 
-      // -----------------------------------------------------------------------
-      // Apply transforms and write transform data.
+    // -------------------------------------------------------------------------
+    // Apply transforms and write transform data.
 
 
-      if (enc->use_subtract_green_) {
-        ApplySubtractGreen(enc, enc->current_width_, height, bw);
-      }
+    if (enc->use_subtract_green_) {
+      ApplySubtractGreen(enc, enc->current_width_, height, bw);
+    }
 
 
-      if (enc->use_predict_) {
-        percent_range = remaining_percent / 3;
-        if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                                low_effort, enc->use_subtract_green_, bw,
-                                percent_range, &percent)) {
-          goto Error;
-        }
-        remaining_percent -= percent_range;
+    if (enc->use_predict_) {
+      percent_range = remaining_percent / 3;
+      if (!ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                              low_effort, enc->use_subtract_green_, bw,
+                              percent_range, &percent)) {
+        goto Error;
       }
       }
+      remaining_percent -= percent_range;
+    }
 
 
-      if (enc->use_cross_color_) {
-        percent_range = remaining_percent / 2;
-        if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
-                                   low_effort, bw, percent_range, &percent)) {
-          goto Error;
-        }
-        remaining_percent -= percent_range;
+    if (enc->use_cross_color_) {
+      percent_range = remaining_percent / 2;
+      if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                 low_effort, bw, percent_range, &percent)) {
+        goto Error;
       }
       }
+      remaining_percent -= percent_range;
     }
     }
 
 
     VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
     VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
@@ -1625,7 +1649,8 @@ static int EncodeStreamHook(void* input, void* data2) {
         if (enc->use_subtract_green_) stats->lossless_features |= 4;
         if (enc->use_subtract_green_) stats->lossless_features |= 4;
         if (enc->use_palette_) stats->lossless_features |= 8;
         if (enc->use_palette_) stats->lossless_features |= 8;
         stats->histogram_bits = enc->histo_bits_;
         stats->histogram_bits = enc->histo_bits_;
-        stats->transform_bits = enc->transform_bits_;
+        stats->transform_bits = enc->predictor_transform_bits_;
+        stats->cross_color_transform_bits = enc->cross_color_transform_bits_;
         stats->cache_bits = enc->cache_bits_;
         stats->cache_bits = enc->cache_bits_;
         stats->palette_size = enc->palette_size_;
         stats->palette_size = enc->palette_size_;
         stats->lossless_size = (int)(best_size - byte_position);
         stats->lossless_size = (int)(best_size - byte_position);
@@ -1735,7 +1760,10 @@ int VP8LEncodeStream(const WebPConfig* const config,
         }
         }
         // Copy the values that were computed for the main encoder.
         // Copy the values that were computed for the main encoder.
         enc_side->histo_bits_ = enc_main->histo_bits_;
         enc_side->histo_bits_ = enc_main->histo_bits_;
-        enc_side->transform_bits_ = enc_main->transform_bits_;
+        enc_side->predictor_transform_bits_ =
+            enc_main->predictor_transform_bits_;
+        enc_side->cross_color_transform_bits_ =
+            enc_main->cross_color_transform_bits_;
         enc_side->palette_size_ = enc_main->palette_size_;
         enc_side->palette_size_ = enc_main->palette_size_;
         memcpy(enc_side->palette_, enc_main->palette_,
         memcpy(enc_side->palette_, enc_main->palette_,
                sizeof(enc_main->palette_));
                sizeof(enc_main->palette_));

+ 14 - 8
thirdparty/libwebp/src/enc/vp8li_enc.h

@@ -34,7 +34,7 @@ extern "C" {
 #endif
 #endif
 
 
 // maximum value of transform_bits_ in VP8LEncoder.
 // maximum value of transform_bits_ in VP8LEncoder.
-#define MAX_TRANSFORM_BITS 6
+#define MAX_TRANSFORM_BITS (MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1)
 
 
 typedef enum {
 typedef enum {
   kEncoderNone = 0,
   kEncoderNone = 0,
@@ -59,7 +59,8 @@ typedef struct {
 
 
   // Encoding parameters derived from quality parameter.
   // Encoding parameters derived from quality parameter.
   int histo_bits_;
   int histo_bits_;
-  int transform_bits_;    // <= MAX_TRANSFORM_BITS.
+  int predictor_transform_bits_;    // <= MAX_TRANSFORM_BITS
+  int cross_color_transform_bits_;  // <= MAX_TRANSFORM_BITS
   int cache_bits_;        // If equal to 0, don't use color cache.
   int cache_bits_;        // If equal to 0, don't use color cache.
 
 
   // Encoding parameters derived from image characteristics.
   // Encoding parameters derived from image characteristics.
@@ -104,16 +105,21 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 
 
 // pic and percent are for progress.
 // pic and percent are for progress.
 // Returns false in case of error (stored in pic->error_code).
 // Returns false in case of error (stored in pic->error_code).
-int VP8LResidualImage(int width, int height, int bits, int low_effort,
-                      uint32_t* const argb, uint32_t* const argb_scratch,
-                      uint32_t* const image, int near_lossless, int exact,
-                      int used_subtract_green, const WebPPicture* const pic,
-                      int percent_range, int* const percent);
+int VP8LResidualImage(int width, int height, int min_bits, int max_bits,
+                      int low_effort, uint32_t* const argb,
+                      uint32_t* const argb_scratch, uint32_t* const image,
+                      int near_lossless, int exact, int used_subtract_green,
+                      const WebPPicture* const pic, int percent_range,
+                      int* const percent, int* const best_bits);
 
 
 int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
 int VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image,
                             uint32_t* const argb, uint32_t* image,
                             const WebPPicture* const pic, int percent_range,
                             const WebPPicture* const pic, int percent_range,
-                            int* const percent);
+                            int* const percent, int* const best_bits);
+
+void VP8LOptimizeSampling(uint32_t* const image, int full_width,
+                          int full_height, int bits, int max_bits,
+                          int* best_bits_out);
 
 
 //------------------------------------------------------------------------------
 //------------------------------------------------------------------------------
 
 

+ 2 - 1
thirdparty/libwebp/src/mux/anim_encode.c

@@ -191,7 +191,8 @@ int WebPAnimEncoderOptionsInitInternal(WebPAnimEncoderOptions* enc_options,
   return 1;
   return 1;
 }
 }
 
 
-// This starting value is more fit to WebPCleanupTransparentAreaLossless().
+// This value is used to match a later call to WebPReplaceTransparentPixels(),
+// making it a no-op for lossless (see WebPEncode()).
 #define TRANSPARENT_COLOR   0x00000000
 #define TRANSPARENT_COLOR   0x00000000
 
 
 static void ClearRectangle(WebPPicture* const picture,
 static void ClearRectangle(WebPPicture* const picture,

+ 1 - 1
thirdparty/libwebp/src/mux/muxi.h

@@ -28,7 +28,7 @@ extern "C" {
 // Defines and constants.
 // Defines and constants.
 
 
 #define MUX_MAJ_VERSION 1
 #define MUX_MAJ_VERSION 1
-#define MUX_MIN_VERSION 4
+#define MUX_MIN_VERSION 5
 #define MUX_REV_VERSION 0
 #define MUX_REV_VERSION 0
 
 
 // Chunk object.
 // Chunk object.

+ 6 - 4
thirdparty/libwebp/src/mux/muxread.c

@@ -223,11 +223,13 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
   // Note this padding is historical and differs from demux.c which does not
   // Note this padding is historical and differs from demux.c which does not
   // pad the file size.
   // pad the file size.
   riff_size = SizeWithPadding(riff_size);
   riff_size = SizeWithPadding(riff_size);
-  if (riff_size < CHUNK_HEADER_SIZE) goto Err;
+  // Make sure the whole RIFF header is available.
+  if (riff_size < RIFF_HEADER_SIZE) goto Err;
   if (riff_size > size) goto Err;
   if (riff_size > size) goto Err;
-  // There's no point in reading past the end of the RIFF chunk.
-  if (size > riff_size + CHUNK_HEADER_SIZE) {
-    size = riff_size + CHUNK_HEADER_SIZE;
+  // There's no point in reading past the end of the RIFF chunk. Note riff_size
+  // includes CHUNK_HEADER_SIZE after SizeWithPadding().
+  if (size > riff_size) {
+    size = riff_size;
   }
   }
 
 
   end = data + size;
   end = data + size;

+ 2 - 1
thirdparty/libwebp/src/utils/bit_reader_utils.c

@@ -124,7 +124,8 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits,
 
 
 #if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
 #if defined(__arm__) || defined(_M_ARM) || WEBP_AARCH64 || \
     defined(__i386__) || defined(_M_IX86) || \
     defined(__i386__) || defined(_M_IX86) || \
-    defined(__x86_64__) || defined(_M_X64)
+    defined(__x86_64__) || defined(_M_X64) || \
+    defined(__wasm__)
 #define VP8L_USE_FAST_LOAD
 #define VP8L_USE_FAST_LOAD
 #endif
 #endif
 
 

+ 2 - 0
thirdparty/libwebp/src/utils/bit_reader_utils.h

@@ -69,6 +69,8 @@ extern "C" {
 #define BITS 56
 #define BITS 56
 #elif defined(__mips__)                        // MIPS
 #elif defined(__mips__)                        // MIPS
 #define BITS 24
 #define BITS 24
+#elif defined(__wasm__)                        // WASM
+#define BITS 56
 #else                                          // reasonable default
 #else                                          // reasonable default
 #define BITS 24
 #define BITS 24
 #endif
 #endif

+ 13 - 2
thirdparty/libwebp/src/utils/palette.c

@@ -191,6 +191,12 @@ static void PaletteSortMinimizeDeltas(const uint32_t* const palette_sorted,
   // Find greedily always the closest color of the predicted color to minimize
   // Find greedily always the closest color of the predicted color to minimize
   // deltas in the palette. This reduces storage needs since the
   // deltas in the palette. This reduces storage needs since the
   // palette is stored with delta encoding.
   // palette is stored with delta encoding.
+  if (num_colors > 17) {
+    if (palette[0] == 0) {
+      --num_colors;
+      SwapColor(&palette[num_colors], &palette[0]);
+    }
+  }
   for (i = 0; i < num_colors; ++i) {
   for (i = 0; i < num_colors; ++i) {
     int best_ix = i;
     int best_ix = i;
     uint32_t best_score = ~0U;
     uint32_t best_score = ~0U;
@@ -384,8 +390,13 @@ int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
                 uint32_t* const palette) {
                 uint32_t* const palette) {
   switch (method) {
   switch (method) {
     case kSortedDefault:
     case kSortedDefault:
-      // Nothing to do, we have already sorted the palette.
-      memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+      if (palette_sorted[0] == 0 && num_colors > 17) {
+        memcpy(palette, palette_sorted + 1,
+               (num_colors - 1) * sizeof(*palette_sorted));
+        palette[num_colors - 1] = 0;
+      } else {
+        memcpy(palette, palette_sorted, num_colors * sizeof(*palette));
+      }
       return 1;
       return 1;
     case kMinimizeDelta:
     case kMinimizeDelta:
       PaletteSortMinimizeDeltas(palette_sorted, num_colors, palette);
       PaletteSortMinimizeDeltas(palette_sorted, num_colors, palette);

+ 2 - 0
thirdparty/libwebp/src/utils/palette.h

@@ -53,6 +53,8 @@ int GetColorPalette(const struct WebPPicture* const pic,
 // Sorts the palette according to the criterion defined by 'method'.
 // Sorts the palette according to the criterion defined by 'method'.
 // 'palette_sorted' is the input palette sorted lexicographically, as done in
 // 'palette_sorted' is the input palette sorted lexicographically, as done in
 // PrepareMapToPalette. Returns 0 on memory allocation error.
 // PrepareMapToPalette. Returns 0 on memory allocation error.
+// For kSortedDefault and kMinimizeDelta methods, 0 (if present) is set as the
+// last element to optimize later storage.
 int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
 int PaletteSort(PaletteSorting method, const struct WebPPicture* const pic,
                 const uint32_t* const palette_sorted, uint32_t num_colors,
                 const uint32_t* const palette_sorted, uint32_t num_colors,
                 uint32_t* const palette);
                 uint32_t* const palette);

+ 5 - 4
thirdparty/libwebp/src/webp/encode.h

@@ -20,7 +20,7 @@
 extern "C" {
 extern "C" {
 #endif
 #endif
 
 
-#define WEBP_ENCODER_ABI_VERSION 0x020f    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x0210  // MAJOR(8b) + MINOR(8b)
 
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
 // the types are left here for reference.
@@ -145,7 +145,7 @@ struct WebPConfig {
                           // RGB information for better compression. The default
                           // RGB information for better compression. The default
                           // value is 0.
                           // value is 0.
 
 
-  int use_delta_palette;  // reserved for future lossless feature
+  int use_delta_palette;  // reserved
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
   int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion
 
 
   int qmin;               // minimum permissible quality factor
   int qmin;               // minimum permissible quality factor
@@ -224,14 +224,15 @@ struct WebPAuxStats {
   uint32_t lossless_features;  // bit0:predictor bit1:cross-color transform
   uint32_t lossless_features;  // bit0:predictor bit1:cross-color transform
                                // bit2:subtract-green bit3:color indexing
                                // bit2:subtract-green bit3:color indexing
   int histogram_bits;          // number of precision bits of histogram
   int histogram_bits;          // number of precision bits of histogram
-  int transform_bits;          // precision bits for transform
+  int transform_bits;          // precision bits for predictor transform
   int cache_bits;              // number of bits for color cache lookup
   int cache_bits;              // number of bits for color cache lookup
   int palette_size;            // number of color in palette, if used
   int palette_size;            // number of color in palette, if used
   int lossless_size;           // final lossless size
   int lossless_size;           // final lossless size
   int lossless_hdr_size;       // lossless header (transform, huffman etc) size
   int lossless_hdr_size;       // lossless header (transform, huffman etc) size
   int lossless_data_size;      // lossless image data size
   int lossless_data_size;      // lossless image data size
+  int cross_color_transform_bits;  // precision bits for cross-color transform
 
 
-  uint32_t pad[2];        // padding for later use
+  uint32_t pad[1];  // padding for later use
 };
 };
 
 
 // Signature for output function. Should return true if writing was successful.
 // Signature for output function. Should return true if writing was successful.

+ 6 - 1
thirdparty/libwebp/src/webp/format_constants.h

@@ -46,7 +46,12 @@
 #define CODE_LENGTH_CODES            19
 #define CODE_LENGTH_CODES            19
 
 
 #define MIN_HUFFMAN_BITS             2  // min number of Huffman bits
 #define MIN_HUFFMAN_BITS             2  // min number of Huffman bits
-#define MAX_HUFFMAN_BITS             9  // max number of Huffman bits
+#define NUM_HUFFMAN_BITS             3
+
+// the maximum number of bits defining a transform is
+// MIN_TRANSFORM_BITS + (1 << NUM_TRANSFORM_BITS) - 1
+#define MIN_TRANSFORM_BITS           2
+#define NUM_TRANSFORM_BITS           3
 
 
 #define TRANSFORM_PRESENT            1  // The bit to be written when next data
 #define TRANSFORM_PRESENT            1  // The bit to be written when next data
                                         // to be read is a transform.
                                         // to be read is a transform.

+ 2 - 2
thirdparty/libwebp/src/webp/types.h

@@ -38,11 +38,11 @@ typedef long long int int64_t;
 
 
 #ifndef WEBP_NODISCARD
 #ifndef WEBP_NODISCARD
 #if defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD
 #if defined(WEBP_ENABLE_NODISCARD) && WEBP_ENABLE_NODISCARD
-#if (defined(__cplusplus) && __cplusplus >= 201700L) || \
+#if (defined(__cplusplus) && __cplusplus >= 201703L) || \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
 #define WEBP_NODISCARD [[nodiscard]]
 #define WEBP_NODISCARD [[nodiscard]]
 #else
 #else
-// gcc's __has_attribute does not work for enums.
+// gcc's __attribute__((warn_unused_result)) does not work for enums.
 #if defined(__clang__) && defined(__has_attribute)
 #if defined(__clang__) && defined(__has_attribute)
 #if __has_attribute(warn_unused_result)
 #if __has_attribute(warn_unused_result)
 #define WEBP_NODISCARD __attribute__((warn_unused_result))
 #define WEBP_NODISCARD __attribute__((warn_unused_result))