subtract_neon.c 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  /*
   * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   *
   * Use of this source code is governed by a BSD-style license
   * that can be found in the LICENSE file in the root of the source
   * tree. An additional intellectual property rights grant can be found
   * in the file PATENTS. All contributing project authors may
   * be found in the AUTHORS file in the root of the source tree.
   */
  10. #include <arm_neon.h>
  11. #include "./vpx_config.h"
  12. #include "vpx/vpx_integer.h"
  /*
   * Computes diff[c] = src[c] - pred[c] for the first `cols` elements of one
   * row. Exactly `cols` elements are read from `src`/`pred` and written to
   * `diff`; nothing beyond that is touched, whatever the width.
   */
  static void subtract_row_neon(int cols, int16_t *diff, const uint8_t *src,
                                const uint8_t *pred) {
    int c = 0;

    /* Main loop: 16 pixels per iteration. */
    for (; cols - c >= 16; c += 16) {
      const uint8x16_t v_src = vld1q_u8(&src[c]);
      const uint8x16_t v_pred = vld1q_u8(&pred[c]);
      /*
       * The widening subtract is unsigned and wraps modulo 2^16, so
       * reinterpreting the lanes as int16 yields the signed difference.
       */
      const uint16x8_t v_diff_lo =
          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
      const uint16x8_t v_diff_hi =
          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
      vst1q_s16(&diff[c], vreinterpretq_s16_u16(v_diff_lo));
      vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi));
    }

    /* At most one 8-pixel chunk can remain after the 16-wide loop. */
    if (cols - c >= 8) {
      const uint8x8_t v_src = vld1_u8(&src[c]);
      const uint8x8_t v_pred = vld1_u8(&pred[c]);
      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
      vst1q_s16(&diff[c], vreinterpretq_s16_u16(v_diff));
      c += 8;
    }

    /* Scalar tail: fewer than 8 pixels left (this is the whole row for 4xN). */
    for (; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
  }

  /*
   * Writes the per-pixel residual of a rows x cols block:
   *   diff[r * diff_stride + c] = src[r * src_stride + c] -
   *                               pred[r * pred_stride + c]
   *
   * rows, cols   block dimensions in pixels; non-positive values write nothing.
   * diff         output residual (int16_t), advanced by diff_stride per row.
   * src, pred    8-bit inputs, advanced by src_stride / pred_stride per row.
   *
   * Strides are counted in elements of the respective buffer. Unlike a
   * width-bucketed dispatch, every width is handled exactly: no element at
   * column index >= cols is read or written.
   */
  void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                               ptrdiff_t diff_stride, const uint8_t *src,
                               ptrdiff_t src_stride, const uint8_t *pred,
                               ptrdiff_t pred_stride) {
    int r;

    for (r = 0; r < rows; ++r) {
      subtract_row_neon(cols, diff, src, pred);
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
  }