mem_neon.h

/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_MEM_NEON_H_
#define VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}
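
// Usage sketch (illustrative only; `coeff` is a hypothetical buffer, not part
// of this header): the helpers above let transform code operate on int16x8_t
// lanes regardless of whether tran_low_t is 16-bit or 32-bit.
//
//   tran_low_t coeff[8];
//   int16x8_t row = load_tran_low_to_s16q(coeff);  // narrows when highbitdepth
//   row = vaddq_s16(row, vdupq_n_s16(1));          // arbitrary s16 arithmetic
//   store_s16q_to_tran_low(coeff, row);            // widens back when needed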

// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used for functions operating on uint8_t which wish to load or store
// 4 values at a time but which may not be on 4 byte boundaries.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, 4);
}
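
// Usage sketch (illustrative only; `dst` is hypothetical): because the store
// goes through memcpy, the destination does not need 4-byte alignment.
//
//   uint8_t dst[8];
//   uint32_to_mem(dst + 1, 0x04030201u);  // dst + 1 need not be aligned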

// Load 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32 = vdup_n_u32(0);
  if (stride == 4) return vld1_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  a_u32 = vld1_lane_u32(&a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
                                      const uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  if (stride == 4) {
    vst1_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
}
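
// Usage sketch (illustrative only; `src`, `dst` and the strides are
// hypothetical): copy two 4-byte rows between blocks whose strides are not
// multiples of 4.
//
//   const uint8x8_t rows = load_unaligned_u8(src, src_stride);
//   store_unaligned_u8(dst, dst_stride, rows);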

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Store 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  if (stride == 4) {
    vst1q_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}
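
// Usage sketch (illustrative only; names are hypothetical): the q-register
// variants move a 4x4 block of bytes in a single register. When stride == 4
// the block is contiguous, so the helpers take the single 16-byte fast path.
//
//   const uint8x16_t block = load_unaligned_u8q(src, src_stride);
//   store_unaligned_u8q(dst, dst_stride, block);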

// Load 2 sets of 4 bytes when alignment is guaranteed.
static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
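
// Usage sketch (illustrative only; names are hypothetical): the aligned
// variants assume both the pointer and the stride are multiples of 4, which
// the asserts above check in debug builds.
//
//   store_u8(dst, dst_stride, load_u8(src, src_stride));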

#endif  // VPX_DSP_ARM_MEM_NEON_H_