2
0

idct16x16_neon.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. /*
  2. * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_dsp/vpx_dsp_common.h"
  /*
   * Kernels for the 16x16 inverse DCT; they are only declared here and are
   * defined outside this file.
   *
   * As used by the drivers below, each 8-line half of the transform is
   * produced by a pair of calls:
   *   - pass1 handles the even-indexed elements (0, 2, ..., 14) and stores its
   *     stage 6 result in `output`;
   *   - pass2 handles the odd-indexed elements (1, 3, ..., 15), combines them
   *     with the pass1 result (`pass1Output`) and computes stage 7.  With
   *     skip_adding == 0 the result is stored in `output`; with
   *     skip_adding == 1 it is added to the pixels at `dest` instead.
   */

  /* Kernels used for the full (256-coefficient) transform. */
  void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                        int output_stride);
  void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                        int16_t *pass1Output,
                                        int16_t skip_adding, uint8_t *dest,
                                        int dest_stride);

  /* Kernels used for the row stage of the reduced (_10_) transform. */
  void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                       int output_stride);
  void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
                                       int16_t *pass1Output,
                                       int16_t skip_adding, uint8_t *dest,
                                       int dest_stride);
  #if HAVE_NEON_ASM
  /*
   * On ARM NEON the registers d8-d15 are callee-saved.  These two routines
   * (defined outside this file) save them to, and restore them from, a
   * caller-provided buffer; the drivers below pass an int64_t[8] array.
   */
  void vpx_push_neon(int64_t *store);
  void vpx_pop_neon(int64_t *store);
  #endif  // HAVE_NEON_ASM
  /*
   * 16x16 inverse DCT driver for the full-coefficient case.
   *
   * The transform runs as a row stage followed by a column stage.  Each stage
   * is split into two halves of 8 lines, and every half is computed by a
   * pass1/pass2 pair:
   *   - pass1 processes the even elements (0, 2, ..., 14) and leaves its
   *     stage 6 result in the scratch buffer;
   *   - pass2 processes the odd elements (1, 3, ..., 15), combines them with
   *     the pass1 result and computes stage 7.
   * The row stage stores its stage 7 result in an intermediate buffer
   * (skip_adding == 0); the column stage adds its result to the destination
   * pixels (skip_adding == 1).
   *
   * input       - 16x16 block of coefficients (read only).
   * dest        - destination pixels that the result is added to.
   * dest_stride - stride of `dest`, forwarded unchanged to the pass2 kernel.
   */
  void vpx_idct16x16_256_add_neon(const int16_t *input,
                                  uint8_t *dest, int dest_stride) {
  #if HAVE_NEON_ASM
    int64_t neon_save_area[8];
  #endif
    int16_t even_stage[16 * 16] = { 0 };  /* pass1 (stage 6) scratch buffer */
    int16_t row_result[16 * 16] = { 0 };  /* output of the row stage */
    int half;

  #if HAVE_NEON_ASM
    /* d8-d15 are callee-saved: stash them before the kernels run. */
    vpx_push_neon(neon_save_area);
  #endif

    /* Row stage: half 0 is the upper 8 rows, half 1 the lower 8 rows. */
    for (half = 0; half < 2; ++half) {
      const int16_t *const rows = input + half * 8 * 16;

      vpx_idct16x16_256_add_neon_pass1(rows, even_stage, 8);
      /* skip_adding == 0: keep the result in row_result; `dest` is passed
       * along exactly as received. */
      vpx_idct16x16_256_add_neon_pass2(rows + 1, row_result + half * 8,
                                       even_stage, 0, dest, dest_stride);
    }

    /* Column stage: half 0 is the left 8 columns, half 1 the right 8. */
    for (half = 0; half < 2; ++half) {
      int16_t *const cols = row_result + half * 8 * 16;

      vpx_idct16x16_256_add_neon_pass1(cols, even_stage, 8);
      /* skip_adding == 1: add the result to the destination, which is
       * advanced by 8 pixels for the right-hand half. */
      vpx_idct16x16_256_add_neon_pass2(cols + 1, row_result + half * 8,
                                       even_stage, 1, dest + half * 8,
                                       dest_stride);
    }

  #if HAVE_NEON_ASM
    /* Restore d8-d15 for the caller. */
    vpx_pop_neon(neon_save_area);
  #endif
  }
  /*
   * 16x16 inverse DCT driver for the reduced (_10_) case, in which the lower
   * 8 rows of the input are all zero.
   *
   * Only the upper 8 rows go through the row stage, using the _10_ kernels;
   * the lower-row half is skipped.  The intermediate buffer is
   * zero-initialised, so whatever the skipped half would have written simply
   * stays zero.  The column stage then runs over both 8-column halves with
   * the full _256_ kernels and adds the result to the destination.
   * NOTE(review): the _256_ kernels are presumably needed here because the
   * row-stage output is no longer sparse - confirm against the kernels.
   *
   * In every pass1/pass2 pair, pass1 processes the even elements
   * (0, 2, ..., 14) up to stage 6 and pass2 processes the odd elements
   * (1, 3, ..., 15), combines them with the pass1 result and computes stage 7.
   *
   * input       - 16x16 block of coefficients (read only).
   * dest        - destination pixels that the result is added to.
   * dest_stride - stride of `dest`, forwarded unchanged to the pass2 kernel.
   */
  void vpx_idct16x16_10_add_neon(const int16_t *input,
                                 uint8_t *dest, int dest_stride) {
  #if HAVE_NEON_ASM
    int64_t neon_save_area[8];
  #endif
    int16_t even_stage[16 * 16] = { 0 };  /* pass1 (stage 6) scratch buffer */
    int16_t row_result[16 * 16] = { 0 };  /* output of the row stage */
    int half;

  #if HAVE_NEON_ASM
    /* d8-d15 are callee-saved: stash them before the kernels run. */
    vpx_push_neon(neon_save_area);
  #endif

    /* Row stage, upper 8 rows only; the result is kept in row_result
     * (skip_adding == 0). */
    vpx_idct16x16_10_add_neon_pass1(input, even_stage, 8);
    vpx_idct16x16_10_add_neon_pass2(input + 1, row_result, even_stage, 0,
                                    dest, dest_stride);

    /* Column stage: half 0 is the left 8 columns, half 1 the right 8. */
    for (half = 0; half < 2; ++half) {
      int16_t *const cols = row_result + half * 8 * 16;

      vpx_idct16x16_256_add_neon_pass1(cols, even_stage, 8);
      /* skip_adding == 1: add the result to the destination, which is
       * advanced by 8 pixels for the right-hand half. */
      vpx_idct16x16_256_add_neon_pass2(cols + 1, row_result + half * 8,
                                       even_stage, 1, dest + half * 8,
                                       dest_stride);
    }

  #if HAVE_NEON_ASM
    /* Restore d8-d15 for the caller. */
    vpx_pop_neon(neon_save_area);
  #endif
  }