; inv_wht_sse2.asm

  ;
  ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  ;
  ; Use of this source code is governed by a BSD-style license
  ; that can be found in the LICENSE file in the root of the source
  ; tree. An additional intellectual property rights grant can be found
  ; in the file PATENTS. All contributing project authors may
  ; be found in the AUTHORS file in the root of the source tree.
  ;
  ; cglobal, INIT_XMM, SWAP, mova, RET and the mN register names used in this
  ; file are not defined here; they are presumably supplied by this include.
  %include "third_party/x86inc/x86inc.asm"

  SECTION .text
  ; REORDER_INPUTS
  ;   Re-labels the vector registers so that values currently addressed as
  ;   m0..m3 = a, c, d, b are addressed as m0..m3 = a, b, c, d afterwards.
  ;   Takes no arguments.  The body is SWAP directives only (x86inc register
  ;   renaming); m0 is left where it is.
  %macro REORDER_INPUTS 0
    SWAP       1, 3                 ; m1 = b, m3 = c
    SWAP       3, 2                 ; m2 = c, m3 = d
  %endmacro
  ; TRANSFORM_COLS
  ;   One pass of the 4-point transform on 16-bit lanes.
  ;     in :  m0 = a, m1 = b, m2 = c, m3 = d
  ;     out:  m0 = a, m1 = b, m2 = c, m3 = d   (updated values)
  ;     clobbers: m4, m5
  ;   The intermediate e = (a - d) >> 1 is formed at 32-bit width so the
  ;   subtraction cannot wrap at 16 bits.  The widening uses punpcklwd, so e
  ;   is derived from the low four words of a and d only.
  %macro TRANSFORM_COLS 0
    ; a += c, then sign-extend the low four words of a into the dwords of m4.
    ; punpcklwd places each source word in the upper half of a dword; the
    ; arithmetic shift by 16 discards whatever m4 held in the lower half.
    paddw      m0, m2
    punpcklwd  m4, m0
    psrad      m4, 16

    ; d -= b, widened the same way into m5.
    psubw      m3, m1
    punpcklwd  m5, m3
    psrad      m5, 16

    ; wide subtract, halve, and narrow back to words with signed saturation
    psubd      m4, m5
    psrad      m4, 1
    packssdw   m4, m4               ; e

    psubw      m5, m4, m1           ; b = e - b
    psubw      m4, m2               ; c = e - c
    paddw      m3, m4               ; d += c
    psubw      m0, m5               ; a -= b

    ; m0 a
    SWAP       2, 4                 ; m2 c
    SWAP       1, 5                 ; m1 b
    ; m3 d
  %endmacro
  ; TRANSPOSE_4X4
  ;   Transposes a 4x4 matrix of 16-bit values.
  ;     in :  row k in the low four words of mk   (written a, b, c, d below
  ;           for m0, m1, m2, m3)
  ;     out:  column k in the low four words of mk
  ;   No registers beyond m0-m3 are used.
  %macro TRANSPOSE_4X4 0
    punpcklwd  m1, m3               ; m1 = b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd  m0, m2               ; m0 = a0 c0 a1 c1 a2 c2 a3 c3
    mova       m2, m0
    punpckhwd  m2, m1               ; m2 = a2 b2 c2 d2 | a3 b3 c3 d3
    punpcklwd  m0, m1               ; m0 = a0 b0 c0 d0 | a1 b1 c1 d1
    ; 0x0e copies the upper 64 bits of the source into the lower 64 bits
    pshufd     m3, m2, 0x0e         ; m3 low = a3 b3 c3 d3
    pshufd     m1, m0, 0x0e         ; m1 low = a1 b1 c1 d1
  %endmacro
  ; TRANSPOSE_4X4_WIDE
  ;   Transposes a 4x4 int16 matrix that is packed two rows per register.
  ;     in :  m0 = row 0 | row 1,  m1 = row 2 | row 3
  ;     out:  column k in the bottom half of mk, k = 0..3
  ;   No registers beyond m0-m3 are used.
  %macro TRANSPOSE_4X4_WIDE 0
    mova       m3, m0
    punpckhwd  m3, m1               ; m3 = rows 1 and 3 interleaved
    punpcklwd  m0, m1               ; m0 = rows 0 and 2 interleaved
    mova       m2, m0
    punpckhwd  m2, m3               ; m2 = column 2 | column 3
    punpcklwd  m0, m3               ; m0 = column 0 | column 1
    ; 0x0e copies the upper 64 bits of the source into the lower 64 bits
    pshufd     m3, m2, 0x0e         ; m3 low = column 3
    pshufd     m1, m0, 0x0e         ; m1 low = column 1
  %endmacro
  ; ADD_STORE_4P_2X src1, src2, tmp1, tmp2, zero
  ;   Adds two rows of 16-bit values to two rows of four bytes in memory and
  ;   writes the bytes back.
  ;     %1 src1 - register number: words added to the 4 bytes at [outputq]
  ;     %2 src2 - register number: words added to the 4 bytes at
  ;               [outputq + strideq]
  ;     %3 tmp1 - scratch register number
  ;     %4 tmp2 - scratch register number
  ;     %5 zero - number of a register the caller has already cleared
  ;   src1/src2 are overwritten with the packed result.  Both rows are loaded
  ;   before either row is stored.
  %macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
    ; first row: widen bytes to words, add, narrow with unsigned saturation
    movd       m%3, [outputq]
    punpcklbw  m%3, m%5
    paddw      m%1, m%3
    packuswb   m%1, m%5

    ; second row, one stride further on
    movd       m%4, [outputq + strideq]
    punpcklbw  m%4, m%5
    paddw      m%2, m%4
    packuswb   m%2, m%5

    movd       [outputq], m%1
    movd       [outputq + strideq], m%2
  %endmacro
  ;-----------------------------------------------------------------------
  ; iwht4x4_16_add(input, output, stride)        SSE2, declared via cglobal
  ;
  ; Runs a 4x4 inverse transform (the name suggests inverse Walsh-Hadamard)
  ; on the 16 coefficients at `input` and adds the result to the 4x4 block
  ; of bytes at `output`.  Each sum is clamped to 0..255 by packuswb.
  ;
  ; In:    input  - 16 coefficients.  With CONFIG_VP9_HIGHBITDEPTH they are
  ;                 read as 32-bit values and narrowed to 16 bits with signed
  ;                 saturation; otherwise they are read as 16-bit values.
  ;                 Accessed with aligned 16-byte loads, so the pointer must
  ;                 be 16-byte aligned.
  ;        output - 4 bytes are read and rewritten on each of 4 rows.
  ;        stride - distance in bytes between consecutive output rows.
  ; Out:   nothing in registers; the result is written through `output`.
  ; Uses:  m0-m6 (7 vector registers requested from cglobal), 3 GPRs.
  ;        outputq is advanced by 2 * stride before returning.
  ;-----------------------------------------------------------------------
  INIT_XMM sse2
  cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
    ; NOTE(review): the C prototype is not visible in this file; stride is
    ; presumably a 32-bit int - confirm.  On x86-64 the upper half of a
    ; register carrying a 32-bit argument is not guaranteed to be extended,
    ; and strideq feeds 64-bit address arithmetic below, so sign-extend it
    ; once here.  On x86-32 this expands to nothing.
    movsxdifnidn strideq, strided

  %if CONFIG_VP9_HIGHBITDEPTH
    ; 32-bit coefficients: each 16-byte pair is packed down to eight words
    mova       m0, [inputq +  0]
    packssdw   m0, [inputq + 16]
    mova       m1, [inputq + 32]
    packssdw   m1, [inputq + 48]
  %else
    ; 16-bit coefficients: two rows per register
    mova       m0, [inputq +  0]
    mova       m1, [inputq + 16]
  %endif

    ; scale every coefficient down by an arithmetic shift of 2
    psraw      m0, 2
    psraw      m1, 2

    ; first pass
    TRANSPOSE_4X4_WIDE
    REORDER_INPUTS
    TRANSFORM_COLS

    ; second pass on the transposed result
    TRANSPOSE_4X4
    REORDER_INPUTS
    TRANSFORM_COLS

    ; add to the destination, two rows at a time; m4 is the zero register
    pxor       m4, m4
    ADD_STORE_4P_2X 0, 1, 5, 6, 4
    lea        outputq, [outputq + 2 * strideq]
    ADD_STORE_4P_2X 2, 3, 5, 6, 4

    RET