transpose.h 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. /* Copyright (C) 2011-2014 Povilas Kanapickas <[email protected]>
  2. Distributed under the Boost Software License, Version 1.0.
  3. (See accompanying file LICENSE_1_0.txt or copy at
  4. http://www.boost.org/LICENSE_1_0.txt)
  5. */
  6. #ifndef LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
  7. #define LIBSIMDPP_SIMDPP_CORE_TRANSPOSE_H
  8. #ifndef LIBSIMDPP_SIMD_H
  9. #error "This file must be included through simd.h"
  10. #endif
  11. #include <simdpp/types.h>
  12. #include <simdpp/core/make_shuffle_bytes_mask.h>
  13. #include <simdpp/core/bit_and.h>
  14. #include <simdpp/core/shuffle2.h>
  15. #include <simdpp/detail/insn/transpose.h>
  16. #include <simdpp/detail/neon/shuffle.h>
  17. #include <simdpp/detail/null/transpose.h>
  18. namespace simdpp {
  19. namespace SIMDPP_ARCH_NAMESPACE {
  20. /** Transposes four 2x2 16-bit matrices within two int16x8 vectors
  21. Mask or expression vectors are not supported.
  22. @code
  23. r0 = [ a0_0; a1_0 ; ... ; a0_6; a1_6 ]
  24. r1 = [ a0_1; a1_1 ; ... ; a0_7; a1_7 ]
  25. @endcode
  26. @par 128-bit version:
  27. @icost{SSE2-AVX2, 4}
  28. @icost{ALTIVEC, 2-4}
  29. @par 256-bit version:
  30. The lower and higher 128-bit halves are processed as if 128-bit instruction
  31. was applied to each of them separately.
  32. @icost{SSE2-AVX, 8}
  33. @icost{AVX2, 4}
  34. @icost{NEON, 2}
  35. @icost{ALTIVEC, 4-6}
  36. */
  37. template<unsigned N, class V> SIMDPP_INL
  38. void transpose2(any_int16<N,V>& a0, any_int16<N,V>& a1)
  39. {
  40. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  41. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  42. uint16<N> qa0 = a0.wrapped();
  43. uint16<N> qa1 = a1.wrapped();
  44. detail::insn::i_transpose2(qa0, qa1);
  45. a0.wrapped() = qa0;
  46. a1.wrapped() = qa1;
  47. }
  48. /** Transposes two 2x2 32-bit matrices within two int32x4 vectors
  49. @code
  50. r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
  51. r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
  52. @endcode
  53. @par 128-bit version:
  54. @icost{SSE2-AVX2, 4}
  55. @icost{ALTIVEC, 2-4}
  56. @par 256-bit version:
  57. The lower and higher 128-bit halves are processed as if 128-bit instruction
  58. was applied to each of them separately.
  59. @icost{SSE2-AVX, 8}
  60. @icost{AVX2, 4}
  61. @icost{NEON, 2}
  62. @icost{ALTIVEC, 4-6}
  63. */
  64. template<unsigned N, class V> SIMDPP_INL
  65. void transpose2(any_int32<N,V>& a0, any_int32<N,V>& a1)
  66. {
  67. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  68. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  69. uint32<N> qa0 = a0.wrapped();
  70. uint32<N> qa1 = a1.wrapped();
  71. detail::insn::i_transpose2(qa0, qa1);
  72. a0.wrapped() = qa0;
  73. a1.wrapped() = qa1;
  74. }
  75. /** Transposes a 2x2 64-bit matrix within two int64x2 vectors
  76. @code
  77. r0 = [ a0_0; a1_0 ]
  78. r1 = [ a0_1; a1_1 ]
  79. @endcode
  80. @par 128-bit version:
  81. @icost{SSE2-AVX2, 2}
  82. @icost{ALTIVEC, 2-4}
  83. @par 256-bit version:
  84. The lower and higher 128-bit halves are processed as if 128-bit instruction
  85. was applied to each of them separately.
  86. @icost{SSE2-AVX, 4}
  87. @icost{AVX2, 2}
  88. @icost{NEON, 2}
  89. @icost{ALTIVEC, 4-6}
  90. */
  91. template<unsigned N, class V> SIMDPP_INL
  92. void transpose2(any_int64<N,V>& a0, any_int64<N,V>& a1)
  93. {
  94. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  95. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  96. uint64<N> qa0 = a0.wrapped();
  97. uint64<N> qa1 = a1.wrapped();
  98. detail::insn::i_transpose2(qa0, qa1);
  99. a0.wrapped() = qa0;
  100. a1.wrapped() = qa1;
  101. }
  102. /** Transposes two 2x2 32-bit matrices within two float32x4 vectors
  103. @code
  104. r0 = [ a0_0; a1_0 ; a0_2; a1_2 ]
  105. r1 = [ a0_1; a1_1 ; a0_3; a1_3 ]
  106. @endcode
  107. @par 128-bit version:
  108. @icost{SSE2-AVX2, 4}
  109. @icost{ALTIVEC, 2-4}
  110. @par 256-bit version:
  111. The lower and higher 128-bit halves are processed as if 128-bit instruction
  112. was applied to each of them separately.
  113. @icost{SSE2-SSE4.1, 8}
  114. @icost{AVX-AVX2, 4}
  115. @icost{ALTIVEC, 4-6}
  116. @icost{NEON, 2}
  117. */
  118. template<unsigned N> SIMDPP_INL
  119. void transpose2(float32<N>& a0, float32<N>& a1)
  120. {
  121. detail::insn::i_transpose2(a0, a1);
  122. }
  123. /** Transposes a 2x2 64-bit matrix within two float64x2 vectors
  124. @code
  125. r0 = [ a0_0; a1_0 ]
  126. r1 = [ a0_1; a1_1 ]
  127. @endcode
  128. @par 128-bit version:
  129. @icost{SSE2-AVX2, 2}
  130. @novec{NEON, ALTIVEC}
  131. @par 256-bit version:
  132. The lower and higher 128-bit halves are processed as if 128-bit instruction
  133. was applied to each of them separately.
  134. @icost{SSE2-SSE4.1, 4}
  135. @icost{AVX-AVX2, 2}
  136. @novec{NEON, ALTIVEC}
  137. */
  138. template<unsigned N> SIMDPP_INL
  139. void transpose2(float64<N>& a0, float64<N>& a1)
  140. {
  141. detail::insn::i_transpose2(a0, a1);
  142. }
  143. /** Transposes four 4x4 8-bit matrices within four int8x16 vectors
  144. Mask or expression vectors are not supported.
  145. @code
  146. r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ...]
  147. r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ...]
  148. r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ...]
  149. r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ...]
  150. @endcode
  151. @par 128-bit version:
  152. @icost{SSE2-AVX2, 16}
  153. @icost{NEON, 4}
  154. @icost{ALTIVEC, 8-12}
  155. @par 256-bit version:
  156. The lower and higher 128-bit halves are processed as if 128-bit instruction
  157. was applied to each of them separately.
  158. @icost{SSE2-AVX, 32}
  159. @icost{AVX2, 16}
  160. @icost{NEON, 8}
  161. @icost{ALTIVEC, 16-20}
  162. */
  163. template<unsigned N, class V> SIMDPP_INL
  164. void transpose4(any_int8<N,V>& a0, any_int8<N,V>& a1,
  165. any_int8<N,V>& a2, any_int8<N,V>& a3)
  166. {
  167. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  168. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  169. uint8<N> qa0, qa1, qa2, qa3;
  170. qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
  171. detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
  172. a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
  173. }
  174. /** Transposes two 4x4 16-bit matrices within four int16x8 vectors
  175. Mask or expression vectors are not supported.
  176. @code
  177. r0 = [ a0_0; a1_0; a2_0; a3_0 ; a0_4; a1_4; a2_4; a3_4 ]
  178. r1 = [ a0_1; a1_1; a2_1; a3_1 ; a0_5; a1_5; a2_5; a3_5 ]
  179. r2 = [ a0_2; a1_2; a2_2; a3_2 ; a0_6; a1_6; a2_6; a3_6 ]
  180. r3 = [ a0_3; a1_3; a2_3; a3_3 ; a0_7; a1_7; a2_7; a3_7 ]
  181. @endcode
  182. @par 128-bit version:
  183. @icost{SSE2-AVX2, 12}
  184. @icost{NEON, 4}
  185. @icost{ALTIVEC, 8-12}
  186. @par 256-bit version:
  187. The lower and higher 128-bit halves are processed as if 128-bit instruction
  188. was applied to each of them separately.
  189. @icost{SSE2-AVX, 24}
  190. @icost{AVX2, 12}
  191. @icost{NEON, 8}
  192. @icost{ALTIVEC, 16-20}
  193. */
  194. template<unsigned N, class V> SIMDPP_INL
  195. void transpose4(any_int16<N,V>& a0, any_int16<N,V>& a1,
  196. any_int16<N,V>& a2, any_int16<N,V>& a3)
  197. {
  198. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  199. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  200. uint16<N> qa0, qa1, qa2, qa3;
  201. qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
  202. detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
  203. a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
  204. }
  205. /** Transposes a 4x4 32-bit matrix within four int32x4 vectors
  206. Mask or expression vectors are not supported.
  207. @code
  208. r0 = [ a0_0; a1_0; a2_0; a3_0 ]
  209. r1 = [ a0_1; a1_1; a2_1; a3_1 ]
  210. r2 = [ a0_2; a1_2; a2_2; a3_2 ]
  211. r3 = [ a0_3; a1_3; a2_3; a3_3 ]
  212. @endcode
  213. @par 128-bit version:
  214. @icost{SSE2-AVX2, 12}
  215. @icost{NEON, 4}
  216. @icost{ALTIVEC, 8-12}
  217. @par 256-bit version:
  218. @icost{SSE2-AVX, 24}
  219. @icost{AVX2, 12}
  220. @icost{NEON, 8}
  221. @icost{ALTIVEC, 16-20}
  222. The lower and higher 128-bit halves are processed as if 128-bit instruction
  223. was applied to each of them separately.
  224. */
  225. template<unsigned N, class V> SIMDPP_INL
  226. void transpose4(any_int32<N,V>& a0, any_int32<N,V>& a1,
  227. any_int32<N,V>& a2, any_int32<N,V>& a3)
  228. {
  229. static_assert(!is_mask<V>::value, "Mask vectors are not supported");
  230. static_assert(is_value_vector<V>::value, "Expression vectors are not supported");
  231. uint32<N> qa0, qa1, qa2, qa3;
  232. qa0 = a0.wrapped(); qa1 = a1.wrapped(); qa2 = a2.wrapped(); qa3 = a3.wrapped();
  233. detail::insn::i_transpose4(qa0, qa1, qa2, qa3);
  234. a0.wrapped() = qa0; a1.wrapped() = qa1; a2.wrapped() = qa2; a3.wrapped() = qa3;
  235. }
  236. /** Transposes a 4x4 32-bit matrix within four float32x4 vectors
  237. @code
  238. r0 = [ a0_0; a1_0; a2_0; a3_0 ]
  239. r1 = [ a0_1; a1_1; a2_1; a3_1 ]
  240. r2 = [ a0_2; a1_2; a2_2; a3_2 ]
  241. r3 = [ a0_3; a1_3; a2_3; a3_3 ]
  242. @endcode
  243. @par 128-bit version:
  244. @icost{SSE2-AVX2, 12}
  245. @icost{NEON, 4}
  246. @icost{ALTIVEC, 8-12}
  247. @par 256-bit version:
  248. @icost{SSE2-SSE4.1, 24}
  249. @icost{AVX-AVX2, 12}
  250. @icost{NEON, 8}
  251. @icost{ALTIVEC, 16-20}
  252. The lower and higher 128-bit halves are processed as if 128-bit instruction
  253. was applied to each of them separately.
  254. */
  255. template<unsigned N> SIMDPP_INL
  256. void transpose4(float32<N>& a0, float32<N>& a1,
  257. float32<N>& a2, float32<N>& a3)
  258. {
  259. detail::insn::i_transpose4(a0, a1, a2, a3);
  260. }
  261. } // namespace SIMDPP_ARCH_NAMESPACE
  262. } // namespace simdpp
  263. #endif