align.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. /* Copyright (C) 2013-2014 Povilas Kanapickas <[email protected]>
  2. Distributed under the Boost Software License, Version 1.0.
  3. (See accompanying file LICENSE_1_0.txt or copy at
  4. http://www.boost.org/LICENSE_1_0.txt)
  5. */
  6. #ifndef LIBSIMDPP_SIMDPP_CORE_ALIGN_H
  7. #define LIBSIMDPP_SIMDPP_CORE_ALIGN_H
  8. #ifndef LIBSIMDPP_SIMD_H
  9. #error "This file must be included through simd.h"
  10. #endif
  11. #include <simdpp/types.h>
  12. #include <simdpp/detail/insn/align.h>
  13. #include <simdpp/detail/get_expr.h>
  14. namespace simdpp {
  15. namespace SIMDPP_ARCH_NAMESPACE {
  16. /** Extracts an int8x16 vector from two concatenated int8x16 vectors
  17. @code
  18. shift: pos:| 0 1 . 14 15 |
  19. 0 r = [ l0 l1 . l14 l15 ]
  20. 1 r = [ l1 l2 . l15 u0 ]
  21. 2 r = [ l2 l3 . u0 u1 ]
  22. ... .. .. .. ... .. ..
  23. 15 r = [ l15 u0 . u13 u14 ]
  24. 16 r = [ u0 u1 . u14 u15 ]
  25. @endcode
  26. @par 128-bit version:
  27. @icost{SSE2-SSE3, 3}
  28. @par 256-bit version:
  29. The lower and higher 128-bit halves are processed as if a 128-bit instruction
  30. were applied to each of them separately.
  31. @icost{SSE2-SSE3, 6}
  32. @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
  33. */
  34. template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
  35. typename detail::get_expr2_nomask<V1, V2>::empty
  36. align16(const any_vec8<N,V1>& lower,
  37. const any_vec8<N,V2>& upper)
  38. {
  39. static_assert(shift <= 16, "Shift out of bounds");
  40. if (shift == 0) return lower.wrapped().eval();
  41. if (shift == 16) return upper.wrapped().eval();
  42. typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
  43. qlower = lower.wrapped().eval();
  44. qupper = upper.wrapped().eval();
  45. return detail::insn::i_align16<shift>(qlower, qupper);
  46. }
  47. /** Extracts an int16x8 vector from two concatenated int16x8 vectors
  48. @code
  49. shift: pos:| 0 1 . 6 7 |
  50. 0 r = [ l0 l1 . l6 l7 ]
  51. 1 r = [ l1 l2 . l7 u0 ]
  52. 2 r = [ l2 l3 . u0 u1 ]
  53. ... .. .. .. ... .. ..
  54. 7 r = [ l7 u0 . u5 u6 ]
  55. 8 r = [ u0 u1 . u6 u7 ]
  56. @endcode
  57. @par 128-bit version:
  58. @icost{SSE2-SSE3, 3}
  59. @par 256-bit version:
  60. @icost{SSE2-SSE3, 6}
  61. @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
  62. All 128-bit sub-vectors are processed as if a 128-bit instruction
  63. were applied to each of them separately.
  64. */
  65. template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
  66. typename detail::get_expr2_nomask<V1, V2>::empty
  67. align8(const any_vec16<N,V1>& lower,
  68. const any_vec16<N,V2>& upper)
  69. {
  70. static_assert(shift <= 8, "Shift out of bounds");
  71. if (shift == 0) return lower.wrapped().eval();
  72. if (shift == 8) return upper.wrapped().eval();
  73. typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
  74. qlower = lower.wrapped().eval();
  75. qupper = upper.wrapped().eval();
  76. return detail::insn::i_align8<shift>(qlower, qupper);
  77. }
  78. /** Extracts an int32x4 vector from two concatenated int32x4 vectors
  79. @code
  80. shift: pos:| 0 1 2 3 |
  81. 0 r = [ l0 l1 l2 l3 ]
  82. 1 r = [ l1 l2 l3 u0 ]
  83. 2 r = [ l2 l3 u0 u1 ]
  84. 3 r = [ l3 u0 u1 u2 ]
  85. 4 r = [ u0 u1 u2 u3 ]
  86. @endcode
  87. @par int32
  88. @par 128-bit version:
  89. @icost{SSE2-SSE3, 3}
  90. @par 256-bit version:
  91. The lower and higher 128-bit halves are processed as if a 128-bit instruction
  92. were applied to each of them separately.
  93. @icost{SSE2-SSE3, 6}
  94. @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
  95. @par float32
  96. @par 128-bit version:
  97. @icost{SSE2-SSE3, 3}
  98. @par 256-bit version:
  99. The lower and higher 128-bit halves are processed as if a 128-bit instruction
  100. were applied to each of them separately.
  101. @icost{SSE2-SSE3, 6}
  102. @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
  103. */
  104. template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
  105. typename detail::get_expr2_nomask<V1, V2>::empty
  106. align4(const any_vec32<N,V1>& lower,
  107. const any_vec32<N,V2>& upper)
  108. {
  109. static_assert(shift <= 4, "Shift out of bounds");
  110. if (shift == 0) return lower.wrapped().eval();
  111. if (shift == 4) return upper.wrapped().eval();
  112. typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
  113. qlower = lower.wrapped().eval();
  114. qupper = upper.wrapped().eval();
  115. return detail::insn::i_align4<shift>(qlower, qupper);
  116. }
  117. /** Extracts an int64x2 vector from two concatenated int64x2 vectors
  118. @code
  119. shift: pos:| 0 1 |
  120. 0 r = [ l0 l1 ]
  121. 1 r = [ l1 u0 ]
  122. 2 r = [ u0 u1 ]
  123. @endcode
  124. @par int64
  125. @par 128-bit version:
  126. @icost{SSE2-SSE3, 3}
  127. @par 256-bit version:
  128. The lower and higher 128-bit halves are processed as if a 128-bit instruction
  129. were applied to each of them separately.
  130. @icost{SSE2-SSE3, 6}
  131. @icost{SSSE3-AVX, NEON, ALTIVEC, 2}
  132. @par float64
  133. @par 128-bit version:
  134. @icost{SSE2-SSE3, 3}
  135. @par 256-bit version:
  136. The lower and higher 128-bit halves are processed as if a 128-bit instruction
  137. were applied to each of them separately.
  138. @icost{SSE2-SSE3, 6}
  139. @icost{SSSE3-SSE4.1 NEON, ALTIVEC, 2}
  140. */
  141. template<unsigned shift, unsigned N, class V1, class V2> SIMDPP_INL
  142. typename detail::get_expr2_nomask<V1, V2>::empty
  143. align2(const any_vec64<N,V1>& lower,
  144. const any_vec64<N,V2>& upper)
  145. {
  146. static_assert(shift <= 2, "Shift out of bounds");
  147. if (shift == 0) return lower.wrapped().eval();
  148. if (shift == 2) return upper.wrapped().eval();
  149. typename detail::get_expr2_nomask_nosign<V1, V2>::type qlower, qupper;
  150. qlower = lower.wrapped().eval();
  151. qupper = upper.wrapped().eval();
  152. return detail::insn::i_align2<shift>(qlower, qupper);
  153. }
  154. } // namespace SIMDPP_ARCH_NAMESPACE
  155. } // namespace simdpp
  156. #endif