; reduction.ll

  ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
  ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
  ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
  ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
  ; <4 x float> fadd reduction in splitting form: lanes <2,3> are added onto
  ; lanes <0,1>, then lane 1 onto lane 0, and lane 0 is extracted.
  define fastcc float @reduction_cost_float(<4 x float> %rdx) {
    %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
    %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

  ; Check that we recognize the tree starting at the extractelement as a
  ; reduction.
  ; The label spells out the full function name so that it identifies exactly
  ; this function rather than any name beginning with "reduction_cost".
  ; CHECK-LABEL: reduction_cost_float
  ; CHECK: cost of 9 {{.*}} extractelement
    %r = extractelement <4 x float> %bin.rdx8, i32 0
    ret float %r
  }
  ; <8 x i32> add reduction in splitting form. The number of live lanes is
  ; halved at every step (4, then 2, then 1) and lane 0 is read back out.
  define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
    ; Lanes 4-7 onto lanes 0-3.
    %upper4 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %sum4 = add <8 x i32> %rdx, %upper4

    ; Lanes 2-3 onto lanes 0-1.
    %upper2 = shufflevector <8 x i32> %sum4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %sum2 = add <8 x i32> %sum4, %upper2

    ; Lane 1 onto lane 0.
    %upper1 = shufflevector <8 x i32> %sum2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %sum1 = add <8 x i32> %sum2, %upper1

  ; CHECK-LABEL: reduction_cost_int
  ; CHECK: cost of 23 {{.*}} extractelement
    %total = extractelement <8 x i32> %sum1, i32 0
    ret i32 %total
  }
  ; <4 x float> fadd reduction in pairwise form: each level adds the even
  ; lanes to the odd lanes of the previous level. The extracted lane 0 is then
  ; added to the scalar argument %f1.
  define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
    ; Level 0: lanes <0,2> + lanes <1,3>.
    %even.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %odd.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %pair.l0 = fadd <4 x float> %even.l0, %odd.l0

    ; Level 1: lane 0 + lane 1.
    %even.l1 = shufflevector <4 x float> %pair.l0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %odd.l1 = shufflevector <4 x float> %pair.l0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %pair.l1 = fadd <4 x float> %even.l1, %odd.l1

  ; CHECK-LABEL: pairwise_hadd
  ; CHECK: cost of 11 {{.*}} extractelement
    %lane0 = extractelement <4 x float> %pair.l1, i32 0
    %sum = fadd float %lane0, %f1
    ret float %sum
  }
  ; Same pairwise <4 x float> fadd reduction as @pairwise_hadd, except that the
  ; level-0 fadd takes its operands in the opposite order (odd lanes first).
  ; The extracted lane 0 is then added to the scalar argument %f1.
  define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
    ; Level 0: lanes <1,3> + lanes <0,2> (operands deliberately swapped).
    %even.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %odd.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %pair.l0 = fadd <4 x float> %odd.l0, %even.l0

    ; Level 1: lane 0 + lane 1.
    %even.l1 = shufflevector <4 x float> %pair.l0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %odd.l1 = shufflevector <4 x float> %pair.l0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %pair.l1 = fadd <4 x float> %even.l1, %odd.l1

  ; CHECK-LABEL: pairwise_hadd_assoc
  ; CHECK: cost of 11 {{.*}} extractelement
    %lane0 = extractelement <4 x float> %pair.l1, i32 0
    %sum = fadd float %lane0, %f1
    ret float %sum
  }
  ; Pairwise <4 x float> fadd reduction in which the last level has no shuffle
  ; selecting lane 0: the level-0 sum is used directly as the first fadd
  ; operand. The extracted lane 0 is then added to the scalar argument %f1.
  define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
    ; Level 0: lanes <0,2> + lanes <1,3>.
    %even.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %odd.l0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %pair.l0 = fadd <4 x float> %even.l0, %odd.l0

    ; Level 1: the level-0 sum itself + its lane 1 moved to lane 0.
    %odd.l1 = shufflevector <4 x float> %pair.l0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %pair.l1 = fadd <4 x float> %pair.l0, %odd.l1

  ; CHECK-LABEL: pairwise_hadd_skip_first
  ; CHECK: cost of 11 {{.*}} extractelement
    %lane0 = extractelement <4 x float> %pair.l1, i32 0
    %sum = fadd float %lane0, %f1
    ret float %sum
  }
  ; <2 x double> fadd reduction in splitting form: lane 1 is added onto lane 0
  ; and lane 0 is extracted. %f1 is unused.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
    %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
    %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

  ; SSE3-LABEL: 'no_pairwise_reduction2double'
  ; SSE3: cost of 2 {{.*}} extractelement
  ; AVX-LABEL: 'no_pairwise_reduction2double'
  ; AVX: cost of 2 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction2double'
  ; AVX2: cost of 2 {{.*}} extractelement
    %r = extractelement <2 x double> %bin.rdx, i32 0
    ret double %r
  }
  ; <4 x float> fadd reduction in splitting form: lanes <2,3> are added onto
  ; lanes <0,1>, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is
  ; unused.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
    %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
    %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

  ; SSE3-LABEL: 'no_pairwise_reduction4float'
  ; SSE3: cost of 4 {{.*}} extractelement
  ; AVX-LABEL: 'no_pairwise_reduction4float'
  ; AVX: cost of 3 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction4float'
  ; AVX2: cost of 3 {{.*}} extractelement
    %r = extractelement <4 x float> %bin.rdx8, i32 0
    ret float %r
  }
  ; <4 x double> fadd reduction in splitting form: lanes <2,3> are added onto
  ; lanes <0,1>, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is
  ; unused. Only the two AVX-based run lines have expectations here.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
    %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
    %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

  ; AVX-LABEL: 'no_pairwise_reduction4double'
  ; AVX: cost of 3 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction4double'
  ; AVX2: cost of 3 {{.*}} extractelement
    %r = extractelement <4 x double> %bin.rdx8, i32 0
    ret double %r
  }
  ; <8 x float> fadd reduction in splitting form: lanes <4..7> are added onto
  ; lanes <0..3>, then lanes <2,3> onto <0,1>, then lane 1 onto lane 0, and
  ; lane 0 is extracted. %f1 is unused. Only the two AVX-based run lines have
  ; expectations here.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
    %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
    %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
    %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

  ; AVX-LABEL: 'no_pairwise_reduction8float'
  ; AVX: cost of 4 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction8float'
  ; AVX2: cost of 4 {{.*}} extractelement
    %r = extractelement <8 x float> %bin.rdx8, i32 0
    ret float %r
  }
  ; <2 x i64> add reduction in splitting form: lane 1 is added onto lane 0 and
  ; lane 0 is extracted. %f1 is unused.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
    %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

  ; SSE3-LABEL: 'no_pairwise_reduction2i64'
  ; SSE3: cost of 2 {{.*}} extractelement
  ; AVX-LABEL: 'no_pairwise_reduction2i64'
  ; AVX: cost of 1 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction2i64'
  ; AVX2: cost of 1 {{.*}} extractelement
    %r = extractelement <2 x i64> %bin.rdx, i32 0
    ret i64 %r
  }
  ; <4 x i32> add reduction in splitting form: lanes <2,3> are added onto lanes
  ; <0,1>, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is unused.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
    %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
    %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

  ; SSE3-LABEL: 'no_pairwise_reduction4i32'
  ; SSE3: cost of 3 {{.*}} extractelement
  ; AVX-LABEL: 'no_pairwise_reduction4i32'
  ; AVX: cost of 3 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction4i32'
  ; AVX2: cost of 3 {{.*}} extractelement
    %r = extractelement <4 x i32> %bin.rdx8, i32 0
    ret i32 %r
  }
  ; <4 x i64> add reduction in splitting form: lanes <2,3> are added onto lanes
  ; <0,1>, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is unused.
  ; Only the two AVX-based run lines have expectations here.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
    %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
    %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

  ; AVX-LABEL: 'no_pairwise_reduction4i64'
  ; AVX: cost of 3 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction4i64'
  ; AVX2: cost of 3 {{.*}} extractelement
    %r = extractelement <4 x i64> %bin.rdx8, i32 0
    ret i64 %r
  }
  ; <8 x i16> add reduction in splitting form: lanes <4..7> are added onto
  ; lanes <0..3>, then lanes <2,3> onto <0,1>, then lane 1 onto lane 0, and
  ; lane 0 is extracted. %f1 is unused.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
    %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
    %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
    %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

  ; SSE3-LABEL: 'no_pairwise_reduction8i16'
  ; SSE3: cost of 4 {{.*}} extractelement
  ; AVX-LABEL: 'no_pairwise_reduction8i16'
  ; AVX: cost of 4 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction8i16'
  ; AVX2: cost of 4 {{.*}} extractelement
    %r = extractelement <8 x i16> %bin.rdx8, i32 0
    ret i16 %r
  }
  ; <8 x i32> add reduction in splitting form: lanes <4..7> are added onto
  ; lanes <0..3>, then lanes <2,3> onto <0,1>, then lane 1 onto lane 0, and
  ; lane 0 is extracted. %f1 is unused. Only the two AVX-based run lines have
  ; expectations here.
  ; The label directives confine each cost check to this function's section of
  ; the analysis output, so it cannot match an extractelement elsewhere.
  define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
    %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
    %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
    %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

  ; AVX-LABEL: 'no_pairwise_reduction8i32'
  ; AVX: cost of 5 {{.*}} extractelement
  ; AVX2-LABEL: 'no_pairwise_reduction8i32'
  ; AVX2: cost of 5 {{.*}} extractelement
    %r = extractelement <8 x i32> %bin.rdx8, i32 0
    ret i32 %r
  }
  ; <2 x double> fadd reduction in pairwise form: lane 0 and lane 1 are each
  ; shuffled into lane 0 of their own vector, the two are added, and lane 0 is
  ; extracted. %f1 is unused.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
    %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
    %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
    %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; SSE3-LABEL: 'pairwise_reduction2double'
  ; SSE3: cost of 2 {{.*}} extractelement
  ; AVX-LABEL: 'pairwise_reduction2double'
  ; AVX: cost of 2 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction2double'
  ; AVX2: cost of 2 {{.*}} extractelement
    %r = extractelement <2 x double> %bin.rdx8, i32 0
    ret double %r
  }
  ; <4 x float> fadd reduction in pairwise form: level 0 adds lanes <0,2> to
  ; lanes <1,3>, level 1 adds lane 0 to lane 1 of that sum, and lane 0 is
  ; extracted. %f1 is unused.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
    %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; SSE3-LABEL: 'pairwise_reduction4float'
  ; SSE3: cost of 4 {{.*}} extractelement
  ; AVX-LABEL: 'pairwise_reduction4float'
  ; AVX: cost of 4 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction4float'
  ; AVX2: cost of 4 {{.*}} extractelement
    %r = extractelement <4 x float> %bin.rdx8, i32 0
    ret float %r
  }
  ; <4 x double> fadd reduction in pairwise form: level 0 adds lanes <0,2> to
  ; lanes <1,3>, level 1 adds lane 0 to lane 1 of that sum, and lane 0 is
  ; extracted. %f1 is unused. Only the two AVX-based run lines have
  ; expectations here.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
    %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; AVX-LABEL: 'pairwise_reduction4double'
  ; AVX: cost of 5 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction4double'
  ; AVX2: cost of 5 {{.*}} extractelement
    %r = extractelement <4 x double> %bin.rdx8, i32 0
    ret double %r
  }
  ; <8 x float> fadd reduction in pairwise form with three levels: lanes
  ; <0,2,4,6> + <1,3,5,7>, then lanes <0,2> + <1,3> of that sum, then lane 0 +
  ; lane 1, after which lane 0 is extracted. %f1 is unused. Only the two
  ; AVX-based run lines have expectations here.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
    %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
    %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

  ; AVX-LABEL: 'pairwise_reduction8float'
  ; AVX: cost of 7 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction8float'
  ; AVX2: cost of 7 {{.*}} extractelement
    %r = extractelement <8 x float> %bin.rdx9, i32 0
    ret float %r
  }
  ; <2 x i64> add reduction in pairwise form: lane 0 and lane 1 are each
  ; shuffled into lane 0 of their own vector, the two are added, and lane 0 is
  ; extracted. %f1 is unused.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
    %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
    %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
    %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; SSE3-LABEL: 'pairwise_reduction2i64'
  ; SSE3: cost of 2 {{.*}} extractelement
  ; AVX-LABEL: 'pairwise_reduction2i64'
  ; AVX: cost of 1 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction2i64'
  ; AVX2: cost of 1 {{.*}} extractelement
    %r = extractelement <2 x i64> %bin.rdx8, i32 0
    ret i64 %r
  }
  ; <4 x i32> add reduction in pairwise form: level 0 adds lanes <0,2> to lanes
  ; <1,3>, level 1 adds lane 0 to lane 1 of that sum, and lane 0 is extracted.
  ; %f1 is unused.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
    %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; SSE3-LABEL: 'pairwise_reduction4i32'
  ; SSE3: cost of 3 {{.*}} extractelement
  ; AVX-LABEL: 'pairwise_reduction4i32'
  ; AVX: cost of 3 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction4i32'
  ; AVX2: cost of 3 {{.*}} extractelement
    %r = extractelement <4 x i32> %bin.rdx8, i32 0
    ret i32 %r
  }
  ; <4 x i64> add reduction in pairwise form: level 0 adds lanes <0,2> to lanes
  ; <1,3>, level 1 adds lane 0 to lane 1 of that sum, and lane 0 is extracted.
  ; %f1 is unused. Only the two AVX-based run lines have expectations here.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
    %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

  ; AVX-LABEL: 'pairwise_reduction4i64'
  ; AVX: cost of 5 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction4i64'
  ; AVX2: cost of 5 {{.*}} extractelement
    %r = extractelement <4 x i64> %bin.rdx8, i32 0
    ret i64 %r
  }
  ; <8 x i16> add reduction in pairwise form with three levels: lanes
  ; <0,2,4,6> + <1,3,5,7>, then lanes <0,2> + <1,3> of that sum, then lane 0 +
  ; lane 1, after which lane 0 is extracted. %f1 is unused.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
    %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
    %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

  ; SSE3-LABEL: 'pairwise_reduction8i16'
  ; SSE3: cost of 5 {{.*}} extractelement
  ; AVX-LABEL: 'pairwise_reduction8i16'
  ; AVX: cost of 5 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction8i16'
  ; AVX2: cost of 5 {{.*}} extractelement
    %r = extractelement <8 x i16> %bin.rdx9, i32 0
    ret i16 %r
  }
  ; <8 x i32> add reduction in pairwise form with three levels: lanes
  ; <0,2,4,6> + <1,3,5,7>, then lanes <0,2> + <1,3> of that sum, then lane 0 +
  ; lane 1, after which lane 0 is extracted. %f1 is unused. Only the two
  ; AVX-based run lines have expectations here.
  ; The quoted label directives confine each cost check to this function's
  ; section of the analysis output; the quotes keep the label from matching
  ; inside a longer function name that ends with the same text.
  define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
    %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
    %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
    %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

  ; AVX-LABEL: 'pairwise_reduction8i32'
  ; AVX: cost of 5 {{.*}} extractelement
  ; AVX2-LABEL: 'pairwise_reduction8i32'
  ; AVX2: cost of 5 {{.*}} extractelement
    %r = extractelement <8 x i32> %bin.rdx9, i32 0
    ret i32 %r
  }