; vec_demanded_elts.ll — instcombine demanded-vector-elements tests

  1. ; RUN: opt < %s -instcombine -S | FileCheck %s
  2. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  3. define i16 @test1(float %f) {
  4. entry:
  5. ; CHECK-LABEL: @test1(
  6. ; CHECK: fmul float
  7. ; CHECK-NOT: insertelement {{.*}} 0.00
  8. ; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
  9. ; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
  10. ; CHECK: ret
  11. %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
  12. %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  13. %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  14. %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  15. %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  16. %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  17. %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  18. %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
  19. %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
  20. %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
  21. ret i16 %tmp69
  22. }
  23. define i32 @test2(float %f) {
  24. ; CHECK-LABEL: @test2(
  25. ; CHECK-NOT: insertelement
  26. ; CHECK-NOT: extractelement
  27. ; CHECK: ret
  28. %tmp5 = fmul float %f, %f
  29. %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  30. %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  31. %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  32. %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  33. %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  34. %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  35. ret i32 %tmp21
  36. }
  37. define i64 @test3(float %f, double %d) {
  38. ; CHECK-LABEL: @test3(
  39. ; CHECK-NOT: insertelement {{.*}} 0.00
  40. ; CHECK: ret
  41. entry:
  42. %v00 = insertelement <4 x float> undef, float %f, i32 0
  43. %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  44. %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  45. %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  46. %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  47. %v10 = insertelement <4 x float> undef, float %f, i32 0
  48. %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  49. %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  50. %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  51. %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  52. %v20 = insertelement <4 x float> undef, float %f, i32 0
  53. %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  54. %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  55. %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  56. %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  57. %v30 = insertelement <4 x float> undef, float %f, i32 0
  58. %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  59. %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  60. %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  61. %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  62. %v40 = insertelement <2 x double> undef, double %d, i32 0
  63. %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  64. %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  65. %v50 = insertelement <2 x double> undef, double %d, i32 0
  66. %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  67. %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  68. %v60 = insertelement <2 x double> undef, double %d, i32 0
  69. %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  70. %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  71. %v70 = insertelement <2 x double> undef, double %d, i32 0
  72. %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  73. %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  74. %tmp8 = add i32 %tmp0, %tmp2
  75. %tmp9 = add i32 %tmp4, %tmp6
  76. %tmp10 = add i32 %tmp8, %tmp9
  77. %tmp11 = sext i32 %tmp10 to i64
  78. %tmp12 = add i64 %tmp1, %tmp3
  79. %tmp13 = add i64 %tmp5, %tmp7
  80. %tmp14 = add i64 %tmp12, %tmp13
  81. %tmp15 = add i64 %tmp11, %tmp14
  82. ret i64 %tmp15
  83. }
  84. define void @get_image() nounwind {
  85. ; CHECK-LABEL: @get_image(
  86. ; CHECK-NOT: extractelement
  87. ; CHECK: unreachable
  88. entry:
  89. %0 = call i32 @fgetc(i8* null) nounwind ; <i32> [#uses=1]
  90. %1 = trunc i32 %0 to i8 ; <i8> [#uses=1]
  91. %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1 ; <<100 x i8>> [#uses=1]
  92. %tmp1 = extractelement <100 x i8> %tmp2, i32 0 ; <i8> [#uses=1]
  93. %2 = icmp eq i8 %tmp1, 80 ; <i1> [#uses=1]
  94. br i1 %2, label %bb2, label %bb3
  95. bb2: ; preds = %entry
  96. br label %bb3
  97. bb3: ; preds = %bb2, %entry
  98. unreachable
  99. }
  100. ; PR4340
  101. define void @vac(<4 x float>* nocapture %a) nounwind {
  102. ; CHECK-LABEL: @vac(
  103. ; CHECK-NOT: load
  104. ; CHECK: ret
  105. entry:
  106. %tmp1 = load <4 x float>, <4 x float>* %a ; <<4 x float>> [#uses=1]
  107. %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0 ; <<4 x float>> [#uses=1]
  108. %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
  109. %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
  110. %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
  111. store <4 x float> %vecins8, <4 x float>* %a
  112. ret void
  113. }
  114. declare i32 @fgetc(i8*)
  115. declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
  116. declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
  117. declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
  118. declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
  119. declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
  120. declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
  121. declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
  122. declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
  123. declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
  124. declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
  125. declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
  126. declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
  127. ; <rdar://problem/6945110>
  128. define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
  129. entry:
  130. %tmp = load <4 x i16>, <4 x i16>* %src
  131. %tmp1 = load <8 x i16>, <8 x i16>* %foo
  132. ; CHECK: %tmp2 = shufflevector
  133. %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  134. ; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
  135. ; CHECK-NOT: shufflevector
  136. %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  137. ; CHECK-NEXT: pmovzxwd
  138. %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
  139. ret <4 x i32> %0
  140. }
  141. declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
  142. define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
  143. entry:
  144. ; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
  145. ; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  146. %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  147. %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  148. ret <4 x float> %shuffle9.i
  149. }
  150. define <2 x float> @test_fptrunc(double %f) {
  151. ; CHECK-LABEL: @test_fptrunc(
  152. ; CHECK: insertelement
  153. ; CHECK: insertelement
  154. ; CHECK-NOT: insertelement
  155. %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  156. %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  157. %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  158. %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  159. %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  160. %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  161. ret <2 x float> %ret
  162. }
  163. define <2 x double> @test_fpext(float %f) {
  164. ; CHECK-LABEL: @test_fpext(
  165. ; CHECK: insertelement
  166. ; CHECK: insertelement
  167. ; CHECK-NOT: insertelement
  168. %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  169. %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  170. %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  171. %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  172. %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  173. %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  174. ret <2 x double> %ret
  175. }
  176. define <4 x float> @test_select(float %f, float %g) {
  177. ; CHECK-LABEL: @test_select(
  178. ; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
  179. ; CHECK-NOT: insertelement
  180. ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
  181. ; CHECK-NOT: insertelement
  182. ; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  183. %a0 = insertelement <4 x float> undef, float %f, i32 0
  184. %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  185. %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  186. %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  187. %b0 = insertelement <4 x float> undef, float %g, i32 0
  188. %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  189. %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  190. %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  191. %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  192. ret <4 x float> %ret
  193. }
  194. ; We should optimize these two redundant insertqi into one
  195. ; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
  196. define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
  197. ; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  198. ; CHECK-NOT: insertqi
  199. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  200. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  201. ret <2 x i64> %2
  202. }
  203. ; The result of this insert is the second arg, since the top 64 bits of
  204. ; the result are undefined, and we copy the bottom 64 bits from the
  205. ; second arg
  206. ; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
  207. define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
  208. ; CHECK: ret <2 x i64> %i
  209. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  210. ret <2 x i64> %1
  211. }
  212. ; Test the several types of ranges and ordering that exist for two insertqi
  213. ; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
  214. define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
  215. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  216. ; CHECK: ret <2 x i64> %[[RES]]
  217. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  218. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  219. ret <2 x i64> %2
  220. }
  221. ; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
  222. define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
  223. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  224. ; CHECK: ret <2 x i64> %[[RES]]
  225. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  226. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  227. ret <2 x i64> %2
  228. }
  229. ; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
  230. define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
  231. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
  232. ; CHECK: ret <2 x i64> %[[RES]]
  233. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  234. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  235. ret <2 x i64> %2
  236. }
  237. ; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
  238. define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
  239. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
  240. ; CHECK: ret <2 x i64> %[[RES]]
  241. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  242. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  243. ret <2 x i64> %2
  244. }
  245. ; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
  246. define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
  247. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
  248. ; CHECK: ret <2 x i64> %[[RES]]
  249. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  250. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  251. ret <2 x i64> %2
  252. }
  253. ; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
  254. define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
  255. ; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
  256. ; CHECK: ret <2 x i64> %[[RES]]
  257. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  258. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  259. ret <2 x i64> %2
  260. }
  261. ; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
  262. define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
  263. ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  264. ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  265. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  266. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  267. ret <2 x i64> %2
  268. }
  269. ; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
  270. define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
  271. ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  272. ; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  273. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  274. %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  275. ret <2 x i64> %2
  276. }
  277. ; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
  278. define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
  279. ; CHECK: ret <2 x i64> %i
  280. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
  281. ret <2 x i64> %1
  282. }
  283. ; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
  284. define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
  285. ; CHECK: ret <2 x i64> undef
  286. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
  287. ret <2 x i64> %1
  288. }
  289. ; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
  290. define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
  291. ; CHECK: ret <2 x i64> undef
  292. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
  293. ret <2 x i64> %1
  294. }
  295. ; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
  296. define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
  297. ; CHECK: ret <2 x i64> undef
  298. %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
  299. ret <2 x i64> %1
  300. }
  301. ; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
  302. declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
  303. declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
  304. define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
  305. ; CHECK-LABEL: @test_vpermilvar_ps(
  306. ; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  307. %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  308. ret <4 x float> %a
  309. }
  310. declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
  311. define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
  312. ; CHECK-LABEL: @test_vpermilvar_ps_256(
  313. ; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  314. %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  315. ret <8 x float> %a
  316. }
  317. declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
  318. define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
  319. ; CHECK-LABEL: @test_vpermilvar_pd(
  320. ; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  321. %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
  322. ret <2 x double> %a
  323. }
  324. declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
  325. define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
  326. ; CHECK-LABEL: @test_vpermilvar_pd_256(
  327. ; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  328. %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
  329. ret <4 x double> %a
  330. }
  331. define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
  332. ; CHECK-LABEL: @test_vpermilvar_ps_zero(
  333. ; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  334. %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  335. ret <4 x float> %a
  336. }
  337. define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
  338. ; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
  339. ; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  340. %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  341. ret <8 x float> %a
  342. }
  343. define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
  344. ; CHECK-LABEL: @test_vpermilvar_pd_zero(
  345. ; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  346. %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
  347. ret <2 x double> %a
  348. }
  349. define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
  350. ; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
  351. ; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  352. %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
  353. ret <4 x double> %a
  354. }
  355. define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  356. %S = bitcast i32 1 to i32
  357. %1 = zext i32 %S to i64
  358. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  359. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  360. %4 = bitcast <2 x i64> %3 to <8 x i16>
  361. %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  362. %6 = bitcast <8 x i16> %5 to <4 x i32>
  363. %7 = bitcast <2 x i64> %3 to <4 x i32>
  364. %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  365. %9 = bitcast <4 x i32> %8 to <2 x i64>
  366. %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  367. %11 = bitcast <2 x i64> %10 to <8 x i16>
  368. %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  369. %13 = bitcast <8 x i16> %12 to <4 x i32>
  370. %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  371. %15 = bitcast <4 x i32> %14 to <2 x i64>
  372. %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  373. ret <2 x i64> %16
  374. ; CHECK: test_sse2_1
  375. ; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
  376. }
  377. define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  378. %S = bitcast i32 1 to i32
  379. %1 = zext i32 %S to i64
  380. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  381. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  382. %4 = bitcast <2 x i64> %3 to <8 x i16>
  383. %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  384. %6 = bitcast <16 x i16> %5 to <8 x i32>
  385. %7 = bitcast <2 x i64> %3 to <4 x i32>
  386. %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  387. %9 = bitcast <8 x i32> %8 to <4 x i64>
  388. %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  389. %11 = bitcast <4 x i64> %10 to <16 x i16>
  390. %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  391. %13 = bitcast <16 x i16> %12 to <8 x i32>
  392. %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  393. %15 = bitcast <8 x i32> %14 to <4 x i64>
  394. %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  395. ret <4 x i64> %16
  396. ; CHECK: test_avx2_1
  397. ; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
  398. }
  399. define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  400. %S = bitcast i32 128 to i32
  401. %1 = zext i32 %S to i64
  402. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  403. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  404. %4 = bitcast <2 x i64> %3 to <8 x i16>
  405. %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  406. %6 = bitcast <8 x i16> %5 to <4 x i32>
  407. %7 = bitcast <2 x i64> %3 to <4 x i32>
  408. %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  409. %9 = bitcast <4 x i32> %8 to <2 x i64>
  410. %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  411. %11 = bitcast <2 x i64> %10 to <8 x i16>
  412. %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  413. %13 = bitcast <8 x i16> %12 to <4 x i32>
  414. %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  415. %15 = bitcast <4 x i32> %14 to <2 x i64>
  416. %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  417. ret <2 x i64> %16
  418. ; CHECK: test_sse2_0
  419. ; CHECK: ret <2 x i64> zeroinitializer
  420. }
  421. define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  422. %S = bitcast i32 128 to i32
  423. %1 = zext i32 %S to i64
  424. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  425. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  426. %4 = bitcast <2 x i64> %3 to <8 x i16>
  427. %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  428. %6 = bitcast <16 x i16> %5 to <8 x i32>
  429. %7 = bitcast <2 x i64> %3 to <4 x i32>
  430. %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  431. %9 = bitcast <8 x i32> %8 to <4 x i64>
  432. %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  433. %11 = bitcast <4 x i64> %10 to <16 x i16>
  434. %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  435. %13 = bitcast <16 x i16> %12 to <8 x i32>
  436. %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  437. %15 = bitcast <8 x i32> %14 to <4 x i64>
  438. %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  439. ret <4 x i64> %16
  440. ; CHECK: test_avx2_0
  441. ; CHECK: ret <4 x i64> zeroinitializer
  442. }
  443. define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  444. %S = bitcast i32 1 to i32
  445. %1 = zext i32 %S to i64
  446. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  447. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  448. %4 = bitcast <2 x i64> %3 to <8 x i16>
  449. %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  450. %6 = bitcast <8 x i16> %5 to <4 x i32>
  451. %7 = bitcast <2 x i64> %3 to <4 x i32>
  452. %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  453. %9 = bitcast <4 x i32> %8 to <2 x i64>
  454. %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  455. %11 = bitcast <2 x i64> %10 to <8 x i16>
  456. %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  457. %13 = bitcast <8 x i16> %12 to <4 x i32>
  458. %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  459. %15 = bitcast <4 x i32> %14 to <2 x i64>
  460. %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  461. ret <2 x i64> %16
  462. ; CHECK: test_sse2_psrl_1
  463. ; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
  464. }
  465. define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  466. %S = bitcast i32 1 to i32
  467. %1 = zext i32 %S to i64
  468. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  469. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  470. %4 = bitcast <2 x i64> %3 to <8 x i16>
  471. %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  472. %6 = bitcast <16 x i16> %5 to <8 x i32>
  473. %7 = bitcast <2 x i64> %3 to <4 x i32>
  474. %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  475. %9 = bitcast <8 x i32> %8 to <4 x i64>
  476. %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  477. %11 = bitcast <4 x i64> %10 to <16 x i16>
  478. %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  479. %13 = bitcast <16 x i16> %12 to <8 x i32>
  480. %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  481. %15 = bitcast <8 x i32> %14 to <4 x i64>
  482. %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  483. ret <4 x i64> %16
  484. ; CHECK: test_avx2_psrl_1
  485. ; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
  486. }
  487. define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  488. %S = bitcast i32 128 to i32
  489. %1 = zext i32 %S to i64
  490. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  491. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  492. %4 = bitcast <2 x i64> %3 to <8 x i16>
  493. %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  494. %6 = bitcast <8 x i16> %5 to <4 x i32>
  495. %7 = bitcast <2 x i64> %3 to <4 x i32>
  496. %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  497. %9 = bitcast <4 x i32> %8 to <2 x i64>
  498. %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  499. %11 = bitcast <2 x i64> %10 to <8 x i16>
  500. %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  501. %13 = bitcast <8 x i16> %12 to <4 x i32>
  502. %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  503. %15 = bitcast <4 x i32> %14 to <2 x i64>
  504. %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  505. ret <2 x i64> %16
  506. ; CHECK: test_sse2_psrl_0
  507. ; CHECK: ret <2 x i64> zeroinitializer
  508. }
  509. define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  510. %S = bitcast i32 128 to i32
  511. %1 = zext i32 %S to i64
  512. %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  513. %3 = insertelement <2 x i64> %2, i64 0, i32 1
  514. %4 = bitcast <2 x i64> %3 to <8 x i16>
  515. %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  516. %6 = bitcast <16 x i16> %5 to <8 x i32>
  517. %7 = bitcast <2 x i64> %3 to <4 x i32>
  518. %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  519. %9 = bitcast <8 x i32> %8 to <4 x i64>
  520. %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  521. %11 = bitcast <4 x i64> %10 to <16 x i16>
  522. %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  523. %13 = bitcast <16 x i16> %12 to <8 x i32>
  524. %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  525. %15 = bitcast <8 x i32> %14 to <4 x i64>
  526. %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  527. ret <4 x i64> %16
  528. ; CHECK: test_avx2_psrl_0
  529. ; CHECK: ret <4 x i64> zeroinitializer
  530. }
  531. declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
  532. declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
  533. declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
  534. declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
  535. declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
  536. declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
  537. declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
  538. declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
  539. declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
  540. declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
  541. declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
  542. declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
  543. declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
  544. declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
  545. declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
  546. declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
  547. declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
  548. declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
  549. declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
  550. declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
  551. declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
  552. declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
  553. declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
  554. declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
  555. attributes #1 = { nounwind readnone }