; vector-promotion.ll
  ; SROA regression tests for promoting allocas to vector values. The lit
  ; command below pipes this file through opt's -sroa pass and verifies the
  ; textual output with FileCheck.
  ; RUN: opt < %s -sroa -S | FileCheck %s

  ; Module data layout used by every function in this file.
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"

  ; NOTE(review): %S1 is not referenced by any function visible in this file.
  %S1 = type { i64, [42 x float] }
  ; test1: both <4 x i32> arguments are stored into a [2 x <4 x i32>] alloca and
  ; three individual i32 lanes are loaded back through GEPs. The expectations
  ; require the alloca, stores and loads to be gone after SROA, leaving
  ; extractelement instructions on %x and %y that feed the two adds.
  define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
  ; CHECK-LABEL: @test1(
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Scalar loads of lane 2 of slot 0 and lanes 3 and 0 of slot 1.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: load
  ; CHECK: extractelement <4 x i32> %x, i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
  ; CHECK-NEXT: extractelement <4 x i32> %y, i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; test2: same setup as a pair of vector stores followed by lane reads, except
  ; that the third value is read as a <2 x i32> sub-vector (through a bitcast of
  ; the lane-0 pointer of slot 1) and then element 0 is extracted from it. The
  ; expectations require that sub-vector load to become a shufflevector of %y
  ; selecting lanes 0 and 1.
  define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
  ; CHECK-LABEL: @test2(
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Two scalar lane loads, then a two-element sub-vector load from slot 1.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
    %tmp3.vec = load <2 x i32>, <2 x i32>* %a.tmp3.cast
    %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
  ; CHECK-NOT: load
  ; CHECK: %[[extract1:.*]] = extractelement <4 x i32> %x, i32 2
  ; CHECK-NEXT: %[[extract2:.*]] = extractelement <4 x i32> %y, i32 3
  ; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> %y, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ; CHECK-NEXT: %[[extract4:.*]] = extractelement <2 x i32> %[[extract3]], i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: %[[sum1:.*]] = add i32 %[[extract1]], %[[extract2]]
  ; CHECK-NEXT: %[[sum2:.*]] = add i32 %[[extract4]], %[[sum1]]
  ; CHECK-NEXT: ret i32 %[[sum2]]
  }
  ; test3: after storing %x and %y, slot 1 is overwritten by a 16-byte memset of
  ; zero and lane 2 of slot 0 is overwritten by a 4-byte memset of -1. The
  ; expectations require the memsets to vanish: the element memset becomes an
  ; insertelement of -1 into %x, and the reads of slot 1 become extractelements
  ; from zeroinitializer.
  define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
  ; CHECK-LABEL: @test3(
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Whole-vector memset over slot 1.
    %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false)
  ; CHECK-NOT: memset

    ; Single-element memset over lane 2 of slot 0, followed by the lane loads.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false)
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: load
  ; CHECK: %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
  ; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 3
  ; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; test4: after storing %x and %y, slot 1 is overwritten by a 16-byte memcpy
  ; from %z and lane 2 of slot 0 is overwritten by a 4-byte memcpy from lane 2
  ; of %z. The expectations require both memcpys to vanish: the vector copy
  ; becomes a <4 x i32> load from %z, and the element copy becomes a GEP, an
  ; i32 load and an insertelement into %x.
  define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
  ; CHECK-LABEL: @test4(
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Whole-vector memcpy from %z into slot 1.
    %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    %z.cast = bitcast <4 x i32>* %z to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false)
  ; CHECK-NOT: memcpy

    ; Single-element memcpy from lane 2 of %z into lane 2 of slot 0, followed by
    ; the lane loads.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false)
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: memcpy
  ; CHECK: %[[load:.*]] = load <4 x i32>, <4 x i32>* %z
  ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
  ; CHECK-NEXT: %[[element_load:.*]] = load i32, i32* %[[gep]]
  ; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
  ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; memcpy variant whose source pointer lives in address space 1; used only by
  ; test4_as1 below.
  declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i32, i1) nounwind

  ; Same as test4 with a different sized address space pointer source.
  ; NOTE(review): the module datalayout in this file only specifies
  ; `p:64:64:64` and has no separate entry for address space 1, and the expected
  ; GEP below uses i64 indices even though the input GEP uses i16 indices.
  ; Confirm whether "different sized" still describes this test.
  define i32 @test4_as1(<4 x i32> %x, <4 x i32> %y, <4 x i32> addrspace(1)* %z) {
  ; CHECK-LABEL: @test4_as1(
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Whole-vector memcpy from the addrspace(1) pointer %z into slot 1.
    %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    %z.cast = bitcast <4 x i32> addrspace(1)* %z to i8 addrspace(1)*
    call void @llvm.memcpy.p0i8.p1i8.i32(i8* %a.y.cast, i8 addrspace(1)* %z.cast, i32 16, i32 1, i1 false)
  ; CHECK-NOT: memcpy

    ; Single-element memcpy from lane 2 of %z into lane 2 of slot 0, followed by
    ; the lane loads.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %z, i16 0, i16 2
    %z.tmp1.cast = bitcast i32 addrspace(1)* %z.tmp1 to i8 addrspace(1)*
    call void @llvm.memcpy.p0i8.p1i8.i32(i8* %a.tmp1.cast, i8 addrspace(1)* %z.tmp1.cast, i32 4, i32 1, i1 false)
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: memcpy
  ; CHECK: %[[load:.*]] = load <4 x i32>, <4 x i32> addrspace(1)* %z
  ; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %z, i64 0, i64 2
  ; CHECK-NEXT: %[[element_load:.*]] = load i32, i32 addrspace(1)* %[[gep]]
  ; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
  ; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; test5: slot 1 of the alloca is copied over slot 0 with a 16-byte memcpy (both
  ; ends inside the alloca), and lane 2 of slot 0 is then copied out to lane 2
  ; of %z with a 4-byte memcpy. The expectations require the memcpys to vanish,
  ; leaving a GEP on %z, an extractelement of %y and an i32 store, with every
  ; subsequent lane read coming from %y.
  define i32 @test5(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
  ; CHECK-LABEL: @test5(
  ; The same as the above, but with reversed source and destination for the
  ; element memcpy, and a self copy.
  entry:
    %a = alloca [2 x <4 x i32>]
  ; CHECK-NOT: alloca

    ; Fill both vector slots of the alloca.
    %a.x = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0
    store <4 x i32> %x, <4 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1
    store <4 x i32> %y, <4 x i32>* %a.y
  ; CHECK-NOT: store

    ; Copy slot 1 over slot 0 within the same alloca.
    %a.y.cast = bitcast <4 x i32>* %a.y to i8*
    %a.x.cast = bitcast <4 x i32>* %a.x to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false)
  ; CHECK-NOT: memcpy

    ; Copy lane 2 of slot 0 out to lane 2 of %z, then perform the lane loads.
    %a.tmp1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
    %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
    %z.tmp1 = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
    %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false)
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: memcpy
  ; CHECK: %[[gep:.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* %z, i64 0, i64 2
  ; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2
  ; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]]
  ; CHECK-NEXT: extractelement <4 x i32> %y, i32 2
  ; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
  ; CHECK-NEXT: extractelement <4 x i32> %y, i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; Intrinsic declarations for the memcpy/memset calls made by the tests in this
  ; file. Both use the five-operand form: destination, source or fill value,
  ; i32 length, i32 alignment and i1 volatile flag.
  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
  declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
  ; test6: two <4 x i64> values are stored into a two-field struct alloca and a
  ; single i64 is then loaded from field 0 at the run-time index %n. The
  ; expectations require both vector stores to still be present in the output.
  define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
  ; CHECK-LABEL: @test6(
  ; The old scalarrepl pass would wrongly drop the store to the second alloca.
  ; PR13254
    %tmp = alloca { <4 x i64>, <4 x i64> }
    %p0 = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0
    store <4 x i64> %x, <4 x i64>* %p0
  ; CHECK: store <4 x i64> %x,
    %p1 = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 1
    store <4 x i64> %y, <4 x i64>* %p1
  ; CHECK: store <4 x i64> %y,

    ; Variable-index element address inside field 0.
    %addr = getelementptr inbounds { <4 x i64>, <4 x i64> }, { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0, i64 %n
    %res = load i64, i64* %addr, align 4
    ret i64 %res
  }
  ; test_subvec_store: a <4 x i32> alloca is written by three overlapping
  ; <2 x i32> constant stores starting at lanes 0, 1 and 2, then by a scalar
  ; store to lane 3, and finally loaded as a whole. The expectations require the
  ; sub-vector stores to become selects with the matching lane masks and the
  ; scalar store to become an insertelement.
  define <4 x i32> @test_subvec_store() {
  ; CHECK-LABEL: @test_subvec_store(
  entry:
    %a = alloca <4 x i32>
  ; CHECK-NOT: alloca

    ; Lanes 0-1.
    %a.gep0 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 0
    %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
    store <2 x i32> <i32 0, i32 0>, <2 x i32>* %a.cast0
  ; CHECK-NOT: store
  ; CHECK: select <4 x i1> <i1 true, i1 true, i1 false, i1 false>

    ; Lanes 1-2.
    %a.gep1 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 1
    %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
    store <2 x i32> <i32 1, i32 1>, <2 x i32>* %a.cast1
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>

    ; Lanes 2-3.
    %a.gep2 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 2
    %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
    store <2 x i32> <i32 2, i32 2>, <2 x i32>* %a.cast2
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>

    ; Lane 3 only.
    %a.gep3 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 3
    store i32 3, i32* %a.gep3
  ; CHECK-NEXT: insertelement <4 x i32>

    %ret = load <4 x i32>, <4 x i32>* %a
    ret <4 x i32> %ret
  ; CHECK-NEXT: ret <4 x i32>
  }
  ; test_subvec_load: a constant <4 x i32> is stored to an alloca and read back
  ; as three overlapping <2 x i32> sub-vectors starting at lanes 0, 1 and 2,
  ; which are then recombined with shufflevector. The expectations require each
  ; sub-vector load to become a shufflevector of the constant with the matching
  ; lane indices.
  define <4 x i32> @test_subvec_load() {
  ; CHECK-LABEL: @test_subvec_load(
  entry:
    %a = alloca <4 x i32>
  ; CHECK-NOT: alloca
    store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a
  ; CHECK-NOT: store

    ; Lanes 0-1.
    %a.gep0 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 0
    %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>*
    %first = load <2 x i32>, <2 x i32>* %a.cast0
  ; CHECK-NOT: load
  ; CHECK: %[[extract1:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

    ; Lanes 1-2.
    %a.gep1 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 1
    %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>*
    %second = load <2 x i32>, <2 x i32>* %a.cast1
  ; CHECK-NEXT: %[[extract2:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 1, i32 2>

    ; Lanes 2-3.
    %a.gep2 = getelementptr <4 x i32>, <4 x i32>* %a, i32 0, i32 2
    %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>*
    %third = load <2 x i32>, <2 x i32>* %a.cast2
  ; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

    ; Recombine the three sub-vectors into the returned value.
    %tmp = shufflevector <2 x i32> %first, <2 x i32> %second, <2 x i32> <i32 0, i32 2>
    %ret = shufflevector <2 x i32> %tmp, <2 x i32> %third, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; CHECK-NEXT: %[[tmp:.*]] = shufflevector <2 x i32> %[[extract1]], <2 x i32> %[[extract2]], <2 x i32> <i32 0, i32 2>
  ; CHECK-NEXT: %[[ret:.*]] = shufflevector <2 x i32> %[[tmp]], <2 x i32> %[[extract3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    ret <4 x i32> %ret
  ; CHECK-NEXT: ret <4 x i32> %[[ret]]
  }
  ; NOTE(review): this declaration is not referenced by any function visible in
  ; this file; the calls below use @llvm.memset.p0i8.i32 instead.
  declare void @llvm.memset.p0i32.i32(i32* nocapture, i32, i32, i32, i1) nounwind

  ; test_subvec_memset: a <4 x float> alloca is written by three overlapping
  ; 8-byte memsets starting at lanes 0, 1 and 2, then by a 4-byte memset over
  ; lane 3, and finally loaded as a whole. The expectations require the 8-byte
  ; memsets to become selects with the matching lane masks and the 4-byte
  ; memset to become an insertelement.
  define <4 x float> @test_subvec_memset() {
  ; CHECK-LABEL: @test_subvec_memset(
  entry:
    %a = alloca <4 x float>
  ; CHECK-NOT: alloca

    ; Lanes 0-1, fill byte 0.
    %a.gep0 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 0
    %a.cast0 = bitcast float* %a.gep0 to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.cast0, i8 0, i32 8, i32 0, i1 false)
  ; CHECK-NOT: store
  ; CHECK: select <4 x i1> <i1 true, i1 true, i1 false, i1 false>

    ; Lanes 1-2, fill byte 1.
    %a.gep1 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 1
    %a.cast1 = bitcast float* %a.gep1 to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.cast1, i8 1, i32 8, i32 0, i1 false)
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>

    ; Lanes 2-3, fill byte 3.
    %a.gep2 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 2
    %a.cast2 = bitcast float* %a.gep2 to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.cast2, i8 3, i32 8, i32 0, i1 false)
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>

    ; Lane 3 only, fill byte 7.
    %a.gep3 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 3
    %a.cast3 = bitcast float* %a.gep3 to i8*
    call void @llvm.memset.p0i8.i32(i8* %a.cast3, i8 7, i32 4, i32 0, i1 false)
  ; CHECK-NEXT: insertelement <4 x float>

    %ret = load <4 x float>, <4 x float>* %a
    ret <4 x float> %ret
  ; CHECK-NEXT: ret <4 x float>
  }
  ; test_subvec_memcpy: a <4 x float> alloca is written by three overlapping
  ; 8-byte memcpys (from %x, %y and %z) starting at lanes 0, 1 and 2, then by a
  ; 4-byte memcpy from %f over lane 3. Lanes 2-3 are then copied out to %out
  ; with an 8-byte memcpy and the whole vector is returned. The expectations
  ; require each incoming copy to become a typed load plus shufflevector/select
  ; (or insertelement for the single lane), and the outgoing copy to become a
  ; shufflevector plus a <2 x float> store.
  define <4 x float> @test_subvec_memcpy(i8* %x, i8* %y, i8* %z, i8* %f, i8* %out) {
  ; CHECK-LABEL: @test_subvec_memcpy(
  entry:
    %a = alloca <4 x float>
  ; CHECK-NOT: alloca

    ; Lanes 0-1 from %x.
    %a.gep0 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 0
    %a.cast0 = bitcast float* %a.gep0 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast0, i8* %x, i32 8, i32 0, i1 false)
  ; CHECK: %[[xptr:.*]] = bitcast i8* %x to <2 x float>*
  ; CHECK-NEXT: %[[x:.*]] = load <2 x float>, <2 x float>* %[[xptr]]
  ; CHECK-NEXT: %[[expand_x:.*]] = shufflevector <2 x float> %[[x]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; CHECK-NEXT: select <4 x i1> <i1 true, i1 true, i1 false, i1 false>

    ; Lanes 1-2 from %y.
    %a.gep1 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 1
    %a.cast1 = bitcast float* %a.gep1 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast1, i8* %y, i32 8, i32 0, i1 false)
  ; CHECK-NEXT: %[[yptr:.*]] = bitcast i8* %y to <2 x float>*
  ; CHECK-NEXT: %[[y:.*]] = load <2 x float>, <2 x float>* %[[yptr]]
  ; CHECK-NEXT: %[[expand_y:.*]] = shufflevector <2 x float> %[[y]], <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 true, i1 true, i1 false>

    ; Lanes 2-3 from %z.
    %a.gep2 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 2
    %a.cast2 = bitcast float* %a.gep2 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast2, i8* %z, i32 8, i32 0, i1 false)
  ; CHECK-NEXT: %[[zptr:.*]] = bitcast i8* %z to <2 x float>*
  ; CHECK-NEXT: %[[z:.*]] = load <2 x float>, <2 x float>* %[[zptr]]
  ; CHECK-NEXT: %[[expand_z:.*]] = shufflevector <2 x float> %[[z]], <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ; CHECK-NEXT: select <4 x i1> <i1 false, i1 false, i1 true, i1 true>

    ; Lane 3 only, from %f.
    %a.gep3 = getelementptr <4 x float>, <4 x float>* %a, i32 0, i32 3
    %a.cast3 = bitcast float* %a.gep3 to i8*
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast3, i8* %f, i32 4, i32 0, i1 false)
  ; CHECK-NEXT: %[[fptr:.*]] = bitcast i8* %f to float*
  ; CHECK-NEXT: %[[f:.*]] = load float, float* %[[fptr]]
  ; CHECK-NEXT: %[[insert_f:.*]] = insertelement <4 x float>

    ; Copy lanes 2-3 of the alloca out to %out.
    call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %a.cast2, i32 8, i32 0, i1 false)
  ; CHECK-NEXT: %[[outptr:.*]] = bitcast i8* %out to <2 x float>*
  ; CHECK-NEXT: %[[extract_out:.*]] = shufflevector <4 x float> %[[insert_f]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
  ; CHECK-NEXT: store <2 x float> %[[extract_out]], <2 x float>* %[[outptr]]

    %ret = load <4 x float>, <4 x float>* %a
    ret <4 x float> %ret
  ; CHECK-NEXT: ret <4 x float> %[[insert_f]]
  }
  ; PR14212: a <3 x i8> alloca (align 4) receives an undef vector store and is
  ; then read back as an i32 through a bitcast pointer. The expectations only
  ; require the alloca to be gone and the function to still return an i32.
  define i32 @PR14212() {
  ; CHECK-LABEL: @PR14212(
  ; This caused a crash when "splitting" the load of the i32 in order to promote
  ; the store of <3 x i8> properly. Heavily reduced from an OpenCL test case.
  entry:
    %retval = alloca <3 x i8>, align 4
  ; CHECK-NOT: alloca

    store <3 x i8> undef, <3 x i8>* %retval, align 4
    %cast = bitcast <3 x i8>* %retval to i32*
    %load = load i32, i32* %cast, align 4
    ret i32 %load
  ; CHECK: ret i32
  }
  ; PR14349.1: an i32 is stored to an i32 alloca and the low two bytes are read
  ; back as a <2 x i8> through a bitcast pointer. The expectations require the
  ; alloca, store and load to be gone, replaced by a trunc to i16 and a bitcast
  ; to <2 x i8>.
  define <2 x i8> @PR14349.1(i32 %x) {
  ; CHECK-LABEL: @PR14349.1(
  ; The first testcase for broken SROA rewriting of split integer loads and
  ; stores due to smaller vector loads and stores. This particular test ensures
  ; that we can rewrite a split store of an integer to a store of a vector.
  entry:
    %a = alloca i32
  ; CHECK-NOT: alloca

    store i32 %x, i32* %a
  ; CHECK-NOT: store

    %cast = bitcast i32* %a to <2 x i8>*
    %vec = load <2 x i8>, <2 x i8>* %cast
  ; CHECK-NOT: load

    ret <2 x i8> %vec
  ; CHECK: %[[trunc:.*]] = trunc i32 %x to i16
  ; CHECK: %[[cast:.*]] = bitcast i16 %[[trunc]] to <2 x i8>
  ; CHECK: ret <2 x i8> %[[cast]]
  }
  ; PR14349.2: a <2 x i8> is stored into the low two bytes of an i32 alloca
  ; through a bitcast pointer and the whole i32 is then loaded. The
  ; expectations require the alloca, store and load to be gone, replaced by a
  ; bitcast to i16, a zext to i32 and an `or` that merges it into the result.
  define i32 @PR14349.2(<2 x i8> %x) {
  ; CHECK-LABEL: @PR14349.2(
  ; The second testcase for broken SROA rewriting of split integer loads and
  ; stores due to smaller vector loads and stores. This particular test ensures
  ; that we can rewrite a split load of an integer to a load of a vector.
  entry:
    %a = alloca i32
  ; CHECK-NOT: alloca

    %cast = bitcast i32* %a to <2 x i8>*
    store <2 x i8> %x, <2 x i8>* %cast
  ; CHECK-NOT: store

    %int = load i32, i32* %a
  ; CHECK-NOT: load

    ret i32 %int
  ; CHECK: %[[cast:.*]] = bitcast <2 x i8> %x to i16
  ; CHECK: %[[zext:.*]] = zext i16 %[[cast]] to i32
  ; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[zext]]
  ; CHECK: ret i32 %[[insert]]
  }
  ; test7: the alloca is declared as [2 x i64] and only accessed through a
  ; bitcast to [2 x <2 x i32>]. Two <2 x i32> arguments are stored and three i32
  ; lanes are loaded back; the expectations require everything to reduce to
  ; extractelement instructions on %x and %y.
  define i32 @test7(<2 x i32> %x, <2 x i32> %y) {
  ; Test that we can promote to vectors when the alloca doesn't mention any vector types.
  ; CHECK-LABEL: @test7(
  entry:
    %a = alloca [2 x i64]
    %a.cast = bitcast [2 x i64]* %a to [2 x <2 x i32>]*
  ; CHECK-NOT: alloca

    ; Fill both vector-sized slots through the casted pointer.
    %a.x = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 0
    store <2 x i32> %x, <2 x i32>* %a.x
    %a.y = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1
    store <2 x i32> %y, <2 x i32>* %a.y
  ; CHECK-NOT: store

    ; Scalar loads of lane 1 of slot 0 and lanes 1 and 0 of slot 1.
    %a.tmp1 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 0, i64 1
    %tmp1 = load i32, i32* %a.tmp1
    %a.tmp2 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 1
    %tmp2 = load i32, i32* %a.tmp2
    %a.tmp3 = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 0
    %tmp3 = load i32, i32* %a.tmp3
  ; CHECK-NOT: load
  ; CHECK: extractelement <2 x i32> %x, i32 1
  ; CHECK-NEXT: extractelement <2 x i32> %y, i32 1
  ; CHECK-NEXT: extractelement <2 x i32> %y, i32 0

    %tmp4 = add i32 %tmp1, %tmp2
    %tmp5 = add i32 %tmp3, %tmp4
    ret i32 %tmp5
  ; CHECK-NEXT: add
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; test8: an i64 alloca is written once as a <2 x i32> and read back as two
  ; separate i32 values at offsets 0 and 1. The expectations require the loads
  ; to become extractelement instructions on %x.
  define i32 @test8(<2 x i32> %x) {
  ; Ensure that we can promote an alloca that doesn't mention a vector type based
  ; on a single store with a vector type.
  ; CHECK-LABEL: @test8(
  entry:
    %a = alloca i64
    %a.vec = bitcast i64* %a to <2 x i32>*
    %a.i32 = bitcast i64* %a to i32*
  ; CHECK-NOT: alloca

    store <2 x i32> %x, <2 x i32>* %a.vec
  ; CHECK-NOT: store

    %tmp1 = load i32, i32* %a.i32
    %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    %tmp2 = load i32, i32* %a.tmp2
  ; CHECK-NOT: load
  ; CHECK: extractelement <2 x i32> %x, i32 0
  ; CHECK-NEXT: extractelement <2 x i32> %x, i32 1

    %tmp4 = add i32 %tmp1, %tmp2
    ret i32 %tmp4
  ; CHECK-NEXT: add
  ; CHECK-NEXT: ret
  }
  ; test9: an i64 alloca is written as two separate i32 values at offsets 0 and
  ; 1 and read back once as a <2 x i32>. The expectations require the stores to
  ; become a chain of two insertelement instructions starting from undef.
  define <2 x i32> @test9(i32 %x, i32 %y) {
  ; Ensure that we can promote an alloca that doesn't mention a vector type based
  ; on a single load with a vector type.
  ; CHECK-LABEL: @test9(
  entry:
    %a = alloca i64
    %a.vec = bitcast i64* %a to <2 x i32>*
    %a.i32 = bitcast i64* %a to i32*
  ; CHECK-NOT: alloca

    store i32 %x, i32* %a.i32
    %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    store i32 %y, i32* %a.tmp2
  ; CHECK-NOT: store
  ; CHECK: %[[V1:.*]] = insertelement <2 x i32> undef, i32 %x, i32 0
  ; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1

    %result = load <2 x i32>, <2 x i32>* %a.vec
  ; CHECK-NOT: load

    ret <2 x i32> %result
  ; CHECK-NEXT: ret <2 x i32> %[[V2]]
  }
  ; test10: an i64 alloca is written as a <4 x i16>, partially overwritten by an
  ; i32 store at i32 offset 1, and read back as a <2 x i32>. The expectations
  ; require the <4 x i16> value to be bitcast to <2 x i32> and the i32 store to
  ; become an insertelement into lane 1.
  define <2 x i32> @test10(<4 x i16> %x, i32 %y) {
  ; If there are multiple different vector types used, we should select the one
  ; with the widest elements.
  ; CHECK-LABEL: @test10(
  entry:
    %a = alloca i64
    %a.vec1 = bitcast i64* %a to <2 x i32>*
    %a.vec2 = bitcast i64* %a to <4 x i16>*
    %a.i32 = bitcast i64* %a to i32*
  ; CHECK-NOT: alloca

    store <4 x i16> %x, <4 x i16>* %a.vec2
    %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    store i32 %y, i32* %a.tmp2
  ; CHECK-NOT: store
  ; CHECK: %[[V1:.*]] = bitcast <4 x i16> %x to <2 x i32>
  ; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1

    %result = load <2 x i32>, <2 x i32>* %a.vec1
  ; CHECK-NOT: load

    ret <2 x i32> %result
  ; CHECK-NEXT: ret <2 x i32> %[[V2]]
  }
  ; test11: an i64 alloca is written as a <4 x i16>, partially overwritten by an
  ; i32 store at i32 offset 1, and read back as a <2 x float>. The expectations
  ; require the rewrite to work in <4 x i16>: the i32 is bitcast to <2 x i16>,
  ; widened with a shufflevector, merged into %x with a select over lanes 2-3,
  ; and the result is bitcast to <2 x float>.
  define <2 x float> @test11(<4 x i16> %x, i32 %y) {
  ; If there are multiple different element types for different vector types,
  ; pick the integer types. This isn't really important, but seems like the best
  ; heuristic for making a deterministic decision.
  ; CHECK-LABEL: @test11(
  entry:
    %a = alloca i64
    %a.vec1 = bitcast i64* %a to <2 x float>*
    %a.vec2 = bitcast i64* %a to <4 x i16>*
    %a.i32 = bitcast i64* %a to i32*
  ; CHECK-NOT: alloca

    store <4 x i16> %x, <4 x i16>* %a.vec2
    %a.tmp2 = getelementptr inbounds i32, i32* %a.i32, i64 1
    store i32 %y, i32* %a.tmp2
  ; CHECK-NOT: store
  ; CHECK: %[[V1:.*]] = bitcast i32 %y to <2 x i16>
  ; CHECK-NEXT: %[[V2:.*]] = shufflevector <2 x i16> %[[V1]], <2 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ; CHECK-NEXT: %[[V3:.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> %[[V2]], <4 x i16> %x
  ; CHECK-NEXT: %[[V4:.*]] = bitcast <4 x i16> %[[V3]] to <2 x float>

    %result = load <2 x float>, <2 x float>* %a.vec1
  ; CHECK-NOT: load

    ret <2 x float> %result
  ; CHECK-NEXT: ret <2 x float> %[[V4]]
  }
  ; test12: a <3 x i32> alloca with 16-byte alignment is written as a
  ; <4 x i32> (undef) and read back as a <4 x float>, both through bitcast
  ; pointers. The expectations require the alloca, store and load to be gone,
  ; leaving a single bitcast of the undef vector that is returned.
  define <4 x float> @test12() {
  ; CHECK-LABEL: @test12(
    %a = alloca <3 x i32>, align 16
  ; CHECK-NOT: alloca

    %cast1 = bitcast <3 x i32>* %a to <4 x i32>*
    store <4 x i32> undef, <4 x i32>* %cast1, align 16
  ; CHECK-NOT: store

    %cast2 = bitcast <3 x i32>* %a to <3 x float>*
    %cast3 = bitcast <3 x float>* %cast2 to <4 x float>*
    %vec = load <4 x float>, <4 x float>* %cast3
  ; CHECK-NOT: load

  ; CHECK: %[[ret:.*]] = bitcast <4 x i32> undef to <4 x float>
  ; CHECK-NEXT: ret <4 x float> %[[ret]]
    ret <4 x float> %vec
  }