; horizontal.ll — SLP vectorizer horizontal-reduction tests
  1. ; RUN: opt -slp-vectorizer -slp-vectorize-hor -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
  2. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  3. ; #include <stdint.h>
  4. ;
  5. ; int foo(float *A, int n) {
  6. ; float sum = 0;
  7. ; for (intptr_t i=0; i < n; ++i) {
  8. ; sum += 7*A[i*4 ] +
  9. ; 7*A[i*4+1] +
  10. ; 7*A[i*4+2] +
  11. ; 7*A[i*4+3];
  12. ; }
  13. ; return sum;
  14. ; }
  15. ; NOSTORE-LABEL: add_red
  16. ; NOSTORE: fmul <4 x float>
  17. ; NOSTORE: shufflevector <4 x float>
  18. define i32 @add_red(float* %A, i32 %n) {
  19. entry:
  20. %cmp31 = icmp sgt i32 %n, 0
  21. br i1 %cmp31, label %for.body.lr.ph, label %for.end
  22. for.body.lr.ph:
  23. %0 = sext i32 %n to i64
  24. br label %for.body
  25. for.body:
  26. %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  27. %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  28. %mul = shl nsw i64 %i.033, 2
  29. %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  30. %1 = load float, float* %arrayidx, align 4
  31. %mul2 = fmul float %1, 7.000000e+00
  32. %add28 = or i64 %mul, 1
  33. %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  34. %2 = load float, float* %arrayidx4, align 4
  35. %mul5 = fmul float %2, 7.000000e+00
  36. %add6 = fadd fast float %mul2, %mul5
  37. %add829 = or i64 %mul, 2
  38. %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  39. %3 = load float, float* %arrayidx9, align 4
  40. %mul10 = fmul float %3, 7.000000e+00
  41. %add11 = fadd fast float %add6, %mul10
  42. %add1330 = or i64 %mul, 3
  43. %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  44. %4 = load float, float* %arrayidx14, align 4
  45. %mul15 = fmul float %4, 7.000000e+00
  46. %add16 = fadd fast float %add11, %mul15
  47. %add17 = fadd fast float %sum.032, %add16
  48. %inc = add nsw i64 %i.033, 1
  49. %exitcond = icmp eq i64 %inc, %0
  50. br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
  51. for.cond.for.end_crit_edge:
  52. %phitmp = fptosi float %add17 to i32
  53. br label %for.end
  54. for.end:
  55. %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  56. ret i32 %sum.0.lcssa
  57. }
  58. ; int foo(float * restrict A, float * restrict B, int n) {
  59. ; float sum = 0;
  60. ; for (intptr_t i=0; i < n; ++i) {
  61. ; sum *= B[0]*A[i*4 ] +
  62. ; B[1]*A[i*4+1] +
  63. ; B[2]*A[i*4+2] +
  64. ; B[3]*A[i*4+3];
  65. ; }
  66. ; return sum;
  67. ; }
  68. ; CHECK-LABEL: mul_red
  69. ; CHECK: fmul <4 x float>
  70. ; CHECK: shufflevector <4 x float>
  71. define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
  72. entry:
  73. %cmp38 = icmp sgt i32 %n, 0
  74. br i1 %cmp38, label %for.body.lr.ph, label %for.end
  75. for.body.lr.ph:
  76. %0 = load float, float* %B, align 4
  77. %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  78. %1 = load float, float* %arrayidx4, align 4
  79. %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  80. %2 = load float, float* %arrayidx9, align 4
  81. %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  82. %3 = load float, float* %arrayidx15, align 4
  83. %4 = sext i32 %n to i64
  84. br label %for.body
  85. for.body:
  86. %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  87. %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  88. %mul = shl nsw i64 %i.040, 2
  89. %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  90. %5 = load float, float* %arrayidx2, align 4
  91. %mul3 = fmul float %0, %5
  92. %add35 = or i64 %mul, 1
  93. %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  94. %6 = load float, float* %arrayidx6, align 4
  95. %mul7 = fmul float %1, %6
  96. %add8 = fadd fast float %mul3, %mul7
  97. %add1136 = or i64 %mul, 2
  98. %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  99. %7 = load float, float* %arrayidx12, align 4
  100. %mul13 = fmul float %2, %7
  101. %add14 = fadd fast float %add8, %mul13
  102. %add1737 = or i64 %mul, 3
  103. %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  104. %8 = load float, float* %arrayidx18, align 4
  105. %mul19 = fmul float %3, %8
  106. %add20 = fadd fast float %add14, %mul19
  107. %mul21 = fmul float %sum.039, %add20
  108. %inc = add nsw i64 %i.040, 1
  109. %exitcond = icmp eq i64 %inc, %4
  110. br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
  111. for.cond.for.end_crit_edge:
  112. %phitmp = fptosi float %mul21 to i32
  113. br label %for.end
  114. for.end:
  115. %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  116. ret i32 %sum.0.lcssa
  117. }
  118. ; int foo(float * restrict A, float * restrict B, int n) {
  119. ; float sum = 0;
  120. ; for (intptr_t i=0; i < n; ++i) {
  121. ; sum += B[0]*A[i*6 ] +
  122. ; B[1]*A[i*6+1] +
  123. ; B[2]*A[i*6+2] +
  124. ; B[3]*A[i*6+3] +
  125. ; B[4]*A[i*6+4] +
  126. ; B[5]*A[i*6+5] +
  127. ; B[6]*A[i*6+6] +
  128. ; B[7]*A[i*6+7] +
  129. ; B[8]*A[i*6+8];
  130. ; }
  131. ; return sum;
  132. ; }
  133. ; CHECK-LABEL: long_red
  134. ; CHECK: fmul fast <4 x float>
  135. ; CHECK: shufflevector <4 x float>
  136. define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
  137. entry:
  138. %cmp81 = icmp sgt i32 %n, 0
  139. br i1 %cmp81, label %for.body.lr.ph, label %for.end
  140. for.body.lr.ph:
  141. %0 = load float, float* %B, align 4
  142. %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  143. %1 = load float, float* %arrayidx4, align 4
  144. %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  145. %2 = load float, float* %arrayidx9, align 4
  146. %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  147. %3 = load float, float* %arrayidx15, align 4
  148. %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  149. %4 = load float, float* %arrayidx21, align 4
  150. %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  151. %5 = load float, float* %arrayidx27, align 4
  152. %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  153. %6 = load float, float* %arrayidx33, align 4
  154. %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  155. %7 = load float, float* %arrayidx39, align 4
  156. %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  157. %8 = load float, float* %arrayidx45, align 4
  158. %9 = sext i32 %n to i64
  159. br label %for.body
  160. for.body:
  161. %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  162. %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  163. %mul = mul nsw i64 %i.083, 6
  164. %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  165. %10 = load float, float* %arrayidx2, align 4
  166. %mul3 = fmul fast float %0, %10
  167. %add80 = or i64 %mul, 1
  168. %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  169. %11 = load float, float* %arrayidx6, align 4
  170. %mul7 = fmul fast float %1, %11
  171. %add8 = fadd fast float %mul3, %mul7
  172. %add11 = add nsw i64 %mul, 2
  173. %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  174. %12 = load float, float* %arrayidx12, align 4
  175. %mul13 = fmul fast float %2, %12
  176. %add14 = fadd fast float %add8, %mul13
  177. %add17 = add nsw i64 %mul, 3
  178. %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  179. %13 = load float, float* %arrayidx18, align 4
  180. %mul19 = fmul fast float %3, %13
  181. %add20 = fadd fast float %add14, %mul19
  182. %add23 = add nsw i64 %mul, 4
  183. %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  184. %14 = load float, float* %arrayidx24, align 4
  185. %mul25 = fmul fast float %4, %14
  186. %add26 = fadd fast float %add20, %mul25
  187. %add29 = add nsw i64 %mul, 5
  188. %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  189. %15 = load float, float* %arrayidx30, align 4
  190. %mul31 = fmul fast float %5, %15
  191. %add32 = fadd fast float %add26, %mul31
  192. %add35 = add nsw i64 %mul, 6
  193. %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  194. %16 = load float, float* %arrayidx36, align 4
  195. %mul37 = fmul fast float %6, %16
  196. %add38 = fadd fast float %add32, %mul37
  197. %add41 = add nsw i64 %mul, 7
  198. %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  199. %17 = load float, float* %arrayidx42, align 4
  200. %mul43 = fmul fast float %7, %17
  201. %add44 = fadd fast float %add38, %mul43
  202. %add47 = add nsw i64 %mul, 8
  203. %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  204. %18 = load float, float* %arrayidx48, align 4
  205. %mul49 = fmul fast float %8, %18
  206. %add50 = fadd fast float %add44, %mul49
  207. %add51 = fadd fast float %sum.082, %add50
  208. %inc = add nsw i64 %i.083, 1
  209. %exitcond = icmp eq i64 %inc, %9
  210. br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
  211. for.cond.for.end_crit_edge:
  212. %phitmp = fptosi float %add51 to i32
  213. br label %for.end
  214. for.end:
  215. %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  216. ret i32 %sum.0.lcssa
  217. }
  218. ; int foo(float * restrict A, float * restrict B, int n) {
  219. ; float sum = 0;
  220. ; for (intptr_t i=0; i < n; ++i) {
  221. ; sum += B[0]*A[i*4 ];
  222. ; sum += B[1]*A[i*4+1];
  223. ; sum += B[2]*A[i*4+2];
  224. ; sum += B[3]*A[i*4+3];
  225. ; }
  226. ; return sum;
  227. ; }
  228. ; CHECK-LABEL: chain_red
  229. ; CHECK: fmul fast <4 x float>
  230. ; CHECK: shufflevector <4 x float>
  231. define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
  232. entry:
  233. %cmp41 = icmp sgt i32 %n, 0
  234. br i1 %cmp41, label %for.body.lr.ph, label %for.end
  235. for.body.lr.ph:
  236. %0 = load float, float* %B, align 4
  237. %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  238. %1 = load float, float* %arrayidx4, align 4
  239. %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  240. %2 = load float, float* %arrayidx10, align 4
  241. %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  242. %3 = load float, float* %arrayidx16, align 4
  243. %4 = sext i32 %n to i64
  244. br label %for.body
  245. for.body:
  246. %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  247. %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  248. %mul = shl nsw i64 %i.043, 2
  249. %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  250. %5 = load float, float* %arrayidx2, align 4
  251. %mul3 = fmul fast float %0, %5
  252. %add = fadd fast float %sum.042, %mul3
  253. %add638 = or i64 %mul, 1
  254. %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  255. %6 = load float, float* %arrayidx7, align 4
  256. %mul8 = fmul fast float %1, %6
  257. %add9 = fadd fast float %add, %mul8
  258. %add1239 = or i64 %mul, 2
  259. %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  260. %7 = load float, float* %arrayidx13, align 4
  261. %mul14 = fmul fast float %2, %7
  262. %add15 = fadd fast float %add9, %mul14
  263. %add1840 = or i64 %mul, 3
  264. %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  265. %8 = load float, float* %arrayidx19, align 4
  266. %mul20 = fmul fast float %3, %8
  267. %add21 = fadd fast float %add15, %mul20
  268. %inc = add nsw i64 %i.043, 1
  269. %exitcond = icmp eq i64 %inc, %4
  270. br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
  271. for.cond.for.end_crit_edge:
  272. %phitmp = fptosi float %add21 to i32
  273. br label %for.end
  274. for.end:
  275. %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  276. ret i32 %sum.0.lcssa
  277. }
  278. ; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
  279. ; float sum = 0;
  280. ; for (intptr_t i=0; i < n; ++i) {
  281. ; C[i] = B[0] *A[i*4 ] +
  282. ; B[1] *A[i*4+1] +
  283. ; B[2] *A[i*4+2] +
  284. ; B[3] *A[i*4+3];
  285. ; }
  286. ; return sum;
  287. ; }
  288. ; CHECK-LABEL: store_red
  289. ; CHECK: fmul fast <4 x float>
  290. ; CHECK: shufflevector <4 x float>
  291. define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
  292. entry:
  293. %cmp37 = icmp sgt i32 %n, 0
  294. br i1 %cmp37, label %for.body.lr.ph, label %for.end
  295. for.body.lr.ph:
  296. %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  297. %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  298. %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  299. %0 = sext i32 %n to i64
  300. br label %for.body
  301. for.body:
  302. %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  303. %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  304. %1 = load float, float* %B, align 4
  305. %mul = shl nsw i64 %i.039, 2
  306. %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  307. %2 = load float, float* %arrayidx2, align 4
  308. %mul3 = fmul fast float %1, %2
  309. %3 = load float, float* %arrayidx4, align 4
  310. %add34 = or i64 %mul, 1
  311. %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  312. %4 = load float, float* %arrayidx6, align 4
  313. %mul7 = fmul fast float %3, %4
  314. %add8 = fadd fast float %mul3, %mul7
  315. %5 = load float, float* %arrayidx9, align 4
  316. %add1135 = or i64 %mul, 2
  317. %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  318. %6 = load float, float* %arrayidx12, align 4
  319. %mul13 = fmul fast float %5, %6
  320. %add14 = fadd fast float %add8, %mul13
  321. %7 = load float, float* %arrayidx15, align 4
  322. %add1736 = or i64 %mul, 3
  323. %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  324. %8 = load float, float* %arrayidx18, align 4
  325. %mul19 = fmul fast float %7, %8
  326. %add20 = fadd fast float %add14, %mul19
  327. store float %add20, float* %C.addr.038, align 4
  328. %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  329. %inc = add nsw i64 %i.039, 1
  330. %exitcond = icmp eq i64 %inc, %0
  331. br i1 %exitcond, label %for.end, label %for.body
  332. for.end:
  333. ret i32 0
  334. }
  335. ; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
  336. ; void foo(double * restrict A, double * restrict B, double * restrict C,
  337. ; int n) {
  338. ; for (intptr_t i=0; i < n; ++i) {
  339. ; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
  340. ; }
  341. ; }
  342. ; STORE-LABEL: store_red_double
  343. ; STORE: fmul fast <2 x double>
  344. ; STORE: extractelement <2 x double>
  345. ; STORE: extractelement <2 x double>
  346. define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
  347. entry:
  348. %cmp17 = icmp sgt i32 %n, 0
  349. br i1 %cmp17, label %for.body.lr.ph, label %for.end
  350. for.body.lr.ph:
  351. %0 = load double, double* %B, align 8
  352. %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  353. %1 = load double, double* %arrayidx4, align 8
  354. %2 = sext i32 %n to i64
  355. br label %for.body
  356. for.body:
  357. %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  358. %mul = shl nsw i64 %i.018, 2
  359. %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  360. %3 = load double, double* %arrayidx2, align 8
  361. %mul3 = fmul fast double %0, %3
  362. %add16 = or i64 %mul, 1
  363. %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  364. %4 = load double, double* %arrayidx6, align 8
  365. %mul7 = fmul fast double %1, %4
  366. %add8 = fadd fast double %mul3, %mul7
  367. %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  368. store double %add8, double* %arrayidx9, align 8
  369. %inc = add nsw i64 %i.018, 1
  370. %exitcond = icmp eq i64 %inc, %2
  371. br i1 %exitcond, label %for.end, label %for.body
  372. for.end:
  373. ret void
  374. }