vshift-cost.ll 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
  2. ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
  3. ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
  4. ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
  5. ; Verify the cost of vector shift left instructions.
  6. ; We always emit a single pmullw in the case of v8i16 vector shifts by a
  7. ; non-uniform constant.
  8. define <8 x i16> @test1(<8 x i16> %a) { ; v8i16 shift left of %a by a constant vector whose lanes differ
  9. %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> ; per-lane amounts range from 0 to 11
  10. ret <8 x i16> %shl ; the value name %shl is what the FileCheck directives after this function match on
  11. }
  12. ; CHECK: 'Cost Model Analysis' for function 'test1':
  13. ; CHECK: Found an estimated cost of 1 for instruction: %shl
  14. define <8 x i16> @test2(<8 x i16> %a) { ; v8i16 shift left of %a by a constant vector that contains undef lanes
  15. %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1> ; amounts mix 0, 1, undef and an i16 -1 lane
  16. ret <8 x i16> %shl ; the value name %shl is what the FileCheck directives after this function match on
  17. }
  18. ; CHECK: 'Cost Model Analysis' for function 'test2':
  19. ; CHECK: Found an estimated cost of 1 for instruction: %shl
  20. ; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
  21. ; Make sure that the estimated cost is always 1 except for the case where
  22. ; we only have SSE2 support. With SSE2, we are forced to custom lower the
  23. ; v4i32 mul as a sequence of 2x shuffle, 2x pmuludq, 2x shuffle.
  24. define <4 x i32> @test3(<4 x i32> %a) { ; v4i32 shift left of %a by a constant vector whose lanes differ
  25. %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3> ; amounts mix positive and negative i32 constants
  26. ret <4 x i32> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  27. }
  28. ; CHECK: 'Cost Model Analysis' for function 'test3':
  29. ; SSE2: Found an estimated cost of 6 for instruction: %shl
  30. ; SSE41: Found an estimated cost of 1 for instruction: %shl
  31. ; AVX: Found an estimated cost of 1 for instruction: %shl
  32. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  33. define <4 x i32> @test4(<4 x i32> %a) { ; v4i32 shift left of %a by a constant vector holding only 0 and 1
  34. %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1> ; low two lanes shift by 0, high two lanes shift by 1
  35. ret <4 x i32> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  36. }
  37. ; CHECK: 'Cost Model Analysis' for function 'test4':
  38. ; SSE2: Found an estimated cost of 6 for instruction: %shl
  39. ; SSE41: Found an estimated cost of 1 for instruction: %shl
  40. ; AVX: Found an estimated cost of 1 for instruction: %shl
  41. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  42. ; On AVX2 we are able to lower the following shift into a single
  43. ; vpsllvq. Therefore, the expected cost is only 1.
  44. ; In all other cases, this shift is scalarized as the target does not support
  45. ; vpsllv instructions.
  46. define <2 x i64> @test5(<2 x i64> %a) { ; v2i64 shift left of %a by a constant vector whose two lanes differ
  47. %shl = shl <2 x i64> %a, <i64 2, i64 3> ; lane 0 shifts by 2, lane 1 shifts by 3
  48. ret <2 x i64> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  49. }
  50. ; CHECK: 'Cost Model Analysis' for function 'test5':
  51. ; SSE2: Found an estimated cost of 20 for instruction: %shl
  52. ; SSE41: Found an estimated cost of 20 for instruction: %shl
  53. ; AVX: Found an estimated cost of 20 for instruction: %shl
  54. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  55. ; v16i16 and v8i32 shifts left by a non-uniform constant are lowered into
  56. ; vector multiply instructions. With AVX (but not AVX2), the vector multiply
  57. ; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
  58. ;
  59. ; With AVX2, instruction vpmullw works with 256bit quantities and
  60. ; therefore there is no need to split the resulting vector multiply into
  61. ; a sequence of two multiplies.
  62. ;
  63. ; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
  64. ; the cost computed in the case of 'test1'. That is because the backend
  65. ; simply emits 2 pmullw with no extract/insert.
  66. define <16 x i16> @test6(<16 x i16> %a) { ; v16i16 shift left of %a by a constant vector whose lanes differ
  67. %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> ; the 8-lane amount pattern used in test1, written twice
  68. ret <16 x i16> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  69. }
  70. ; CHECK: 'Cost Model Analysis' for function 'test6':
  71. ; SSE2: Found an estimated cost of 2 for instruction: %shl
  72. ; SSE41: Found an estimated cost of 2 for instruction: %shl
  73. ; AVX: Found an estimated cost of 4 for instruction: %shl
  74. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  75. ; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
  76. ; the cost computed in the case of 'test3'. That is because the multiply
  77. ; is type-legalized into two v4i32 vector multiplies.
  78. define <8 x i32> @test7(<8 x i32> %a) { ; v8i32 shift left of %a by a constant vector whose lanes differ
  79. %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3> ; the 4-lane amount pattern 1, 1, 2, 3 written twice
  80. ret <8 x i32> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  81. }
  82. ; CHECK: 'Cost Model Analysis' for function 'test7':
  83. ; SSE2: Found an estimated cost of 12 for instruction: %shl
  84. ; SSE41: Found an estimated cost of 2 for instruction: %shl
  85. ; AVX: Found an estimated cost of 4 for instruction: %shl
  86. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  87. ; On AVX2 we are able to lower the following shift into a single
  88. ; vpsllvq. Therefore, the expected cost is only 1.
  89. ; In all other cases, this shift is scalarized as the target does not support
  90. ; vpsllv instructions.
  91. define <4 x i64> @test8(<4 x i64> %a) { ; v4i64 shift left of %a by a constant vector whose lanes all differ
  92. %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4> ; lane k shifts by k + 1
  93. ret <4 x i64> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  94. }
  95. ; CHECK: 'Cost Model Analysis' for function 'test8':
  96. ; SSE2: Found an estimated cost of 40 for instruction: %shl
  97. ; SSE41: Found an estimated cost of 40 for instruction: %shl
  98. ; AVX: Found an estimated cost of 40 for instruction: %shl
  99. ; AVX2: Found an estimated cost of 1 for instruction: %shl
  100. ; Same as 'test6', with the difference that the cost is double.
  101. define <32 x i16> @test9(<32 x i16> %a) { ; v32i16 shift left of %a by a constant vector whose lanes differ
  102. %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11> ; the 8-lane amount pattern used in test1, written four times
  103. ret <32 x i16> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  104. }
  105. ; CHECK: 'Cost Model Analysis' for function 'test9':
  106. ; SSE2: Found an estimated cost of 4 for instruction: %shl
  107. ; SSE41: Found an estimated cost of 4 for instruction: %shl
  108. ; AVX: Found an estimated cost of 8 for instruction: %shl
  109. ; AVX2: Found an estimated cost of 2 for instruction: %shl
  110. ; Same as 'test7', except that now the cost is double.
  111. define <16 x i32> @test10(<16 x i32> %a) { ; v16i32 shift left of %a by a constant vector whose lanes differ
  112. %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3> ; the 4-lane amount pattern 1, 1, 2, 3 written four times
  113. ret <16 x i32> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  114. }
  115. ; CHECK: 'Cost Model Analysis' for function 'test10':
  116. ; SSE2: Found an estimated cost of 24 for instruction: %shl
  117. ; SSE41: Found an estimated cost of 4 for instruction: %shl
  118. ; AVX: Found an estimated cost of 8 for instruction: %shl
  119. ; AVX2: Found an estimated cost of 2 for instruction: %shl
  120. ; On AVX2 we are able to lower the following shift into a sequence of
  121. ; two vpsllvq instructions. Therefore, the expected cost is only 2.
  122. ; In all other cases, this shift is scalarized as we don't have vpsllv
  123. ; instructions.
  124. define <8 x i64> @test11(<8 x i64> %a) { ; v8i64 shift left of %a by a constant vector whose lanes differ
  125. %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3> ; the 4-lane amount pattern 1, 1, 2, 3 written twice
  126. ret <8 x i64> %shl ; the value name %shl is what the per-subtarget directives after this function match on
  127. }
  128. ; CHECK: 'Cost Model Analysis' for function 'test11':
  129. ; SSE2: Found an estimated cost of 80 for instruction: %shl
  130. ; SSE41: Found an estimated cost of 80 for instruction: %shl
  131. ; AVX: Found an estimated cost of 80 for instruction: %shl
  132. ; AVX2: Found an estimated cost of 2 for instruction: %shl