SLPVectorizer.cpp

  1. //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  10. // stores that can be put together into vector-stores. Next, it attempts to
  11. // construct a vectorizable tree using the use-def chains. If a profitable tree
  12. // was found, the SLP vectorizer performs vectorization on the tree.
  13. //
  14. // The pass is inspired by the work described in the paper:
  15. // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  16. //
  17. //===----------------------------------------------------------------------===//
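//
// Illustrative sketch (not part of the original header): given scalar code
// such as
//   a[0] = b[0] + c[0];
//   a[1] = b[1] + c[1];
// the two consecutive stores into a[0..1] seed the vectorizer, which walks the
// use-def chains upward and, if the cost model agrees, emits roughly a
// <2 x double> load/fadd/store sequence instead of the scalar operations.
//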
  18. #include "llvm/Transforms/Vectorize.h"
  19. #include "llvm/ADT/MapVector.h"
  20. #include "llvm/ADT/Optional.h"
  21. #include "llvm/ADT/PostOrderIterator.h"
  22. #include "llvm/ADT/SetVector.h"
  23. #include "llvm/ADT/Statistic.h"
  24. #include "llvm/Analysis/AliasAnalysis.h"
  25. #include "llvm/Analysis/AssumptionCache.h"
  26. #include "llvm/Analysis/CodeMetrics.h"
  27. #include "llvm/Analysis/LoopInfo.h"
  28. #include "llvm/Analysis/ScalarEvolution.h"
  29. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  30. #include "llvm/Analysis/TargetTransformInfo.h"
  31. #include "llvm/Analysis/ValueTracking.h"
  32. #include "llvm/IR/DataLayout.h"
  33. #include "llvm/IR/Dominators.h"
  34. #include "llvm/IR/IRBuilder.h"
  35. #include "llvm/IR/Instructions.h"
  36. #include "llvm/IR/IntrinsicInst.h"
  37. #include "llvm/IR/Module.h"
  38. #include "llvm/IR/NoFolder.h"
  39. #include "llvm/IR/Type.h"
  40. #include "llvm/IR/Value.h"
  41. #include "llvm/IR/Verifier.h"
  42. #include "llvm/Pass.h"
  43. #include "llvm/Support/CommandLine.h"
  44. #include "llvm/Support/Debug.h"
  45. #include "llvm/Support/raw_ostream.h"
  46. #include "llvm/Analysis/VectorUtils.h"
  47. #include <algorithm>
  48. #include <map>
  49. #include <memory>
  52. using namespace llvm;
  53. #define SV_NAME "slp-vectorizer"
  54. #define DEBUG_TYPE "SLP"
  55. STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
  56. static cl::opt<int>
  57. SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  58. cl::desc("Only vectorize if you gain more than this "
  59. "number "));
  60. static cl::opt<bool>
  61. ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
  62. cl::desc("Attempt to vectorize horizontal reductions"));
  63. static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  64. "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  65. cl::desc(
  66. "Attempt to vectorize horizontal reductions feeding into a store"));
  67. static cl::opt<int>
  68. MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
  69. cl::desc("Attempt to vectorize for this register size in bits"));
  70. namespace {
  71. // FIXME: Set this via cl::opt to allow overriding.
  72. static const unsigned MinVecRegSize = 128;
  73. static const unsigned RecursionMaxDepth = 12;
  74. // Limit the number of alias checks. The limit is chosen so that
  75. // it has no negative effect on the llvm benchmarks.
  76. static const unsigned AliasedCheckLimit = 10;
  77. // Another limit for the alias checks: The maximum distance between load/store
  78. // instructions where alias checks are done.
  79. // This limit is useful for very large basic blocks.
  80. static const unsigned MaxMemDepDistance = 160;
  81. /// \brief Predicate for the element types that the SLP vectorizer supports.
  82. ///
  83. /// The most important things to filter here are types which are invalid in LLVM
  84. /// vectors. We also filter target-specific types which have absolutely no
  85. /// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
  86. /// avoids spending time checking the cost model and realizing that they will
  87. /// be inevitably scalarized.
  88. static bool isValidElementType(Type *Ty) {
  89. return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
  90. !Ty->isPPC_FP128Ty();
  91. }
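// For example (illustrative, not exhaustive): i32, float, double and pointer
// types are accepted here, while x86_fp80, ppc_fp128 and aggregate types are
// rejected before any cost modelling is attempted.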
  92. /// \returns the parent basic block if all of the instructions in \p VL
  93. /// are in the same block or null otherwise.
  94. static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  95. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  96. if (!I0)
  97. return nullptr;
  98. BasicBlock *BB = I0->getParent();
  99. for (int i = 1, e = VL.size(); i < e; i++) {
  100. Instruction *I = dyn_cast<Instruction>(VL[i]);
  101. if (!I)
  102. return nullptr;
  103. if (BB != I->getParent())
  104. return nullptr;
  105. }
  106. return BB;
  107. }
  108. /// \returns True if all of the values in \p VL are constants.
  109. static bool allConstant(ArrayRef<Value *> VL) {
  110. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  111. if (!isa<Constant>(VL[i]))
  112. return false;
  113. return true;
  114. }
  115. /// \returns True if all of the values in \p VL are identical.
  116. static bool isSplat(ArrayRef<Value *> VL) {
  117. for (unsigned i = 1, e = VL.size(); i < e; ++i)
  118. if (VL[i] != VL[0])
  119. return false;
  120. return true;
  121. }
  122. /// \returns Opcode that can be combined with \p Op to create an alternate
  123. /// sequence which can later be merged as a ShuffleVector instruction.
  124. static unsigned getAltOpcode(unsigned Op) {
  125. switch (Op) {
  126. case Instruction::FAdd:
  127. return Instruction::FSub;
  128. case Instruction::FSub:
  129. return Instruction::FAdd;
  130. case Instruction::Add:
  131. return Instruction::Sub;
  132. case Instruction::Sub:
  133. return Instruction::Add;
  134. default:
  135. return 0;
  136. }
  137. }
  138. /// \returns true if Opcode \p Op can be part
  139. /// of an alternate sequence which can later be merged as
  140. /// a ShuffleVector instruction.
  141. static bool canCombineAsAltInst(unsigned Op) {
  142. if (Op == Instruction::FAdd || Op == Instruction::FSub ||
  143. Op == Instruction::Sub || Op == Instruction::Add)
  144. return true;
  145. return false;
  146. }
  147. /// \returns ShuffleVector instruction if instructions in \p VL form an
  148. /// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
  149. /// (e.g. opcodes of fadd,fsub,fadd,fsub...).
  150. static unsigned isAltInst(ArrayRef<Value *> VL) {
  151. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  152. unsigned Opcode = I0->getOpcode();
  153. unsigned AltOpcode = getAltOpcode(Opcode);
  154. for (int i = 1, e = VL.size(); i < e; i++) {
  155. Instruction *I = dyn_cast<Instruction>(VL[i]);
  156. if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
  157. return 0;
  158. }
  159. return Instruction::ShuffleVector;
  160. }
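// Illustrative example (assumed, not from this file): a bundle whose lane
// opcodes are fadd, fsub, fadd, fsub matches this pattern. The vectorizer can
// then emit one vector fadd and one vector fsub and blend the lanes with a
// shufflevector, which targets with addsub support (e.g. X86 vaddsubps) can
// lower efficiently.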
  161. /// \returns The opcode if all of the Instructions in \p VL have the same
  162. /// opcode, or zero.
  163. static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  164. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  165. if (!I0)
  166. return 0;
  167. unsigned Opcode = I0->getOpcode();
  168. for (int i = 1, e = VL.size(); i < e; i++) {
  169. Instruction *I = dyn_cast<Instruction>(VL[i]);
  170. if (!I || Opcode != I->getOpcode()) {
  171. if (canCombineAsAltInst(Opcode) && i == 1)
  172. return isAltInst(VL);
  173. return 0;
  174. }
  175. }
  176. return Opcode;
  177. }
  178. /// Get the intersection (logical and) of all of the potential IR flags
  179. /// of each scalar operation (VL) that will be converted into a vector (I).
  180. /// Flag set: NSW, NUW, exact, and all of fast-math.
  181. static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  182. if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
  183. if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
  184. // Intersection is initialized to the 0th scalar,
  185. // so start counting from index '1'.
  186. for (int i = 1, e = VL.size(); i < e; ++i) {
  187. if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
  188. Intersection->andIRFlags(Scalar);
  189. }
  190. VecOp->copyIRFlags(Intersection);
  191. }
  192. }
  193. }
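// Example of the intersection semantics (illustrative): if every scalar add in
// VL carries the nsw flag, the vector add keeps nsw; if even one lane lacks
// it, andIRFlags() drops the flag from the vector instruction.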
  194. /// \returns \p I after propagating metadata from \p VL.
  195. static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  196. Instruction *I0 = cast<Instruction>(VL[0]);
  197. SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  198. I0->getAllMetadataOtherThanDebugLoc(Metadata);
  199. for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
  200. unsigned Kind = Metadata[i].first;
  201. MDNode *MD = Metadata[i].second;
  202. for (int i = 1, e = VL.size(); MD && i != e; i++) {
  203. Instruction *I = cast<Instruction>(VL[i]);
  204. MDNode *IMD = I->getMetadata(Kind);
  205. switch (Kind) {
  206. default:
  207. MD = nullptr; // Remove unknown metadata
  208. break;
  209. case LLVMContext::MD_tbaa:
  210. MD = MDNode::getMostGenericTBAA(MD, IMD);
  211. break;
  212. case LLVMContext::MD_alias_scope:
  213. MD = MDNode::getMostGenericAliasScope(MD, IMD);
  214. break;
  215. case LLVMContext::MD_noalias:
  216. MD = MDNode::intersect(MD, IMD);
  217. break;
  218. case LLVMContext::MD_fpmath:
  219. MD = MDNode::getMostGenericFPMath(MD, IMD);
  220. break;
  221. }
  222. }
  223. I->setMetadata(Kind, MD);
  224. }
  225. return I;
  226. }
  227. /// \returns The type that all of the values in \p VL have or null if there
  228. /// are different types.
  229. static Type* getSameType(ArrayRef<Value *> VL) {
  230. Type *Ty = VL[0]->getType();
  231. for (int i = 1, e = VL.size(); i < e; i++)
  232. if (VL[i]->getType() != Ty)
  233. return nullptr;
  234. return Ty;
  235. }
  236. /// \returns True if the ExtractElement instructions in VL can be vectorized
  237. /// to use the original vector.
  238. static bool CanReuseExtract(ArrayRef<Value *> VL) {
  239. assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  240. // Check if all of the extracts come from the same vector and from the
  241. // correct offset.
  242. Value *VL0 = VL[0];
  243. ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  244. Value *Vec = E0->getOperand(0);
  245. // We have to extract from the same vector type.
  246. unsigned NElts = Vec->getType()->getVectorNumElements();
  247. if (NElts != VL.size())
  248. return false;
  249. // Check that all of the indices extract from the correct offset.
  250. ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  251. if (!CI || CI->getZExtValue())
  252. return false;
  253. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  254. ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
  255. ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
  256. if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
  257. return false;
  258. }
  259. return true;
  260. }
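// Illustrative case: for VL = { extractelement %v, 0; extractelement %v, 1;
// ...; extractelement %v, n-1 }, where %v has exactly n elements, no new
// vector needs to be built - the original vector %v can be reused directly.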
  261. /// \returns True if an in-tree use also needs an extract. This refers to a
  262. /// possible scalar operand in a vectorized instruction.
  263. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
  264. TargetLibraryInfo *TLI) {
  265. unsigned Opcode = UserInst->getOpcode();
  266. switch (Opcode) {
  267. case Instruction::Load: {
  268. LoadInst *LI = cast<LoadInst>(UserInst);
  269. return (LI->getPointerOperand() == Scalar);
  270. }
  271. case Instruction::Store: {
  272. StoreInst *SI = cast<StoreInst>(UserInst);
  273. return (SI->getPointerOperand() == Scalar);
  274. }
  275. case Instruction::Call: {
  276. CallInst *CI = cast<CallInst>(UserInst);
  277. Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
  278. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  279. return (CI->getArgOperand(1) == Scalar);
  280. }
  281. }
  282. default:
  283. return false;
  284. }
  285. }
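// Illustrative case: if a scalar address computation is vectorized but is also
// used as the pointer operand of a scalar load or store, that operand stays
// scalar in the user, so the corresponding lane must still be extracted from
// the vector.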
  286. /// \returns the AA location that is being accessed by the instruction.
  287. static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  288. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  289. return MemoryLocation::get(SI);
  290. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  291. return MemoryLocation::get(LI);
  292. return MemoryLocation();
  293. }
  294. /// \returns True if the instruction is not a volatile or atomic load/store.
  295. static bool isSimple(Instruction *I) {
  296. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  297. return LI->isSimple();
  298. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  299. return SI->isSimple();
  300. if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
  301. return !MI->isVolatile();
  302. return true;
  303. }
  304. /// Bottom Up SLP Vectorizer.
  305. class BoUpSLP {
  306. public:
  307. typedef SmallVector<Value *, 8> ValueList;
  308. typedef SmallVector<Instruction *, 16> InstrList;
  309. typedef SmallPtrSet<Value *, 16> ValueSet;
  310. typedef SmallVector<StoreInst *, 8> StoreList;
  311. BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
  312. TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
  313. DominatorTree *Dt, AssumptionCache *AC)
  314. : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
  315. SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
  316. Builder(Se->getContext()) {
  317. CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  318. }
  319. /// \brief Vectorize the tree that starts with the elements in \p VL.
  320. /// Returns the vectorized root.
  321. Value *vectorizeTree();
  322. /// \returns the cost incurred by unwanted spills and fills, caused by
  323. /// holding live values over call sites.
  324. int getSpillCost();
  325. /// \returns the vectorization cost of the subtree that starts at \p VL.
  326. /// A negative number means that this is profitable.
  327. int getTreeCost();
  328. /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  329. /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  330. void buildTree(ArrayRef<Value *> Roots,
  331. ArrayRef<Value *> UserIgnoreLst = None);
  332. /// Clear the internal data structures that are created by 'buildTree'.
  333. void deleteTree() {
  334. VectorizableTree.clear();
  335. ScalarToTreeEntry.clear();
  336. MustGather.clear();
  337. ExternalUses.clear();
  338. NumLoadsWantToKeepOrder = 0;
  339. NumLoadsWantToChangeOrder = 0;
  340. for (auto &Iter : BlocksSchedules) {
  341. BlockScheduling *BS = Iter.second.get();
  342. BS->clear();
  343. }
  344. }
  345. /// \returns true if the memory operations A and B are consecutive.
  346. bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
  347. /// \brief Perform LICM and CSE on the newly generated gather sequences.
  348. void optimizeGatherSequence();
  349. /// \returns true if it is beneficial to reverse the vector order.
  350. bool shouldReorder() const {
  351. return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  352. }
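// Illustrative use (assumed caller behaviour): when most 2-element load
// bundles were consecutive only in reverse order (e.g. a[1], a[0]), callers
// can rebuild the bundle with the operands swapped and retry, so the loads
// become consecutive and vectorizable.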
  353. private:
  354. struct TreeEntry;
  355. /// \returns the cost of the vectorizable entry.
  356. int getEntryCost(TreeEntry *E);
  357. /// This is the recursive part of buildTree.
  358. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
  359. /// Vectorize a single entry in the tree.
  360. Value *vectorizeTree(TreeEntry *E);
  361. /// Vectorize a single entry in the tree, starting in \p VL.
  362. Value *vectorizeTree(ArrayRef<Value *> VL);
  363. /// \returns the pointer to the vectorized value if \p VL is already
  364. /// vectorized, or NULL. This may happen in cycles.
  365. Value *alreadyVectorized(ArrayRef<Value *> VL) const;
  366. /// \brief Take the pointer operand from the Load/Store instruction.
  367. /// \returns NULL if this is not a valid Load/Store instruction.
  368. static Value *getPointerOperand(Value *I);
  369. /// \brief Take the address space operand from the Load/Store instruction.
  370. /// \returns -1 if this is not a valid Load/Store instruction.
  371. static unsigned getAddressSpaceOperand(Value *I);
  372. /// \returns the scalarization cost for this type. Scalarization in this
  373. /// context means the creation of vectors from a group of scalars.
  374. int getGatherCost(Type *Ty);
  375. /// \returns the scalarization cost for this list of values. Assuming that
  376. /// this subtree gets vectorized, we may need to extract the values from the
  377. /// roots. This method calculates the cost of extracting the values.
  378. int getGatherCost(ArrayRef<Value *> VL);
  379. /// \brief Set the Builder insert point to one after the last instruction in
  380. /// the bundle
  381. void setInsertPointAfterBundle(ArrayRef<Value *> VL);
  382. /// \returns a vector from a collection of scalars in \p VL.
  383. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
  384. /// \returns whether the VectorizableTree is fully vectorizable and will
  385. /// be beneficial even if the tree height is tiny.
  386. bool isFullyVectorizableTinyTree();
  387. /// \brief Reorder commutative operands in an alt shuffle if that results in
  388. /// vectorized code.
  389. void reorderAltShuffleOperands(ArrayRef<Value *> VL,
  390. SmallVectorImpl<Value *> &Left,
  391. SmallVectorImpl<Value *> &Right);
  392. /// \brief Reorder commutative operands to get a better probability of
  393. /// generating vectorized code.
  394. void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  395. SmallVectorImpl<Value *> &Left,
  396. SmallVectorImpl<Value *> &Right);
  397. struct TreeEntry {
  398. TreeEntry() : Scalars(), VectorizedValue(nullptr),
  399. NeedToGather(0) {}
  400. /// \returns true if the scalars in VL are equal to this entry.
  401. bool isSame(ArrayRef<Value *> VL) const {
  402. assert(VL.size() == Scalars.size() && "Invalid size");
  403. return std::equal(VL.begin(), VL.end(), Scalars.begin());
  404. }
  405. /// A vector of scalars.
  406. ValueList Scalars;
  407. /// The Scalars are vectorized into this value. It is initialized to Null.
  408. Value *VectorizedValue;
  409. /// Do we need to gather this sequence?
  410. bool NeedToGather;
  411. };
  412. /// Create a new VectorizableTree entry.
  413. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
  414. VectorizableTree.emplace_back();
  415. int idx = VectorizableTree.size() - 1;
  416. TreeEntry *Last = &VectorizableTree[idx];
  417. Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
  418. Last->NeedToGather = !Vectorized;
  419. if (Vectorized) {
  420. for (int i = 0, e = VL.size(); i != e; ++i) {
  421. assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
  422. ScalarToTreeEntry[VL[i]] = idx;
  423. }
  424. } else {
  425. MustGather.insert(VL.begin(), VL.end());
  426. }
  427. return Last;
  428. }
  429. /// -- Vectorization State --
  430. /// Holds all of the tree entries.
  431. std::vector<TreeEntry> VectorizableTree;
  432. /// Maps a specific scalar to its tree entry.
  433. SmallDenseMap<Value*, int> ScalarToTreeEntry;
  434. /// A list of scalars that we found that we need to keep as scalars.
  435. ValueSet MustGather;
  436. /// This POD struct describes one external user in the vectorized tree.
  437. struct ExternalUser {
  438. ExternalUser (Value *S, llvm::User *U, int L) :
  439. Scalar(S), User(U), Lane(L){};
  440. // Which scalar in our function.
  441. Value *Scalar;
  442. // Which user that uses the scalar.
  443. llvm::User *User;
  444. // Which lane does the scalar belong to.
  445. int Lane;
  446. };
  447. typedef SmallVector<ExternalUser, 16> UserList;
  448. /// Checks if two instructions may access the same memory.
  449. ///
  450. /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  451. /// is invariant in the calling loop.
  452. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
  453. Instruction *Inst2) {
  454. // First check if the result is already in the cache.
  455. AliasCacheKey key = std::make_pair(Inst1, Inst2);
  456. Optional<bool> &result = AliasCache[key];
  457. if (result.hasValue()) {
  458. return result.getValue();
  459. }
  460. MemoryLocation Loc2 = getLocation(Inst2, AA);
  461. bool aliased = true;
  462. if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
  463. // Do the alias check.
  464. aliased = AA->alias(Loc1, Loc2);
  465. }
  466. // Store the result in the cache.
  467. result = aliased;
  468. return aliased;
  469. }
  470. typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
  471. /// Cache for alias results.
  472. /// TODO: consider moving this to the AliasAnalysis itself.
  473. DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
  474. /// Removes an instruction from its block and eventually deletes it.
  475. /// It's like Instruction::eraseFromParent() except that the actual deletion
  476. /// is delayed until BoUpSLP is destructed.
  477. /// This is required to ensure that there are no incorrect collisions in the
  478. /// AliasCache, which can happen if a new instruction is allocated at the
  479. /// same address as a previously deleted instruction.
  480. void eraseInstruction(Instruction *I) {
  481. I->removeFromParent();
  482. I->dropAllReferences();
  483. DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  484. }
  485. /// Temporary store for deleted instructions. Instructions will be deleted
  486. /// eventually when the BoUpSLP is destructed.
  487. SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
  488. /// A list of values that need to extracted out of the tree.
  489. /// This list holds pairs of (Internal Scalar : External User).
  490. UserList ExternalUses;
  491. /// Values used only by @llvm.assume calls.
  492. SmallPtrSet<const Value *, 32> EphValues;
  493. /// Holds all of the instructions that we gathered.
  494. SetVector<Instruction *> GatherSeq;
  495. /// A list of blocks that we are going to CSE.
  496. SetVector<BasicBlock *> CSEBlocks;
  497. /// Contains all scheduling relevant data for an instruction.
  498. /// A ScheduleData either represents a single instruction or a member of an
  499. /// instruction bundle (= a group of instructions which is combined into a
  500. /// vector instruction).
  501. struct ScheduleData {
  502. // The initial value for the dependency counters. It means that the
  503. // dependencies are not calculated yet.
  504. enum { InvalidDeps = -1 };
  505. ScheduleData()
  506. : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
  507. NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
  508. Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
  509. UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
  510. void init(int BlockSchedulingRegionID) {
  511. FirstInBundle = this;
  512. NextInBundle = nullptr;
  513. NextLoadStore = nullptr;
  514. IsScheduled = false;
  515. SchedulingRegionID = BlockSchedulingRegionID;
  516. UnscheduledDepsInBundle = UnscheduledDeps;
  517. clearDependencies();
  518. }
  519. /// Returns true if the dependency information has been calculated.
  520. bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  521. /// Returns true for single instructions and for bundle representatives
  522. /// (= the head of a bundle).
  523. bool isSchedulingEntity() const { return FirstInBundle == this; }
  524. /// Returns true if it represents an instruction bundle and not only a
  525. /// single instruction.
  526. bool isPartOfBundle() const {
  527. return NextInBundle != nullptr || FirstInBundle != this;
  528. }
  529. /// Returns true if it is ready for scheduling, i.e. it has no more
  530. /// unscheduled depending instructions/bundles.
  531. bool isReady() const {
  532. assert(isSchedulingEntity() &&
  533. "can't consider non-scheduling entity for ready list");
  534. return UnscheduledDepsInBundle == 0 && !IsScheduled;
  535. }
  536. /// Modifies the number of unscheduled dependencies, also updating it for
  537. /// the whole bundle.
  538. int incrementUnscheduledDeps(int Incr) {
  539. UnscheduledDeps += Incr;
  540. return FirstInBundle->UnscheduledDepsInBundle += Incr;
  541. }
  542. /// Sets the number of unscheduled dependencies to the number of
  543. /// dependencies.
  544. void resetUnscheduledDeps() {
  545. incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
  546. }
  547. /// Clears all dependency information.
  548. void clearDependencies() {
  549. Dependencies = InvalidDeps;
  550. resetUnscheduledDeps();
  551. MemoryDependencies.clear();
  552. }
  553. void dump(raw_ostream &os) const {
  554. if (!isSchedulingEntity()) {
  555. os << "/ " << *Inst;
  556. } else if (NextInBundle) {
  557. os << '[' << *Inst;
  558. ScheduleData *SD = NextInBundle;
  559. while (SD) {
  560. os << ';' << *SD->Inst;
  561. SD = SD->NextInBundle;
  562. }
  563. os << ']';
  564. } else {
  565. os << *Inst;
  566. }
  567. }
  568. Instruction *Inst;
  569. /// Points to the head in an instruction bundle (and always to this for
  570. /// single instructions).
  571. ScheduleData *FirstInBundle;
  572. /// Single linked list of all instructions in a bundle. Null if it is a
  573. /// single instruction.
  574. ScheduleData *NextInBundle;
  575. /// Single linked list of all memory instructions (e.g. load, store, call)
  576. /// in the block - until the end of the scheduling region.
  577. ScheduleData *NextLoadStore;
  578. /// The dependent memory instructions.
  579. /// This list is derived on demand in calculateDependencies().
  580. SmallVector<ScheduleData *, 4> MemoryDependencies;
  581. /// This ScheduleData is in the current scheduling region if this matches
  582. /// the current SchedulingRegionID of BlockScheduling.
  583. int SchedulingRegionID;
  584. /// Used for getting a "good" final ordering of instructions.
  585. int SchedulingPriority;
  586. /// The number of dependencies. This is the number of users of the
  587. /// instruction plus the number of dependent memory instructions (if any).
  588. /// This value is calculated on demand.
  589. /// If InvalidDeps, the number of dependencies is not calculated yet.
  590. ///
  591. int Dependencies;
  592. /// The number of dependencies minus the number of dependencies of scheduled
  593. /// instructions. As soon as this is zero, the instruction/bundle gets ready
  594. /// for scheduling.
  595. /// Note that this is negative as long as Dependencies is not calculated.
  596. int UnscheduledDeps;
  597. /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
  598. /// single instructions.
  599. int UnscheduledDepsInBundle;
  600. /// True if this instruction is scheduled (or considered as scheduled in the
  601. /// dry-run).
  602. bool IsScheduled;
  603. };
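// Illustrative example (not from this file): four scalar loads that will form
// one vector load are chained through NextInBundle, all pointing at the same
// FirstInBundle head; the bundle becomes ready for scheduling only once the
// shared UnscheduledDepsInBundle counter reaches zero.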
  604. #ifndef NDEBUG
  605. friend raw_ostream &operator<<(raw_ostream &os,
  606. const BoUpSLP::ScheduleData &SD);
  607. #endif
  608. /// Contains all scheduling data for a basic block.
  609. ///
  610. struct BlockScheduling {
  611. BlockScheduling(BasicBlock *BB)
  612. : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
  613. ScheduleStart(nullptr), ScheduleEnd(nullptr),
  614. FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
  615. // Make sure that the initial SchedulingRegionID is greater than the
  616. // initial SchedulingRegionID in ScheduleData (which is 0).
  617. SchedulingRegionID(1) {}
  618. void clear() {
  619. ReadyInsts.clear();
  620. ScheduleStart = nullptr;
  621. ScheduleEnd = nullptr;
  622. FirstLoadStoreInRegion = nullptr;
  623. LastLoadStoreInRegion = nullptr;
  624. // Make a new scheduling region, i.e. all existing ScheduleData is not
  625. // in the new region yet.
  626. ++SchedulingRegionID;
  627. }
  628. ScheduleData *getScheduleData(Value *V) {
  629. ScheduleData *SD = ScheduleDataMap[V];
  630. if (SD && SD->SchedulingRegionID == SchedulingRegionID)
  631. return SD;
  632. return nullptr;
  633. }
  634. bool isInSchedulingRegion(ScheduleData *SD) {
  635. return SD->SchedulingRegionID == SchedulingRegionID;
  636. }
  637. /// Marks an instruction as scheduled and puts all dependent ready
  638. /// instructions into the ready-list.
  639. template <typename ReadyListType>
  640. void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
  641. SD->IsScheduled = true;
  642. DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
  643. ScheduleData *BundleMember = SD;
  644. while (BundleMember) {
  645. // Handle the def-use chain dependencies.
  646. for (Use &U : BundleMember->Inst->operands()) {
  647. ScheduleData *OpDef = getScheduleData(U.get());
  648. if (OpDef && OpDef->hasValidDependencies() &&
  649. OpDef->incrementUnscheduledDeps(-1) == 0) {
  650. // There are no more unscheduled dependencies after decrementing,
  651. // so we can put the dependent instruction into the ready list.
  652. ScheduleData *DepBundle = OpDef->FirstInBundle;
  653. assert(!DepBundle->IsScheduled &&
  654. "already scheduled bundle gets ready");
  655. ReadyList.insert(DepBundle);
  656. DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
  657. }
  658. }
  659. // Handle the memory dependencies.
  660. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
  661. if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
  662. // There are no more unscheduled dependencies after decrementing,
  663. // so we can put the dependent instruction into the ready list.
  664. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
  665. assert(!DepBundle->IsScheduled &&
  666. "already scheduled bundle gets ready");
  667. ReadyList.insert(DepBundle);
  668. DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
  669. }
  670. }
  671. BundleMember = BundleMember->NextInBundle;
  672. }
  673. }
  674. /// Put all instructions into the ReadyList which are ready for scheduling.
  675. template <typename ReadyListType>
  676. void initialFillReadyList(ReadyListType &ReadyList) {
  677. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  678. ScheduleData *SD = getScheduleData(I);
  679. if (SD->isSchedulingEntity() && SD->isReady()) {
  680. ReadyList.insert(SD);
  681. DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
  682. }
  683. }
  684. }
  685. /// Checks if a bundle of instructions can be scheduled, i.e. has no
  686. /// cyclic dependencies. This is only a dry-run, no instructions are
  687. /// actually moved at this stage.
  688. bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
  689. /// Un-bundles a group of instructions.
  690. void cancelScheduling(ArrayRef<Value *> VL);
  691. /// Extends the scheduling region so that V is inside the region.
  692. void extendSchedulingRegion(Value *V);
  693. /// Initialize the ScheduleData structures for new instructions in the
  694. /// scheduling region.
  695. void initScheduleData(Instruction *FromI, Instruction *ToI,
  696. ScheduleData *PrevLoadStore,
  697. ScheduleData *NextLoadStore);
  698. /// Updates the dependency information of a bundle and of all instructions/
  699. /// bundles which depend on the original bundle.
  700. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
  701. BoUpSLP *SLP);
  702. /// Sets all instructions in the scheduling region to un-scheduled.
  703. void resetSchedule();
  704. BasicBlock *BB;
  705. /// Simple memory allocation for ScheduleData.
  706. std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
  707. /// The size of a ScheduleData array in ScheduleDataChunks.
  708. int ChunkSize;
  709. /// The allocator position in the current chunk, which is the last entry
  710. /// of ScheduleDataChunks.
  711. int ChunkPos;
  712. /// Attaches ScheduleData to Instruction.
  713. /// Note that the mapping survives during all vectorization iterations, i.e.
  714. /// ScheduleData structures are recycled.
  715. DenseMap<Value *, ScheduleData *> ScheduleDataMap;
  716. struct ReadyList : SmallVector<ScheduleData *, 8> {
  717. void insert(ScheduleData *SD) { push_back(SD); }
  718. };
  719. /// The ready-list for scheduling (only used for the dry-run).
  720. ReadyList ReadyInsts;
  721. /// The first instruction of the scheduling region.
  722. Instruction *ScheduleStart;
  723. /// The first instruction _after_ the scheduling region.
  724. Instruction *ScheduleEnd;
  725. /// The first memory accessing instruction in the scheduling region
  726. /// (can be null).
  727. ScheduleData *FirstLoadStoreInRegion;
  728. /// The last memory accessing instruction in the scheduling region
  729. /// (can be null).
  730. ScheduleData *LastLoadStoreInRegion;
  731. /// The ID of the scheduling region. For a new vectorization iteration this
  732. /// is incremented which "removes" all ScheduleData from the region.
  733. int SchedulingRegionID;
  734. };
  735. /// Attaches the BlockScheduling structures to basic blocks.
  736. MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
  737. /// Performs the "real" scheduling. Done before vectorization is actually
  738. /// performed in a basic block.
  739. void scheduleBlock(BlockScheduling *BS);
  740. /// List of users to ignore during scheduling and that don't need extracting.
  741. ArrayRef<Value *> UserIgnoreList;
  742. // Number of load bundles that contain consecutive loads.
  743. int NumLoadsWantToKeepOrder;
  744. // Number of load bundles of size 2 that are consecutive loads if reversed.
  745. int NumLoadsWantToChangeOrder;
  746. // Analysis and block reference.
  747. Function *F;
  748. ScalarEvolution *SE;
  749. TargetTransformInfo *TTI;
  750. TargetLibraryInfo *TLI;
  751. AliasAnalysis *AA;
  752. LoopInfo *LI;
  753. DominatorTree *DT;
  754. /// Instruction builder to construct the vectorized tree.
  755. IRBuilder<> Builder;
  756. };
  757. #ifndef NDEBUG
  758. raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  759. SD.dump(os);
  760. return os;
  761. }
  762. #endif
  763. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  764. ArrayRef<Value *> UserIgnoreLst) {
  765. deleteTree();
  766. UserIgnoreList = UserIgnoreLst;
  767. if (!getSameType(Roots))
  768. return;
  769. buildTree_rec(Roots, 0);
  770. // Collect the values that we need to extract from the tree.
  771. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  772. TreeEntry *Entry = &VectorizableTree[EIdx];
  773. // For each lane:
  774. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  775. Value *Scalar = Entry->Scalars[Lane];
  776. // No need to handle users of gathered values.
  777. if (Entry->NeedToGather)
  778. continue;
  779. for (User *U : Scalar->users()) {
  780. DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
  781. Instruction *UserInst = dyn_cast<Instruction>(U);
  782. if (!UserInst)
  783. continue;
  784. // Skip in-tree scalars that become vectors
  785. if (ScalarToTreeEntry.count(U)) {
  786. int Idx = ScalarToTreeEntry[U];
  787. TreeEntry *UseEntry = &VectorizableTree[Idx];
  788. Value *UseScalar = UseEntry->Scalars[0];
  789. // Some in-tree scalars will remain as scalar in vectorized
  790. // instructions. If that is the case, the one in Lane 0 will
  791. // be used.
  792. if (UseScalar != U ||
  793. !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
  794. DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
  795. << ".\n");
  796. assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
  797. continue;
  798. }
  799. }
  800. // Ignore users in the user ignore list.
  801. if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
  802. UserIgnoreList.end())
  803. continue;
  804. DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
  805. Lane << " from " << *Scalar << ".\n");
  806. ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
  807. }
  808. }
  809. }
  810. }
  811. void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  812. bool SameTy = getSameType(VL); (void)SameTy;
  813. bool isAltShuffle = false;
  814. assert(SameTy && "Invalid types!");
  815. if (Depth == RecursionMaxDepth) {
  816. DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
  817. newTreeEntry(VL, false);
  818. return;
  819. }
  820. // Don't handle vectors.
  821. if (VL[0]->getType()->isVectorTy()) {
  822. DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
  823. newTreeEntry(VL, false);
  824. return;
  825. }
  826. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  827. if (SI->getValueOperand()->getType()->isVectorTy()) {
  828. DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
  829. newTreeEntry(VL, false);
  830. return;
  831. }
  832. unsigned Opcode = getSameOpcode(VL);
  833. // Check that this shuffle vector refers to the alternate
  834. // sequence of opcodes.
  835. if (Opcode == Instruction::ShuffleVector) {
  836. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  837. unsigned Op = I0->getOpcode();
  838. if (Op != Instruction::ShuffleVector)
  839. isAltShuffle = true;
  840. }
  841. // If all of the operands are identical or constant we have a simple solution.
  842. if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
  843. DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
  844. newTreeEntry(VL, false);
  845. return;
  846. }
  847. // We now know that this is a vector of instructions of the same type from
  848. // the same block.
  849. // Don't vectorize ephemeral values.
  850. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  851. if (EphValues.count(VL[i])) {
  852. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  853. ") is ephemeral.\n");
  854. newTreeEntry(VL, false);
  855. return;
  856. }
  857. }
  858. // Check if this is a duplicate of another entry.
  859. if (ScalarToTreeEntry.count(VL[0])) {
  860. int Idx = ScalarToTreeEntry[VL[0]];
  861. TreeEntry *E = &VectorizableTree[Idx];
  862. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  863. DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
  864. if (E->Scalars[i] != VL[i]) {
  865. DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
  866. newTreeEntry(VL, false);
  867. return;
  868. }
  869. }
  870. DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
  871. return;
  872. }
  873. // Check that none of the instructions in the bundle are already in the tree.
  874. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  875. if (ScalarToTreeEntry.count(VL[i])) {
  876. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  877. ") is already in tree.\n");
  878. newTreeEntry(VL, false);
  879. return;
  880. }
  881. }
  882. // If any of the scalars is marked as a value that needs to stay scalar then
  883. // we need to gather the scalars.
  884. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  885. if (MustGather.count(VL[i])) {
  886. DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
  887. newTreeEntry(VL, false);
  888. return;
  889. }
  890. }
  891. // Check that all of the users of the scalars that we want to vectorize are
  892. // schedulable.
  893. Instruction *VL0 = cast<Instruction>(VL[0]);
  894. BasicBlock *BB = cast<Instruction>(VL0)->getParent();
  895. if (!DT->isReachableFromEntry(BB)) {
  896. // Don't go into unreachable blocks. They may contain instructions with
  897. // dependency cycles which confuse the final scheduling.
  898. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
  899. newTreeEntry(VL, false);
  900. return;
  901. }
  902. // Check that every instruction appears once in this bundle.
  903. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  904. for (unsigned j = i+1; j < e; ++j)
  905. if (VL[i] == VL[j]) {
  906. DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
  907. newTreeEntry(VL, false);
  908. return;
  909. }
  910. auto &BSRef = BlocksSchedules[BB];
  911. if (!BSRef) {
  912. BSRef = llvm::make_unique<BlockScheduling>(BB);
  913. }
  914. BlockScheduling &BS = *BSRef.get();
  915. if (!BS.tryScheduleBundle(VL, this)) {
  916. DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
  917. BS.cancelScheduling(VL);
  918. newTreeEntry(VL, false);
  919. return;
  920. }
  921. DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  922. switch (Opcode) {
  923. case Instruction::PHI: {
  924. PHINode *PH = dyn_cast<PHINode>(VL0);
  925. // Check for terminator values (e.g. invoke).
  926. for (unsigned j = 0; j < VL.size(); ++j)
  927. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  928. TerminatorInst *Term = dyn_cast<TerminatorInst>(
  929. cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
  930. if (Term) {
  931. DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
  932. BS.cancelScheduling(VL);
  933. newTreeEntry(VL, false);
  934. return;
  935. }
  936. }
  937. newTreeEntry(VL, true);
  938. DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
  939. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  940. ValueList Operands;
  941. // Prepare the operand vector.
  942. for (unsigned j = 0; j < VL.size(); ++j)
  943. Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
  944. PH->getIncomingBlock(i)));
  945. buildTree_rec(Operands, Depth + 1);
  946. }
  947. return;
  948. }
  949. case Instruction::ExtractElement: {
  950. bool Reuse = CanReuseExtract(VL);
  951. if (Reuse) {
  952. DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
  953. } else {
  954. BS.cancelScheduling(VL);
  955. }
  956. newTreeEntry(VL, Reuse);
  957. return;
  958. }
  959. case Instruction::Load: {
960. // Check if the loads are consecutive or if we need to swizzle them.
  961. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  962. LoadInst *L = cast<LoadInst>(VL[i]);
  963. if (!L->isSimple()) {
  964. BS.cancelScheduling(VL);
  965. newTreeEntry(VL, false);
  966. DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
  967. return;
  968. }
  969. const DataLayout &DL = F->getParent()->getDataLayout();
  970. if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
  971. if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
  972. ++NumLoadsWantToChangeOrder;
  973. }
  974. BS.cancelScheduling(VL);
  975. newTreeEntry(VL, false);
  976. DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
  977. return;
  978. }
  979. }
  980. ++NumLoadsWantToKeepOrder;
  981. newTreeEntry(VL, true);
  982. DEBUG(dbgs() << "SLP: added a vector of loads.\n");
  983. return;
  984. }
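// Illustrative sketch (hypothetical IR, assuming 32-bit elements): a bundle
//   %l0 = load i32, i32* %p0   ; &a[0]
//   %l1 = load i32, i32* %p1   ; &a[1]
// is vectorized because both loads are simple and %p1 is exactly one element
// past %p0. A two-element bundle given in reversed order only bumps the
// NumLoadsWantToChangeOrder statistic here and is still gathered.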
  985. case Instruction::ZExt:
  986. case Instruction::SExt:
  987. case Instruction::FPToUI:
  988. case Instruction::FPToSI:
  989. case Instruction::FPExt:
  990. case Instruction::PtrToInt:
  991. case Instruction::IntToPtr:
  992. case Instruction::SIToFP:
  993. case Instruction::UIToFP:
  994. case Instruction::Trunc:
  995. case Instruction::FPTrunc:
  996. case Instruction::BitCast: {
  997. Type *SrcTy = VL0->getOperand(0)->getType();
  998. for (unsigned i = 0; i < VL.size(); ++i) {
  999. Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
  1000. if (Ty != SrcTy || !isValidElementType(Ty)) {
  1001. BS.cancelScheduling(VL);
  1002. newTreeEntry(VL, false);
  1003. DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
  1004. return;
  1005. }
  1006. }
  1007. newTreeEntry(VL, true);
  1008. DEBUG(dbgs() << "SLP: added a vector of casts.\n");
  1009. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1010. ValueList Operands;
  1011. // Prepare the operand vector.
  1012. for (unsigned j = 0; j < VL.size(); ++j)
  1013. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1014. buildTree_rec(Operands, Depth+1);
  1015. }
  1016. return;
  1017. }
  1018. case Instruction::ICmp:
  1019. case Instruction::FCmp: {
  1020. // Check that all of the compares have the same predicate.
  1021. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1022. Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
  1023. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  1024. CmpInst *Cmp = cast<CmpInst>(VL[i]);
  1025. if (Cmp->getPredicate() != P0 ||
  1026. Cmp->getOperand(0)->getType() != ComparedTy) {
  1027. BS.cancelScheduling(VL);
  1028. newTreeEntry(VL, false);
  1029. DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
  1030. return;
  1031. }
  1032. }
  1033. newTreeEntry(VL, true);
  1034. DEBUG(dbgs() << "SLP: added a vector of compares.\n");
  1035. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1036. ValueList Operands;
  1037. // Prepare the operand vector.
  1038. for (unsigned j = 0; j < VL.size(); ++j)
  1039. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1040. buildTree_rec(Operands, Depth+1);
  1041. }
  1042. return;
  1043. }
  1044. case Instruction::Select:
  1045. case Instruction::Add:
  1046. case Instruction::FAdd:
  1047. case Instruction::Sub:
  1048. case Instruction::FSub:
  1049. case Instruction::Mul:
  1050. case Instruction::FMul:
  1051. case Instruction::UDiv:
  1052. case Instruction::SDiv:
  1053. case Instruction::FDiv:
  1054. case Instruction::URem:
  1055. case Instruction::SRem:
  1056. case Instruction::FRem:
  1057. case Instruction::Shl:
  1058. case Instruction::LShr:
  1059. case Instruction::AShr:
  1060. case Instruction::And:
  1061. case Instruction::Or:
  1062. case Instruction::Xor: {
  1063. newTreeEntry(VL, true);
  1064. DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
  1065. // Sort operands of the instructions so that each side is more likely to
  1066. // have the same opcode.
  1067. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
  1068. ValueList Left, Right;
  1069. reorderInputsAccordingToOpcode(VL, Left, Right);
  1070. buildTree_rec(Left, Depth + 1);
  1071. buildTree_rec(Right, Depth + 1);
  1072. return;
  1073. }
  1074. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1075. ValueList Operands;
  1076. // Prepare the operand vector.
  1077. for (unsigned j = 0; j < VL.size(); ++j)
  1078. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1079. buildTree_rec(Operands, Depth+1);
  1080. }
  1081. return;
  1082. }
  1083. case Instruction::GetElementPtr: {
  1084. // We don't combine GEPs with complicated (nested) indexing.
  1085. for (unsigned j = 0; j < VL.size(); ++j) {
  1086. if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
  1087. DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
  1088. BS.cancelScheduling(VL);
  1089. newTreeEntry(VL, false);
  1090. return;
  1091. }
  1092. }
  1093. // We can't combine several GEPs into one vector if they operate on
  1094. // different types.
  1095. Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
  1096. for (unsigned j = 0; j < VL.size(); ++j) {
  1097. Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
  1098. if (Ty0 != CurTy) {
  1099. DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
  1100. BS.cancelScheduling(VL);
  1101. newTreeEntry(VL, false);
  1102. return;
  1103. }
  1104. }
  1105. // We don't combine GEPs with non-constant indexes.
  1106. for (unsigned j = 0; j < VL.size(); ++j) {
  1107. auto Op = cast<Instruction>(VL[j])->getOperand(1);
  1108. if (!isa<ConstantInt>(Op)) {
  1109. DEBUG(
  1110. dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
  1111. BS.cancelScheduling(VL);
  1112. newTreeEntry(VL, false);
  1113. return;
  1114. }
  1115. }
  1116. newTreeEntry(VL, true);
  1117. DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
  1118. for (unsigned i = 0, e = 2; i < e; ++i) {
  1119. ValueList Operands;
  1120. // Prepare the operand vector.
  1121. for (unsigned j = 0; j < VL.size(); ++j)
  1122. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1123. buildTree_rec(Operands, Depth + 1);
  1124. }
  1125. return;
  1126. }
  1127. case Instruction::Store: {
  1128. const DataLayout &DL = F->getParent()->getDataLayout();
1129. // Check if the stores are consecutive or if we need to swizzle them.
  1130. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
  1131. if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
  1132. BS.cancelScheduling(VL);
  1133. newTreeEntry(VL, false);
  1134. DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
  1135. return;
  1136. }
  1137. newTreeEntry(VL, true);
  1138. DEBUG(dbgs() << "SLP: added a vector of stores.\n");
  1139. ValueList Operands;
  1140. for (unsigned j = 0; j < VL.size(); ++j)
  1141. Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
  1142. buildTree_rec(Operands, Depth + 1);
  1143. return;
  1144. }
  1145. case Instruction::Call: {
  1146. // Check if the calls are all to the same vectorizable intrinsic.
  1147. CallInst *CI = cast<CallInst>(VL[0]);
1148. // Check if this is an intrinsic call or something that can be
1149. // represented by an intrinsic call.
  1150. Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
  1151. if (!isTriviallyVectorizable(ID)) {
  1152. BS.cancelScheduling(VL);
  1153. newTreeEntry(VL, false);
  1154. DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
  1155. return;
  1156. }
  1157. Function *Int = CI->getCalledFunction();
  1158. Value *A1I = nullptr;
  1159. if (hasVectorInstrinsicScalarOpd(ID, 1))
  1160. A1I = CI->getArgOperand(1);
  1161. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1162. CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
  1163. if (!CI2 || CI2->getCalledFunction() != Int ||
  1164. getIntrinsicIDForCall(CI2, TLI) != ID) {
  1165. BS.cancelScheduling(VL);
  1166. newTreeEntry(VL, false);
  1167. DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
  1168. << "\n");
  1169. return;
  1170. }
1171. // ctlz, cttz and powi are special intrinsics whose second argument
1172. // must be the same for them to be vectorized.
  1173. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  1174. Value *A1J = CI2->getArgOperand(1);
  1175. if (A1I != A1J) {
  1176. BS.cancelScheduling(VL);
  1177. newTreeEntry(VL, false);
  1178. DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
  1179. << " argument "<< A1I<<"!=" << A1J
  1180. << "\n");
  1181. return;
  1182. }
  1183. }
  1184. }
  1185. newTreeEntry(VL, true);
  1186. for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
  1187. ValueList Operands;
  1188. // Prepare the operand vector.
  1189. for (unsigned j = 0; j < VL.size(); ++j) {
  1190. CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
  1191. Operands.push_back(CI2->getArgOperand(i));
  1192. }
  1193. buildTree_rec(Operands, Depth + 1);
  1194. }
  1195. return;
  1196. }
  1197. case Instruction::ShuffleVector: {
1198. // If this is not an alternating sequence of opcodes (e.g. add-sub),
1199. // do not vectorize this instruction.
  1200. if (!isAltShuffle) {
  1201. BS.cancelScheduling(VL);
  1202. newTreeEntry(VL, false);
  1203. DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
  1204. return;
  1205. }
  1206. newTreeEntry(VL, true);
  1207. DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
  1208. // Reorder operands if reordering would enable vectorization.
  1209. if (isa<BinaryOperator>(VL0)) {
  1210. ValueList Left, Right;
  1211. reorderAltShuffleOperands(VL, Left, Right);
  1212. buildTree_rec(Left, Depth + 1);
  1213. buildTree_rec(Right, Depth + 1);
  1214. return;
  1215. }
  1216. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1217. ValueList Operands;
  1218. // Prepare the operand vector.
  1219. for (unsigned j = 0; j < VL.size(); ++j)
  1220. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1221. buildTree_rec(Operands, Depth + 1);
  1222. }
  1223. return;
  1224. }
  1225. default:
  1226. BS.cancelScheduling(VL);
  1227. newTreeEntry(VL, false);
  1228. DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  1229. return;
  1230. }
  1231. }
  1232. int BoUpSLP::getEntryCost(TreeEntry *E) {
  1233. ArrayRef<Value*> VL = E->Scalars;
  1234. Type *ScalarTy = VL[0]->getType();
  1235. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1236. ScalarTy = SI->getValueOperand()->getType();
  1237. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1238. if (E->NeedToGather) {
  1239. if (allConstant(VL))
  1240. return 0;
  1241. if (isSplat(VL)) {
  1242. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  1243. }
  1244. return getGatherCost(E->Scalars);
  1245. }
  1246. unsigned Opcode = getSameOpcode(VL);
  1247. assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  1248. Instruction *VL0 = cast<Instruction>(VL[0]);
  1249. switch (Opcode) {
  1250. case Instruction::PHI: {
  1251. return 0;
  1252. }
  1253. case Instruction::ExtractElement: {
  1254. if (CanReuseExtract(VL)) {
  1255. int DeadCost = 0;
  1256. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1257. ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
  1258. if (E->hasOneUse())
1259. // Take credit for the instruction that will become dead.
  1260. DeadCost +=
  1261. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
  1262. }
  1263. return -DeadCost;
  1264. }
  1265. return getGatherCost(VecTy);
  1266. }
  1267. case Instruction::ZExt:
  1268. case Instruction::SExt:
  1269. case Instruction::FPToUI:
  1270. case Instruction::FPToSI:
  1271. case Instruction::FPExt:
  1272. case Instruction::PtrToInt:
  1273. case Instruction::IntToPtr:
  1274. case Instruction::SIToFP:
  1275. case Instruction::UIToFP:
  1276. case Instruction::Trunc:
  1277. case Instruction::FPTrunc:
  1278. case Instruction::BitCast: {
  1279. Type *SrcTy = VL0->getOperand(0)->getType();
  1280. // Calculate the cost of this instruction.
  1281. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
  1282. VL0->getType(), SrcTy);
  1283. VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
  1284. int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
  1285. return VecCost - ScalarCost;
  1286. }
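// Rough numeric sketch (the numbers are made up; real costs come from TTI
// and are target dependent): for four scalars doing sext i8 -> i32,
// ScalarCost = 4 * getCastInstrCost(SExt, i32, i8). If the target widens
// <4 x i8> -> <4 x i32> in a single instruction, VecCost may be 1, so
// VecCost - ScalarCost is negative and the cast bundle pays for itself.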
  1287. case Instruction::FCmp:
  1288. case Instruction::ICmp:
  1289. case Instruction::Select:
  1290. case Instruction::Add:
  1291. case Instruction::FAdd:
  1292. case Instruction::Sub:
  1293. case Instruction::FSub:
  1294. case Instruction::Mul:
  1295. case Instruction::FMul:
  1296. case Instruction::UDiv:
  1297. case Instruction::SDiv:
  1298. case Instruction::FDiv:
  1299. case Instruction::URem:
  1300. case Instruction::SRem:
  1301. case Instruction::FRem:
  1302. case Instruction::Shl:
  1303. case Instruction::LShr:
  1304. case Instruction::AShr:
  1305. case Instruction::And:
  1306. case Instruction::Or:
  1307. case Instruction::Xor: {
  1308. // Calculate the cost of this instruction.
  1309. int ScalarCost = 0;
  1310. int VecCost = 0;
  1311. if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
  1312. Opcode == Instruction::Select) {
  1313. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
  1314. ScalarCost = VecTy->getNumElements() *
  1315. TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
  1316. VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
  1317. } else {
  1318. // Certain instructions can be cheaper to vectorize if they have a
  1319. // constant second vector operand.
  1320. TargetTransformInfo::OperandValueKind Op1VK =
  1321. TargetTransformInfo::OK_AnyValue;
  1322. TargetTransformInfo::OperandValueKind Op2VK =
  1323. TargetTransformInfo::OK_UniformConstantValue;
  1324. TargetTransformInfo::OperandValueProperties Op1VP =
  1325. TargetTransformInfo::OP_None;
  1326. TargetTransformInfo::OperandValueProperties Op2VP =
  1327. TargetTransformInfo::OP_None;
  1328. // If all operands are exactly the same ConstantInt then set the
  1329. // operand kind to OK_UniformConstantValue.
  1330. // If instead not all operands are constants, then set the operand kind
  1331. // to OK_AnyValue. If all operands are constants but not the same,
  1332. // then set the operand kind to OK_NonUniformConstantValue.
  1333. ConstantInt *CInt = nullptr;
  1334. for (unsigned i = 0; i < VL.size(); ++i) {
  1335. const Instruction *I = cast<Instruction>(VL[i]);
  1336. if (!isa<ConstantInt>(I->getOperand(1))) {
  1337. Op2VK = TargetTransformInfo::OK_AnyValue;
  1338. break;
  1339. }
  1340. if (i == 0) {
  1341. CInt = cast<ConstantInt>(I->getOperand(1));
  1342. continue;
  1343. }
  1344. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
  1345. CInt != cast<ConstantInt>(I->getOperand(1)))
  1346. Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
  1347. }
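// For example (hypothetical bundles): {x << 2, y << 2, z << 2, w << 2}
// keeps Op2VK = OK_UniformConstantValue, {x << 1, y << 2, z << 3, w << 4}
// becomes OK_NonUniformConstantValue, and any bundle with a non-constant
// second operand falls back to OK_AnyValue.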
1348. // FIXME: Currently the cost model modification for division by a
1349. // power of 2 is handled only for X86. Add support for other targets.
  1350. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
  1351. CInt->getValue().isPowerOf2())
  1352. Op2VP = TargetTransformInfo::OP_PowerOf2;
  1353. ScalarCost = VecTy->getNumElements() *
  1354. TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
  1355. Op1VP, Op2VP);
  1356. VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
  1357. Op1VP, Op2VP);
  1358. }
  1359. return VecCost - ScalarCost;
  1360. }
  1361. case Instruction::GetElementPtr: {
  1362. TargetTransformInfo::OperandValueKind Op1VK =
  1363. TargetTransformInfo::OK_AnyValue;
  1364. TargetTransformInfo::OperandValueKind Op2VK =
  1365. TargetTransformInfo::OK_UniformConstantValue;
  1366. int ScalarCost =
  1367. VecTy->getNumElements() *
  1368. TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
  1369. int VecCost =
  1370. TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
  1371. return VecCost - ScalarCost;
  1372. }
  1373. case Instruction::Load: {
  1374. // Cost of wide load - cost of scalar loads.
  1375. int ScalarLdCost = VecTy->getNumElements() *
  1376. TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
  1377. int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
  1378. return VecLdCost - ScalarLdCost;
  1379. }
  1380. case Instruction::Store: {
  1381. // We know that we can merge the stores. Calculate the cost.
  1382. int ScalarStCost = VecTy->getNumElements() *
  1383. TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
  1384. int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
  1385. return VecStCost - ScalarStCost;
  1386. }
  1387. case Instruction::Call: {
  1388. CallInst *CI = cast<CallInst>(VL0);
  1389. Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
  1390. // Calculate the cost of the scalar and vector calls.
  1391. SmallVector<Type*, 4> ScalarTys, VecTys;
  1392. for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
  1393. ScalarTys.push_back(CI->getArgOperand(op)->getType());
  1394. VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
  1395. VecTy->getNumElements()));
  1396. }
  1397. int ScalarCallCost = VecTy->getNumElements() *
  1398. TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
  1399. int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
  1400. DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
  1401. << " (" << VecCallCost << "-" << ScalarCallCost << ")"
  1402. << " for " << *CI << "\n");
  1403. return VecCallCost - ScalarCallCost;
  1404. }
  1405. case Instruction::ShuffleVector: {
  1406. TargetTransformInfo::OperandValueKind Op1VK =
  1407. TargetTransformInfo::OK_AnyValue;
  1408. TargetTransformInfo::OperandValueKind Op2VK =
  1409. TargetTransformInfo::OK_AnyValue;
  1410. int ScalarCost = 0;
  1411. int VecCost = 0;
  1412. for (unsigned i = 0; i < VL.size(); ++i) {
  1413. Instruction *I = cast<Instruction>(VL[i]);
  1414. if (!I)
  1415. break;
  1416. ScalarCost +=
  1417. TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
  1418. }
1419. // VecCost is the sum of the cost of creating the two vectors
1420. // and the cost of creating the shuffle.
  1421. Instruction *I0 = cast<Instruction>(VL[0]);
  1422. VecCost =
  1423. TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
  1424. Instruction *I1 = cast<Instruction>(VL[1]);
  1425. VecCost +=
  1426. TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
  1427. VecCost +=
  1428. TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
  1429. return VecCost - ScalarCost;
  1430. }
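// Rough sketch (illustrative only): for an add/sub alternation over
// <4 x float>, ScalarCost is the sum of the four scalar fadd/fsub costs,
// while VecCost is one vector fadd plus one vector fsub plus an
// SK_Alternate shuffle that blends the even lanes of the first result with
// the odd lanes of the second.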
  1431. default:
  1432. llvm_unreachable("Unknown instruction");
  1433. }
  1434. }
  1435. bool BoUpSLP::isFullyVectorizableTinyTree() {
  1436. DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
  1437. VectorizableTree.size() << " is fully vectorizable .\n");
  1438. // We only handle trees of height 2.
  1439. if (VectorizableTree.size() != 2)
  1440. return false;
1441. // Handle splat and all-constant stores.
  1442. if (!VectorizableTree[0].NeedToGather &&
  1443. (allConstant(VectorizableTree[1].Scalars) ||
  1444. isSplat(VectorizableTree[1].Scalars)))
  1445. return true;
  1446. // Gathering cost would be too much for tiny trees.
  1447. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
  1448. return false;
  1449. return true;
  1450. }
  1451. int BoUpSLP::getSpillCost() {
  1452. // Walk from the bottom of the tree to the top, tracking which values are
  1453. // live. When we see a call instruction that is not part of our tree,
  1454. // query TTI to see if there is a cost to keeping values live over it
  1455. // (for example, if spills and fills are required).
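// Illustrative sketch (hypothetical schedule): with a bundle width of 4, if
// a call instruction sits between two consecutively visited tree
// instructions while two tree values are live, the loop below charges
// TTI->getCostOfKeepingLiveOverCall() for keeping two <4 x ty> values live
// across that call (e.g. spills and fills around the callee).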
  1456. unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  1457. int Cost = 0;
  1458. SmallPtrSet<Instruction*, 4> LiveValues;
  1459. Instruction *PrevInst = nullptr;
  1460. for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
  1461. Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
  1462. if (!Inst)
  1463. continue;
  1464. if (!PrevInst) {
  1465. PrevInst = Inst;
  1466. continue;
  1467. }
  1468. DEBUG(
  1469. dbgs() << "SLP: #LV: " << LiveValues.size();
  1470. for (auto *X : LiveValues)
  1471. dbgs() << " " << X->getName();
  1472. dbgs() << ", Looking at ";
  1473. Inst->dump();
  1474. );
  1475. // Update LiveValues.
  1476. LiveValues.erase(PrevInst);
  1477. for (auto &J : PrevInst->operands()) {
  1478. if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
  1479. LiveValues.insert(cast<Instruction>(&*J));
  1480. }
  1481. // Now find the sequence of instructions between PrevInst and Inst.
  1482. BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
  1483. --PrevInstIt;
  1484. while (InstIt != PrevInstIt) {
  1485. if (PrevInstIt == PrevInst->getParent()->rend()) {
  1486. PrevInstIt = Inst->getParent()->rbegin();
  1487. continue;
  1488. }
  1489. if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
  1490. SmallVector<Type*, 4> V;
  1491. for (auto *II : LiveValues)
  1492. V.push_back(VectorType::get(II->getType(), BundleWidth));
  1493. Cost += TTI->getCostOfKeepingLiveOverCall(V);
  1494. }
  1495. ++PrevInstIt;
  1496. }
  1497. PrevInst = Inst;
  1498. }
  1499. DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
  1500. return Cost;
  1501. }
  1502. int BoUpSLP::getTreeCost() {
  1503. int Cost = 0;
  1504. DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
  1505. VectorizableTree.size() << ".\n");
1506. // We only vectorize a tiny tree if it is fully vectorizable.
  1507. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
  1508. if (VectorizableTree.empty()) {
  1509. assert(!ExternalUses.size() && "We should not have any external users");
  1510. }
  1511. return INT_MAX;
  1512. }
  1513. unsigned BundleWidth = VectorizableTree[0].Scalars.size();
  1514. for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
  1515. int C = getEntryCost(&VectorizableTree[i]);
  1516. DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
  1517. << *VectorizableTree[i].Scalars[0] << " .\n");
  1518. Cost += C;
  1519. }
  1520. SmallSet<Value *, 16> ExtractCostCalculated;
  1521. int ExtractCost = 0;
  1522. for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
  1523. I != E; ++I) {
  1524. // We only add extract cost once for the same scalar.
  1525. if (!ExtractCostCalculated.insert(I->Scalar).second)
  1526. continue;
  1527. // Uses by ephemeral values are free (because the ephemeral value will be
  1528. // removed prior to code generation, and so the extraction will be
  1529. // removed as well).
  1530. if (EphValues.count(I->User))
  1531. continue;
  1532. VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
  1533. ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
  1534. I->Lane);
  1535. }
  1536. Cost += getSpillCost();
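// At this point Cost is the sum of the per-entry costs plus the spill cost;
// adding ExtractCost below yields the full vector-versus-scalar delta, so a
// negative total means the vectorized form is expected to be cheaper.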
  1537. DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
  1538. return Cost + ExtractCost;
  1539. }
  1540. int BoUpSLP::getGatherCost(Type *Ty) {
  1541. int Cost = 0;
  1542. for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
  1543. Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  1544. return Cost;
  1545. }
  1546. int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  1547. // Find the type of the operands in VL.
  1548. Type *ScalarTy = VL[0]->getType();
  1549. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1550. ScalarTy = SI->getValueOperand()->getType();
  1551. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1552. // Find the cost of inserting/extracting values from the vector.
  1553. return getGatherCost(VecTy);
  1554. }
  1555. Value *BoUpSLP::getPointerOperand(Value *I) {
  1556. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  1557. return LI->getPointerOperand();
  1558. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  1559. return SI->getPointerOperand();
  1560. return nullptr;
  1561. }
  1562. unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  1563. if (LoadInst *L = dyn_cast<LoadInst>(I))
  1564. return L->getPointerAddressSpace();
  1565. if (StoreInst *S = dyn_cast<StoreInst>(I))
  1566. return S->getPointerAddressSpace();
  1567. return -1;
  1568. }
  1569. bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
  1570. Value *PtrA = getPointerOperand(A);
  1571. Value *PtrB = getPointerOperand(B);
  1572. unsigned ASA = getAddressSpaceOperand(A);
  1573. unsigned ASB = getAddressSpaceOperand(B);
  1574. // Check that the address spaces match and that the pointers are valid.
  1575. if (!PtrA || !PtrB || (ASA != ASB))
  1576. return false;
  1577. // Make sure that A and B are different pointers of the same type.
  1578. if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
  1579. return false;
  1580. unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  1581. Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  1582. APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
  1583. APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  1584. PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  1585. PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  1586. APInt OffsetDelta = OffsetB - OffsetA;
  1587. // Check if they are based on the same pointer. That makes the offsets
  1588. // sufficient.
  1589. if (PtrA == PtrB)
  1590. return OffsetDelta == Size;
1591. // Compute the base pointer delta that is needed to make the final delta
1592. // equal to the size.
  1593. APInt BaseDelta = Size - OffsetDelta;
  1594. // Otherwise compute the distance with SCEV between the base pointers.
  1595. const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  1596. const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  1597. const SCEV *C = SE->getConstant(BaseDelta);
  1598. const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
  1599. return X == PtrSCEVB;
  1600. }
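// Worked example (hypothetical pointers): for loads of a[0] and a[1] with a
// 4-byte element type, stripping in-bounds constant offsets gives
// OffsetA = 0 and OffsetB = 4 off the same base, so OffsetDelta (4) equals
// Size (4) and the accesses are consecutive. When the stripped bases differ,
// the SCEV check above asks whether PtrA + (Size - OffsetDelta) is provably
// the same expression as PtrB.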
  1601. // Reorder commutative operations in alternate shuffle if the resulting vectors
  1602. // are consecutive loads. This would allow us to vectorize the tree.
1603. // If we have something like:
  1604. // load a[0] - load b[0]
  1605. // load b[1] + load a[1]
  1606. // load a[2] - load b[2]
  1607. // load a[3] + load b[3]
1608. // Reordering the second pair (load b[1], load a[1]) would allow us to
1609. // vectorize this code.
  1610. void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
  1611. SmallVectorImpl<Value *> &Left,
  1612. SmallVectorImpl<Value *> &Right) {
  1613. const DataLayout &DL = F->getParent()->getDataLayout();
  1614. // Push left and right operands of binary operation into Left and Right
  1615. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1616. Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
  1617. Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
  1618. }
1619. // Reorder if we have a commutative operation and consecutive accesses
1620. // are on either side of the alternate instructions.
  1621. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  1622. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  1623. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  1624. Instruction *VL1 = cast<Instruction>(VL[j]);
  1625. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1626. if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
  1627. std::swap(Left[j], Right[j]);
  1628. continue;
  1629. } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
  1630. std::swap(Left[j + 1], Right[j + 1]);
  1631. continue;
  1632. }
  1633. // else unchanged
  1634. }
  1635. }
  1636. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  1637. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  1638. Instruction *VL1 = cast<Instruction>(VL[j]);
  1639. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1640. if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
  1641. std::swap(Left[j], Right[j]);
  1642. continue;
  1643. } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
  1644. std::swap(Left[j + 1], Right[j + 1]);
  1645. continue;
  1646. }
  1647. // else unchanged
  1648. }
  1649. }
  1650. }
  1651. }
  1652. void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  1653. SmallVectorImpl<Value *> &Left,
  1654. SmallVectorImpl<Value *> &Right) {
  1655. SmallVector<Value *, 16> OrigLeft, OrigRight;
  1656. bool AllSameOpcodeLeft = true;
  1657. bool AllSameOpcodeRight = true;
  1658. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1659. Instruction *I = cast<Instruction>(VL[i]);
  1660. Value *VLeft = I->getOperand(0);
  1661. Value *VRight = I->getOperand(1);
  1662. OrigLeft.push_back(VLeft);
  1663. OrigRight.push_back(VRight);
  1664. Instruction *ILeft = dyn_cast<Instruction>(VLeft);
  1665. Instruction *IRight = dyn_cast<Instruction>(VRight);
  1666. // Check whether all operands on one side have the same opcode. In this case
  1667. // we want to preserve the original order and not make things worse by
  1668. // reordering.
  1669. if (i && AllSameOpcodeLeft && ILeft) {
  1670. if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
  1671. if (PLeft->getOpcode() != ILeft->getOpcode())
  1672. AllSameOpcodeLeft = false;
  1673. } else
  1674. AllSameOpcodeLeft = false;
  1675. }
  1676. if (i && AllSameOpcodeRight && IRight) {
  1677. if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
  1678. if (PRight->getOpcode() != IRight->getOpcode())
  1679. AllSameOpcodeRight = false;
  1680. } else
  1681. AllSameOpcodeRight = false;
  1682. }
  1683. // Sort two opcodes. In the code below we try to preserve the ability to use
  1684. // broadcast of values instead of individual inserts.
  1685. // vl1 = load
  1686. // vl2 = phi
  1687. // vr1 = load
1688. // vr2 = vr1 (same load as vr1)
  1689. // = vl1 x vr1
  1690. // = vl2 x vr2
1691. // If we just sorted according to opcode we would leave the first line
1692. // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
  1693. // = vl1 x vr1
  1694. // = vr2 x vl2
1695. // Because vr2 and vr1 are from the same load we lose the opportunity of a
1696. // broadcast for the packed right side in the backend: we have [vr1, vl2]
1697. // instead of [vr1, vr2=vr1].
  1698. if (ILeft && IRight) {
  1699. if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
  1700. Left.push_back(IRight);
  1701. Right.push_back(ILeft);
  1702. } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
  1703. Right[i - 1] != IRight) {
1704. // Try not to destroy a broadcast for no apparent benefit.
  1705. Left.push_back(IRight);
  1706. Right.push_back(ILeft);
  1707. } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
  1708. Right[i - 1] == ILeft) {
1709. // Try to preserve broadcasts.
  1710. Left.push_back(IRight);
  1711. Right.push_back(ILeft);
  1712. } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
  1713. Left[i - 1] == IRight) {
1714. // Try to preserve broadcasts.
  1715. Left.push_back(IRight);
  1716. Right.push_back(ILeft);
  1717. } else {
  1718. Left.push_back(ILeft);
  1719. Right.push_back(IRight);
  1720. }
  1721. continue;
  1722. }
1723. // Only one operand is an instruction; if it is the left one, put it on the right.
  1724. if (ILeft) {
  1725. Left.push_back(VRight);
  1726. Right.push_back(ILeft);
  1727. continue;
  1728. }
  1729. Left.push_back(VLeft);
  1730. Right.push_back(VRight);
  1731. }
  1732. bool LeftBroadcast = isSplat(Left);
  1733. bool RightBroadcast = isSplat(Right);
  1734. // If operands end up being broadcast return this operand order.
  1735. if (LeftBroadcast || RightBroadcast)
  1736. return;
1737. // Don't reorder if the operands were good to begin with.
  1738. if (AllSameOpcodeRight || AllSameOpcodeLeft) {
  1739. Left = OrigLeft;
  1740. Right = OrigRight;
  1741. }
  1742. const DataLayout &DL = F->getParent()->getDataLayout();
1743. // Finally check if we can get a longer vectorizable chain by reordering
1744. // without breaking the good operand order detected above.
1745. // E.g. if we have something like:
  1746. // load a[0] load b[0]
  1747. // load b[1] load a[1]
  1748. // load a[2] load b[2]
  1749. // load a[3] load b[3]
1750. // Reordering the second pair (load b[1], load a[1]) would allow us to
1751. // vectorize this code while still retaining the AllSameOpcode property.
1752. // FIXME: This load reordering might break AllSameOpcode in some rare cases
1753. // such as:
  1754. // add a[0],c[0] load b[0]
  1755. // add a[1],c[2] load b[1]
  1756. // b[2] load b[2]
  1757. // add a[3],c[3] load b[3]
  1758. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  1759. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  1760. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  1761. if (isConsecutiveAccess(L, L1, DL)) {
  1762. std::swap(Left[j + 1], Right[j + 1]);
  1763. continue;
  1764. }
  1765. }
  1766. }
  1767. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  1768. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  1769. if (isConsecutiveAccess(L, L1, DL)) {
  1770. std::swap(Left[j + 1], Right[j + 1]);
  1771. continue;
  1772. }
  1773. }
  1774. }
  1775. // else unchanged
  1776. }
  1777. }
  1778. void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
  1779. Instruction *VL0 = cast<Instruction>(VL[0]);
  1780. BasicBlock::iterator NextInst = VL0;
  1781. ++NextInst;
  1782. Builder.SetInsertPoint(VL0->getParent(), NextInst);
  1783. Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
  1784. }
  1785. Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  1786. Value *Vec = UndefValue::get(Ty);
  1787. // Generate the 'InsertElement' instruction.
  1788. for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
  1789. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
  1790. if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
  1791. GatherSeq.insert(Insrt);
  1792. CSEBlocks.insert(Insrt->getParent());
  1793. // Add to our 'need-to-extract' list.
  1794. if (ScalarToTreeEntry.count(VL[i])) {
  1795. int Idx = ScalarToTreeEntry[VL[i]];
  1796. TreeEntry *E = &VectorizableTree[Idx];
  1797. // Find which lane we need to extract.
  1798. int FoundLane = -1;
  1799. for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
1800. // Is this the lane of the scalar that we are looking for?
  1801. if (E->Scalars[Lane] == VL[i]) {
  1802. FoundLane = Lane;
  1803. break;
  1804. }
  1805. }
  1806. assert(FoundLane >= 0 && "Could not find the correct lane");
  1807. ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
  1808. }
  1809. }
  1810. }
  1811. return Vec;
  1812. }
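// Illustrative sketch (hypothetical scalars): gathering {%a, %b, %c, %d}
// into a <4 x i32> emits a chain of four insertelement instructions starting
// from undef. Each insert is remembered in GatherSeq/CSEBlocks so that
// optimizeGatherSequence() can later hoist or CSE it, and a scalar that is
// itself part of the tree is recorded as an external use of its lane.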
  1813. Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
  1814. SmallDenseMap<Value*, int>::const_iterator Entry
  1815. = ScalarToTreeEntry.find(VL[0]);
  1816. if (Entry != ScalarToTreeEntry.end()) {
  1817. int Idx = Entry->second;
  1818. const TreeEntry *En = &VectorizableTree[Idx];
  1819. if (En->isSame(VL) && En->VectorizedValue)
  1820. return En->VectorizedValue;
  1821. }
  1822. return nullptr;
  1823. }
  1824. Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  1825. if (ScalarToTreeEntry.count(VL[0])) {
  1826. int Idx = ScalarToTreeEntry[VL[0]];
  1827. TreeEntry *E = &VectorizableTree[Idx];
  1828. if (E->isSame(VL))
  1829. return vectorizeTree(E);
  1830. }
  1831. Type *ScalarTy = VL[0]->getType();
  1832. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1833. ScalarTy = SI->getValueOperand()->getType();
  1834. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1835. return Gather(VL, VecTy);
  1836. }
  1837. Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  1838. IRBuilder<>::InsertPointGuard Guard(Builder);
  1839. if (E->VectorizedValue) {
  1840. DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
  1841. return E->VectorizedValue;
  1842. }
  1843. Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  1844. Type *ScalarTy = VL0->getType();
  1845. if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
  1846. ScalarTy = SI->getValueOperand()->getType();
  1847. VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
  1848. if (E->NeedToGather) {
  1849. setInsertPointAfterBundle(E->Scalars);
  1850. return Gather(E->Scalars, VecTy);
  1851. }
  1852. const DataLayout &DL = F->getParent()->getDataLayout();
  1853. unsigned Opcode = getSameOpcode(E->Scalars);
  1854. switch (Opcode) {
  1855. case Instruction::PHI: {
  1856. PHINode *PH = dyn_cast<PHINode>(VL0);
  1857. Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
  1858. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1859. PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  1860. E->VectorizedValue = NewPhi;
  1861. // PHINodes may have multiple entries from the same block. We want to
  1862. // visit every block once.
  1863. SmallSet<BasicBlock*, 4> VisitedBBs;
  1864. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1865. ValueList Operands;
  1866. BasicBlock *IBB = PH->getIncomingBlock(i);
  1867. if (!VisitedBBs.insert(IBB).second) {
  1868. NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
  1869. continue;
  1870. }
  1871. // Prepare the operand vector.
  1872. for (Value *V : E->Scalars)
  1873. Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
  1874. Builder.SetInsertPoint(IBB->getTerminator());
  1875. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1876. Value *Vec = vectorizeTree(Operands);
  1877. NewPhi->addIncoming(Vec, IBB);
  1878. }
  1879. assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
  1880. "Invalid number of incoming values");
  1881. return NewPhi;
  1882. }
  1883. case Instruction::ExtractElement: {
  1884. if (CanReuseExtract(E->Scalars)) {
  1885. Value *V = VL0->getOperand(0);
  1886. E->VectorizedValue = V;
  1887. return V;
  1888. }
  1889. return Gather(E->Scalars, VecTy);
  1890. }
  1891. case Instruction::ZExt:
  1892. case Instruction::SExt:
  1893. case Instruction::FPToUI:
  1894. case Instruction::FPToSI:
  1895. case Instruction::FPExt:
  1896. case Instruction::PtrToInt:
  1897. case Instruction::IntToPtr:
  1898. case Instruction::SIToFP:
  1899. case Instruction::UIToFP:
  1900. case Instruction::Trunc:
  1901. case Instruction::FPTrunc:
  1902. case Instruction::BitCast: {
  1903. ValueList INVL;
  1904. for (Value *V : E->Scalars)
  1905. INVL.push_back(cast<Instruction>(V)->getOperand(0));
  1906. setInsertPointAfterBundle(E->Scalars);
  1907. Value *InVec = vectorizeTree(INVL);
  1908. if (Value *V = alreadyVectorized(E->Scalars))
  1909. return V;
  1910. CastInst *CI = dyn_cast<CastInst>(VL0);
  1911. Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
  1912. E->VectorizedValue = V;
  1913. ++NumVectorInstructions;
  1914. return V;
  1915. }
  1916. case Instruction::FCmp:
  1917. case Instruction::ICmp: {
  1918. ValueList LHSV, RHSV;
  1919. for (Value *V : E->Scalars) {
  1920. LHSV.push_back(cast<Instruction>(V)->getOperand(0));
  1921. RHSV.push_back(cast<Instruction>(V)->getOperand(1));
  1922. }
  1923. setInsertPointAfterBundle(E->Scalars);
  1924. Value *L = vectorizeTree(LHSV);
  1925. Value *R = vectorizeTree(RHSV);
  1926. if (Value *V = alreadyVectorized(E->Scalars))
  1927. return V;
  1928. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1929. Value *V;
  1930. if (Opcode == Instruction::FCmp)
  1931. V = Builder.CreateFCmp(P0, L, R);
  1932. else
  1933. V = Builder.CreateICmp(P0, L, R);
  1934. E->VectorizedValue = V;
  1935. ++NumVectorInstructions;
  1936. return V;
  1937. }
  1938. case Instruction::Select: {
  1939. ValueList TrueVec, FalseVec, CondVec;
  1940. for (Value *V : E->Scalars) {
  1941. CondVec.push_back(cast<Instruction>(V)->getOperand(0));
  1942. TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
  1943. FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
  1944. }
  1945. setInsertPointAfterBundle(E->Scalars);
  1946. Value *Cond = vectorizeTree(CondVec);
  1947. Value *True = vectorizeTree(TrueVec);
  1948. Value *False = vectorizeTree(FalseVec);
  1949. if (Value *V = alreadyVectorized(E->Scalars))
  1950. return V;
  1951. Value *V = Builder.CreateSelect(Cond, True, False);
  1952. E->VectorizedValue = V;
  1953. ++NumVectorInstructions;
  1954. return V;
  1955. }
  1956. case Instruction::Add:
  1957. case Instruction::FAdd:
  1958. case Instruction::Sub:
  1959. case Instruction::FSub:
  1960. case Instruction::Mul:
  1961. case Instruction::FMul:
  1962. case Instruction::UDiv:
  1963. case Instruction::SDiv:
  1964. case Instruction::FDiv:
  1965. case Instruction::URem:
  1966. case Instruction::SRem:
  1967. case Instruction::FRem:
  1968. case Instruction::Shl:
  1969. case Instruction::LShr:
  1970. case Instruction::AShr:
  1971. case Instruction::And:
  1972. case Instruction::Or:
  1973. case Instruction::Xor: {
  1974. ValueList LHSVL, RHSVL;
  1975. if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
  1976. reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
  1977. else
  1978. for (Value *V : E->Scalars) {
  1979. LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
  1980. RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
  1981. }
  1982. setInsertPointAfterBundle(E->Scalars);
  1983. Value *LHS = vectorizeTree(LHSVL);
  1984. Value *RHS = vectorizeTree(RHSVL);
  1985. if (LHS == RHS && isa<Instruction>(LHS)) {
  1986. assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
  1987. }
  1988. if (Value *V = alreadyVectorized(E->Scalars))
  1989. return V;
  1990. BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
  1991. Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
  1992. E->VectorizedValue = V;
  1993. propagateIRFlags(E->VectorizedValue, E->Scalars);
  1994. ++NumVectorInstructions;
  1995. if (Instruction *I = dyn_cast<Instruction>(V))
  1996. return propagateMetadata(I, E->Scalars);
  1997. return V;
  1998. }
  1999. case Instruction::Load: {
  2000. // Loads are inserted at the head of the tree because we don't want to
  2001. // sink them all the way down past store instructions.
  2002. setInsertPointAfterBundle(E->Scalars);
  2003. LoadInst *LI = cast<LoadInst>(VL0);
  2004. Type *ScalarLoadTy = LI->getType();
  2005. unsigned AS = LI->getPointerAddressSpace();
  2006. Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
  2007. VecTy->getPointerTo(AS));
2008. // The pointer operand uses an in-tree scalar, so we add the new BitCast to
2009. // the ExternalUses list to make sure that an extract will be generated in
2010. // the future.
  2011. if (ScalarToTreeEntry.count(LI->getPointerOperand()))
  2012. ExternalUses.push_back(
  2013. ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
  2014. unsigned Alignment = LI->getAlignment();
  2015. LI = Builder.CreateLoad(VecPtr);
  2016. if (!Alignment) {
  2017. Alignment = DL.getABITypeAlignment(ScalarLoadTy);
  2018. }
  2019. LI->setAlignment(Alignment);
  2020. E->VectorizedValue = LI;
  2021. ++NumVectorInstructions;
  2022. return propagateMetadata(LI, E->Scalars);
  2023. }
  2024. case Instruction::Store: {
  2025. StoreInst *SI = cast<StoreInst>(VL0);
  2026. unsigned Alignment = SI->getAlignment();
  2027. unsigned AS = SI->getPointerAddressSpace();
  2028. ValueList ValueOp;
  2029. for (Value *V : E->Scalars)
  2030. ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
  2031. setInsertPointAfterBundle(E->Scalars);
  2032. Value *VecValue = vectorizeTree(ValueOp);
  2033. Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
  2034. VecTy->getPointerTo(AS));
  2035. StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2036. // The pointer operand uses an in-tree scalar, so we add the new BitCast to
2037. // the ExternalUses list to make sure that an extract will be generated in
2038. // the future.
  2039. if (ScalarToTreeEntry.count(SI->getPointerOperand()))
  2040. ExternalUses.push_back(
  2041. ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
  2042. if (!Alignment) {
  2043. Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
  2044. }
  2045. S->setAlignment(Alignment);
  2046. E->VectorizedValue = S;
  2047. ++NumVectorInstructions;
  2048. return propagateMetadata(S, E->Scalars);
  2049. }
  2050. case Instruction::GetElementPtr: {
  2051. setInsertPointAfterBundle(E->Scalars);
  2052. ValueList Op0VL;
  2053. for (Value *V : E->Scalars)
  2054. Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
  2055. Value *Op0 = vectorizeTree(Op0VL);
  2056. std::vector<Value *> OpVecs;
  2057. for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
  2058. ++j) {
  2059. ValueList OpVL;
  2060. for (Value *V : E->Scalars)
  2061. OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
  2062. Value *OpVec = vectorizeTree(OpVL);
  2063. OpVecs.push_back(OpVec);
  2064. }
  2065. Value *V = Builder.CreateGEP(
  2066. cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
  2067. E->VectorizedValue = V;
  2068. ++NumVectorInstructions;
  2069. if (Instruction *I = dyn_cast<Instruction>(V))
  2070. return propagateMetadata(I, E->Scalars);
  2071. return V;
  2072. }
  2073. case Instruction::Call: {
  2074. CallInst *CI = cast<CallInst>(VL0);
  2075. setInsertPointAfterBundle(E->Scalars);
  2076. Function *FI;
  2077. Intrinsic::ID IID = Intrinsic::not_intrinsic;
  2078. Value *ScalarArg = nullptr;
  2079. if (CI && (FI = CI->getCalledFunction())) {
  2080. IID = FI->getIntrinsicID();
  2081. }
  2082. std::vector<Value *> OpVecs;
  2083. for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
  2084. ValueList OpVL;
2085. // ctlz, cttz and powi are special intrinsics whose second argument is
2086. // a scalar. This argument should not be vectorized.
  2087. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
  2088. CallInst *CEI = cast<CallInst>(E->Scalars[0]);
  2089. ScalarArg = CEI->getArgOperand(j);
  2090. OpVecs.push_back(CEI->getArgOperand(j));
  2091. continue;
  2092. }
  2093. for (Value *V : E->Scalars) {
  2094. CallInst *CEI = cast<CallInst>(V);
  2095. OpVL.push_back(CEI->getArgOperand(j));
  2096. }
  2097. Value *OpVec = vectorizeTree(OpVL);
  2098. DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
  2099. OpVecs.push_back(OpVec);
  2100. }
  2101. Module *M = F->getParent();
  2102. Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
  2103. Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
  2104. Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
  2105. Value *V = Builder.CreateCall(CF, OpVecs);
2106. // The scalar argument uses an in-tree scalar, so we add the new vectorized
2107. // call to the ExternalUses list to make sure that an extract will be
2108. // generated in the future.
  2109. if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
  2110. ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
  2111. E->VectorizedValue = V;
  2112. ++NumVectorInstructions;
  2113. return V;
  2114. }
  2115. case Instruction::ShuffleVector: {
  2116. ValueList LHSVL, RHSVL;
  2117. assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
  2118. reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
  2119. setInsertPointAfterBundle(E->Scalars);
  2120. Value *LHS = vectorizeTree(LHSVL);
  2121. Value *RHS = vectorizeTree(RHSVL);
  2122. if (Value *V = alreadyVectorized(E->Scalars))
  2123. return V;
  2124. // Create a vector of LHS op1 RHS
  2125. BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
  2126. Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
  2127. // Create a vector of LHS op2 RHS
  2128. Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
  2129. BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
  2130. Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
  2131. // Create shuffle to take alternate operations from the vector.
  2132. // Also, gather up odd and even scalar ops to propagate IR flags to
  2133. // each vector operation.
  2134. ValueList OddScalars, EvenScalars;
  2135. unsigned e = E->Scalars.size();
  2136. SmallVector<Constant *, 8> Mask(e);
  2137. for (unsigned i = 0; i < e; ++i) {
  2138. if (i & 1) {
  2139. Mask[i] = Builder.getInt32(e + i);
  2140. OddScalars.push_back(E->Scalars[i]);
  2141. } else {
  2142. Mask[i] = Builder.getInt32(i);
  2143. EvenScalars.push_back(E->Scalars[i]);
  2144. }
  2145. }
  2146. Value *ShuffleMask = ConstantVector::get(Mask);
  2147. propagateIRFlags(V0, EvenScalars);
  2148. propagateIRFlags(V1, OddScalars);
  2149. Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
  2150. E->VectorizedValue = V;
  2151. ++NumVectorInstructions;
  2152. if (Instruction *I = dyn_cast<Instruction>(V))
  2153. return propagateMetadata(I, E->Scalars);
  2154. return V;
  2155. }
  2156. default:
  2157. llvm_unreachable("unknown inst");
  2158. }
  2159. return nullptr;
  2160. }
  2161. Value *BoUpSLP::vectorizeTree() {
  2162. // All blocks must be scheduled before any instructions are inserted.
  2163. for (auto &BSIter : BlocksSchedules) {
  2164. scheduleBlock(BSIter.second.get());
  2165. }
  2166. Builder.SetInsertPoint(F->getEntryBlock().begin());
  2167. vectorizeTree(&VectorizableTree[0]);
  2168. DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
  2169. // Extract all of the elements with the external uses.
  2170. for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
  2171. it != e; ++it) {
  2172. Value *Scalar = it->Scalar;
  2173. llvm::User *User = it->User;
2174. // Skip users that we have already RAUW'd. This happens when one instruction
2175. // has multiple uses of the same value.
  2176. if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
  2177. Scalar->user_end())
  2178. continue;
  2179. assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
  2180. int Idx = ScalarToTreeEntry[Scalar];
  2181. TreeEntry *E = &VectorizableTree[Idx];
  2182. assert(!E->NeedToGather && "Extracting from a gather list");
  2183. Value *Vec = E->VectorizedValue;
  2184. assert(Vec && "Can't find vectorizable value");
  2185. Value *Lane = Builder.getInt32(it->Lane);
  2186. // Generate extracts for out-of-tree users.
  2187. // Find the insertion point for the extractelement lane.
  2188. if (isa<Instruction>(Vec)){
  2189. if (PHINode *PH = dyn_cast<PHINode>(User)) {
  2190. for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
  2191. if (PH->getIncomingValue(i) == Scalar) {
  2192. Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
  2193. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2194. CSEBlocks.insert(PH->getIncomingBlock(i));
  2195. PH->setOperand(i, Ex);
  2196. }
  2197. }
  2198. } else {
  2199. Builder.SetInsertPoint(cast<Instruction>(User));
  2200. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2201. CSEBlocks.insert(cast<Instruction>(User)->getParent());
  2202. User->replaceUsesOfWith(Scalar, Ex);
  2203. }
  2204. } else {
  2205. Builder.SetInsertPoint(F->getEntryBlock().begin());
  2206. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2207. CSEBlocks.insert(&F->getEntryBlock());
  2208. User->replaceUsesOfWith(Scalar, Ex);
  2209. }
  2210. DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  2211. }
  2212. // For each vectorized value:
  2213. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  2214. TreeEntry *Entry = &VectorizableTree[EIdx];
  2215. // For each lane:
  2216. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  2217. Value *Scalar = Entry->Scalars[Lane];
  2218. // No need to handle users of gathered values.
  2219. if (Entry->NeedToGather)
  2220. continue;
  2221. assert(Entry->VectorizedValue && "Can't find vectorizable value");
  2222. Type *Ty = Scalar->getType();
  2223. if (!Ty->isVoidTy()) {
  2224. #ifndef NDEBUG
  2225. for (User *U : Scalar->users()) {
  2226. DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
  2227. assert((ScalarToTreeEntry.count(U) ||
  2228. // It is legal to replace users in the ignorelist by undef.
  2229. (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
  2230. UserIgnoreList.end())) &&
  2231. "Replacing out-of-tree value with undef");
  2232. }
  2233. #endif
  2234. Value *Undef = UndefValue::get(Ty);
  2235. Scalar->replaceAllUsesWith(Undef);
  2236. }
  2237. DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
  2238. eraseInstruction(cast<Instruction>(Scalar));
  2239. }
  2240. }
  2241. Builder.ClearInsertionPoint();
  2242. return VectorizableTree[0].VectorizedValue;
  2243. }
  2244. void BoUpSLP::optimizeGatherSequence() {
  2245. DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
  2246. << " gather sequences instructions.\n");
  2247. // LICM InsertElementInst sequences.
  2248. for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
  2249. e = GatherSeq.end(); it != e; ++it) {
  2250. InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
  2251. if (!Insert)
  2252. continue;
  2253. // Check if this block is inside a loop.
  2254. Loop *L = LI->getLoopFor(Insert->getParent());
  2255. if (!L)
  2256. continue;
  2257. // Check if it has a preheader.
  2258. BasicBlock *PreHeader = L->getLoopPreheader();
  2259. if (!PreHeader)
  2260. continue;
  2261. // If the vector or the element that we insert into it are
  2262. // instructions that are defined in this basic block then we can't
  2263. // hoist this instruction.
  2264. Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
  2265. Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
  2266. if (CurrVec && L->contains(CurrVec))
  2267. continue;
  2268. if (NewElem && L->contains(NewElem))
  2269. continue;
  2270. // We can hoist this instruction. Move it to the pre-header.
  2271. Insert->moveBefore(PreHeader->getTerminator());
  2272. }
  2273. // Make a list of all reachable blocks in our CSE queue.
  2274. SmallVector<const DomTreeNode *, 8> CSEWorkList;
  2275. CSEWorkList.reserve(CSEBlocks.size());
  2276. for (BasicBlock *BB : CSEBlocks)
  2277. if (DomTreeNode *N = DT->getNode(BB)) {
  2278. assert(DT->isReachableFromEntry(N));
  2279. CSEWorkList.push_back(N);
  2280. }
  2281. // Sort blocks by domination. This ensures we visit a block after all blocks
  2282. // dominating it are visited.
  2283. std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
  2284. [this](const DomTreeNode *A, const DomTreeNode *B) {
  2285. return DT->properlyDominates(A, B);
  2286. });
  2287. // Perform O(N^2) search over the gather sequences and merge identical
  2288. // instructions. TODO: We can further optimize this scan if we split the
  2289. // instructions into different buckets based on the insert lane.
  2290. SmallVector<Instruction *, 16> Visited;
  2291. for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
  2292. assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
  2293. "Worklist not sorted properly!");
  2294. BasicBlock *BB = (*I)->getBlock();
  2295. // For all instructions in blocks containing gather sequences:
  2296. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
  2297. Instruction *In = it++;
  2298. if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
  2299. continue;
  2300. // Check if we can replace this instruction with any of the
  2301. // visited instructions.
  2302. for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
  2303. ve = Visited.end();
  2304. v != ve; ++v) {
  2305. if (In->isIdenticalTo(*v) &&
  2306. DT->dominates((*v)->getParent(), In->getParent())) {
  2307. In->replaceAllUsesWith(*v);
  2308. eraseInstruction(In);
  2309. In = nullptr;
  2310. break;
  2311. }
  2312. }
  2313. if (In) {
  2314. assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
  2315. Visited.push_back(In);
  2316. }
  2317. }
  2318. }
  2319. CSEBlocks.clear();
  2320. GatherSeq.clear();
  2321. }
2322. // Groups the instructions into a bundle (which is then a single scheduling entity)
  2323. // and schedules instructions until the bundle gets ready.
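// For example, a bundle of four scalar stores links its four ScheduleData
// nodes through NextInBundle, all sharing one FirstInBundle; the bundle only
// counts as ready once the unscheduled dependencies accumulated over all of
// its members reach zero.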
  2324. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
  2325. BoUpSLP *SLP) {
  2326. if (isa<PHINode>(VL[0]))
  2327. return true;
  2328. // Initialize the instruction bundle.
  2329. Instruction *OldScheduleEnd = ScheduleEnd;
  2330. ScheduleData *PrevInBundle = nullptr;
  2331. ScheduleData *Bundle = nullptr;
  2332. bool ReSchedule = false;
  2333. DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
  2334. for (Value *V : VL) {
  2335. extendSchedulingRegion(V);
  2336. ScheduleData *BundleMember = getScheduleData(V);
  2337. assert(BundleMember &&
  2338. "no ScheduleData for bundle member (maybe not in same basic block)");
  2339. if (BundleMember->IsScheduled) {
2340. // A bundle member was scheduled as a single instruction before and now
  2341. // needs to be scheduled as part of the bundle. We just get rid of the
  2342. // existing schedule.
  2343. DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
  2344. << " was already scheduled\n");
  2345. ReSchedule = true;
  2346. }
  2347. assert(BundleMember->isSchedulingEntity() &&
  2348. "bundle member already part of other bundle");
  2349. if (PrevInBundle) {
  2350. PrevInBundle->NextInBundle = BundleMember;
  2351. } else {
  2352. Bundle = BundleMember;
  2353. }
  2354. BundleMember->UnscheduledDepsInBundle = 0;
  2355. Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
  2356. // Group the instructions to a bundle.
  2357. BundleMember->FirstInBundle = Bundle;
  2358. PrevInBundle = BundleMember;
  2359. }
  2360. if (ScheduleEnd != OldScheduleEnd) {
  2361. // The scheduling region got new instructions at the lower end (or it is a
  2362. // new region for the first bundle). This makes it necessary to
  2363. // recalculate all dependencies.
  2364. // It is seldom that this needs to be done a second time after adding the
  2365. // initial bundle to the region.
  2366. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  2367. ScheduleData *SD = getScheduleData(I);
  2368. SD->clearDependencies();
  2369. }
  2370. ReSchedule = true;
  2371. }
  2372. if (ReSchedule) {
  2373. resetSchedule();
  2374. initialFillReadyList(ReadyInsts);
  2375. }
  2376. DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
  2377. << BB->getName() << "\n");
  2378. calculateDependencies(Bundle, true, SLP);
  2379. // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  2380. // means that there are no cyclic dependencies and we can schedule it.
2381. // Note that it's important that we don't "schedule" the bundle yet (see
  2382. // cancelScheduling).
  2383. while (!Bundle->isReady() && !ReadyInsts.empty()) {
  2384. ScheduleData *pickedSD = ReadyInsts.back();
  2385. ReadyInsts.pop_back();
  2386. if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
  2387. schedule(pickedSD, ReadyInsts);
  2388. }
  2389. }
  2390. return Bundle->isReady();
  2391. }
  2392. void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
  2393. if (isa<PHINode>(VL[0]))
  2394. return;
  2395. ScheduleData *Bundle = getScheduleData(VL[0]);
  2396. DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  2397. assert(!Bundle->IsScheduled &&
  2398. "Can't cancel bundle which is already scheduled");
  2399. assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
  2400. "tried to unbundle something which is not a bundle");
  2401. // Un-bundle: make single instructions out of the bundle.
  2402. ScheduleData *BundleMember = Bundle;
  2403. while (BundleMember) {
  2404. assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
  2405. BundleMember->FirstInBundle = BundleMember;
  2406. ScheduleData *Next = BundleMember->NextInBundle;
  2407. BundleMember->NextInBundle = nullptr;
  2408. BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
  2409. if (BundleMember->UnscheduledDepsInBundle == 0) {
  2410. ReadyInsts.insert(BundleMember);
  2411. }
  2412. BundleMember = Next;
  2413. }
  2414. }
  2415. void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  2416. if (getScheduleData(V))
  2417. return;
  2418. Instruction *I = dyn_cast<Instruction>(V);
  2419. assert(I && "bundle member must be an instruction");
  2420. assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  2421. if (!ScheduleStart) {
  2422. // It's the first instruction in the new region.
  2423. initScheduleData(I, I->getNextNode(), nullptr, nullptr);
  2424. ScheduleStart = I;
  2425. ScheduleEnd = I->getNextNode();
  2426. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2427. DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
  2428. return;
  2429. }
  2430. // Search up and down at the same time, because we don't know if the new
  2431. // instruction is above or below the existing scheduling region.
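// Both iterators advance one instruction per round of the loop below, so an
// instruction that is K instructions away from the nearer region boundary is
// found after roughly K iterations, whichever side it is on.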
  2432. BasicBlock::reverse_iterator UpIter(ScheduleStart);
  2433. BasicBlock::reverse_iterator UpperEnd = BB->rend();
  2434. BasicBlock::iterator DownIter(ScheduleEnd);
  2435. BasicBlock::iterator LowerEnd = BB->end();
  2436. for (;;) {
  2437. if (UpIter != UpperEnd) {
  2438. if (&*UpIter == I) {
  2439. initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  2440. ScheduleStart = I;
  2441. DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
  2442. return;
  2443. }
  2444. UpIter++;
  2445. }
  2446. if (DownIter != LowerEnd) {
  2447. if (&*DownIter == I) {
  2448. initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  2449. nullptr);
  2450. ScheduleEnd = I->getNextNode();
  2451. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2452. DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  2453. return;
  2454. }
  2455. DownIter++;
  2456. }
  2457. assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
  2458. "instruction not found in block");
  2459. }
  2460. }
  2461. void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
  2462. Instruction *ToI,
  2463. ScheduleData *PrevLoadStore,
  2464. ScheduleData *NextLoadStore) {
  2465. ScheduleData *CurrentLoadStore = PrevLoadStore;
  2466. for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
  2467. ScheduleData *SD = ScheduleDataMap[I];
  2468. if (!SD) {
  2469. // Allocate a new ScheduleData for the instruction.
  2470. if (ChunkPos >= ChunkSize) {
  2471. ScheduleDataChunks.push_back(
  2472. llvm::make_unique<ScheduleData[]>(ChunkSize));
  2473. ChunkPos = 0;
  2474. }
  2475. SD = &(ScheduleDataChunks.back()[ChunkPos++]);
  2476. ScheduleDataMap[I] = SD;
  2477. SD->Inst = I;
  2478. }
  2479. assert(!isInSchedulingRegion(SD) &&
  2480. "new ScheduleData already in scheduling region");
  2481. SD->init(SchedulingRegionID);
  2482. if (I->mayReadOrWriteMemory()) {
  2483. // Update the linked list of memory accessing instructions.
  2484. if (CurrentLoadStore) {
  2485. CurrentLoadStore->NextLoadStore = SD;
  2486. } else {
  2487. FirstLoadStoreInRegion = SD;
  2488. }
  2489. CurrentLoadStore = SD;
  2490. }
  2491. }
  2492. if (NextLoadStore) {
  2493. if (CurrentLoadStore)
  2494. CurrentLoadStore->NextLoadStore = NextLoadStore;
  2495. } else {
  2496. LastLoadStoreInRegion = CurrentLoadStore;
  2497. }
  2498. }
  2499. void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
  2500. bool InsertInReadyList,
  2501. BoUpSLP *SLP) {
  2502. assert(SD->isSchedulingEntity());
  2503. SmallVector<ScheduleData *, 10> WorkList;
  2504. WorkList.push_back(SD);
  2505. while (!WorkList.empty()) {
  2506. ScheduleData *SD = WorkList.back();
  2507. WorkList.pop_back();
  2508. ScheduleData *BundleMember = SD;
  2509. while (BundleMember) {
  2510. assert(isInSchedulingRegion(BundleMember));
  2511. if (!BundleMember->hasValidDependencies()) {
  2512. DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
  2513. BundleMember->Dependencies = 0;
  2514. BundleMember->resetUnscheduledDeps();
  2515. // Handle def-use chain dependencies.
  2516. for (User *U : BundleMember->Inst->users()) {
  2517. if (isa<Instruction>(U)) {
  2518. ScheduleData *UseSD = getScheduleData(U);
  2519. if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
  2520. BundleMember->Dependencies++;
  2521. ScheduleData *DestBundle = UseSD->FirstInBundle;
  2522. if (!DestBundle->IsScheduled) {
  2523. BundleMember->incrementUnscheduledDeps(1);
  2524. }
  2525. if (!DestBundle->hasValidDependencies()) {
  2526. WorkList.push_back(DestBundle);
  2527. }
  2528. }
  2529. } else {
  2530. // I'm not sure if this can ever happen. But we need to be safe.
2531. // This keeps the instruction/bundle from ever being scheduled and
2532. // eventually disables vectorization.
  2533. BundleMember->Dependencies++;
  2534. BundleMember->incrementUnscheduledDeps(1);
  2535. }
  2536. }
  2537. // Handle the memory dependencies.
  2538. ScheduleData *DepDest = BundleMember->NextLoadStore;
  2539. if (DepDest) {
  2540. Instruction *SrcInst = BundleMember->Inst;
  2541. MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
  2542. bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
  2543. unsigned numAliased = 0;
  2544. unsigned DistToSrc = 1;
  2545. while (DepDest) {
  2546. assert(isInSchedulingRegion(DepDest));
  2547. // We have two limits to reduce the complexity:
  2548. // 1) AliasedCheckLimit: It's a small limit to reduce calls to
  2549. // SLP->isAliased (which is the expensive part in this loop).
  2550. // 2) MaxMemDepDistance: It's for very large blocks and it aborts
  2551. // the whole loop (even if the loop is fast, it's quadratic).
  2552. // It's important for the loop break condition (see below) to
  2553. // check this limit even between two read-only instructions.
  2554. if (DistToSrc >= MaxMemDepDistance ||
  2555. ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
  2556. (numAliased >= AliasedCheckLimit ||
  2557. SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
  2558. // We increment the counter only if the locations are aliased
  2559. // (instead of counting all alias checks). This gives a better
  2560. // balance between reduced runtime and accurate dependencies.
  2561. numAliased++;
  2562. DepDest->MemoryDependencies.push_back(BundleMember);
  2563. BundleMember->Dependencies++;
  2564. ScheduleData *DestBundle = DepDest->FirstInBundle;
  2565. if (!DestBundle->IsScheduled) {
  2566. BundleMember->incrementUnscheduledDeps(1);
  2567. }
  2568. if (!DestBundle->hasValidDependencies()) {
  2569. WorkList.push_back(DestBundle);
  2570. }
  2571. }
  2572. DepDest = DepDest->NextLoadStore;
  2573. // Example, explaining the loop break condition: Let's assume our
  2574. // starting instruction is i0 and MaxMemDepDistance = 3.
  2575. //
  2576. // +--------v--v--v
  2577. // i0,i1,i2,i3,i4,i5,i6,i7,i8
  2578. // +--------^--^--^
  2579. //
  2580. // MaxMemDepDistance let us stop alias-checking at i3 and we add
  2581. // dependencies from i0 to i3,i4,.. (even if they are not aliased).
  2582. // Previously we already added dependencies from i3 to i6,i7,i8
  2583. // (because of MaxMemDepDistance). As we added a dependency from
  2584. // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
  2585. // and we can abort this loop at i6.
  2586. if (DistToSrc >= 2 * MaxMemDepDistance)
  2587. break;
  2588. DistToSrc++;
  2589. }
  2590. }
  2591. }
  2592. BundleMember = BundleMember->NextInBundle;
  2593. }
  2594. if (InsertInReadyList && SD->isReady()) {
  2595. ReadyInsts.push_back(SD);
  2596. DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
  2597. }
  2598. }
  2599. }
  2600. void BoUpSLP::BlockScheduling::resetSchedule() {
  2601. assert(ScheduleStart &&
  2602. "tried to reset schedule on block which has not been scheduled");
  2603. for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  2604. ScheduleData *SD = getScheduleData(I);
  2605. assert(isInSchedulingRegion(SD));
  2606. SD->IsScheduled = false;
  2607. SD->resetUnscheduledDeps();
  2608. }
  2609. ReadyInsts.clear();
  2610. }
  2611. void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  2612. if (!BS->ScheduleStart)
  2613. return;
  2614. DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  2615. BS->resetSchedule();
  2616. // For the real scheduling we use a more sophisticated ready-list: it is
  2617. // sorted by the original instruction location. This lets the final schedule
  2618. // be as close as possible to the original instruction order.
  2619. struct ScheduleDataCompare {
  2620. bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
  2621. return SD2->SchedulingPriority < SD1->SchedulingPriority;
  2622. }
  2623. };
  2624. std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2625. // Ensure that all dependency data is updated and fill the ready-list with
  2626. // initial instructions.
  2627. int Idx = 0;
  2628. int NumToSchedule = 0;
  2629. for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
  2630. I = I->getNextNode()) {
  2631. ScheduleData *SD = BS->getScheduleData(I);
  2632. assert(
  2633. SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
  2634. "scheduler and vectorizer have different opinion on what is a bundle");
  2635. SD->FirstInBundle->SchedulingPriority = Idx++;
  2636. if (SD->isSchedulingEntity()) {
  2637. BS->calculateDependencies(SD, false, this);
  2638. NumToSchedule++;
  2639. }
  2640. }
  2641. BS->initialFillReadyList(ReadyInsts);
  2642. Instruction *LastScheduledInst = BS->ScheduleEnd;
  2643. // Do the "real" scheduling.
  2644. while (!ReadyInsts.empty()) {
  2645. ScheduleData *picked = *ReadyInsts.begin();
  2646. ReadyInsts.erase(ReadyInsts.begin());
  2647. // Move the scheduled instruction(s) to their dedicated places, if not
  2648. // there yet.
  2649. ScheduleData *BundleMember = picked;
  2650. while (BundleMember) {
  2651. Instruction *pickedInst = BundleMember->Inst;
  2652. if (LastScheduledInst->getNextNode() != pickedInst) {
  2653. BS->BB->getInstList().remove(pickedInst);
  2654. BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
  2655. }
  2656. LastScheduledInst = pickedInst;
  2657. BundleMember = BundleMember->NextInBundle;
  2658. }
  2659. BS->schedule(picked, ReadyInsts);
  2660. NumToSchedule--;
  2661. }
  2662. assert(NumToSchedule == 0 && "could not schedule all instructions");
  2663. // Avoid duplicate scheduling of the block.
  2664. BS->ScheduleStart = nullptr;
  2665. }
  2666. /// The SLPVectorizer Pass.
  2667. struct SLPVectorizer : public FunctionPass {
  2668. typedef SmallVector<StoreInst *, 8> StoreList;
  2669. typedef MapVector<Value *, StoreList> StoreListMap;
  2670. /// Pass identification, replacement for typeid
  2671. static char ID;
  2672. explicit SLPVectorizer() : FunctionPass(ID) {
  2673. initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  2674. }
  2675. ScalarEvolution *SE;
  2676. TargetTransformInfo *TTI;
  2677. TargetLibraryInfo *TLI;
  2678. AliasAnalysis *AA;
  2679. LoopInfo *LI;
  2680. DominatorTree *DT;
  2681. AssumptionCache *AC;
  2682. bool runOnFunction(Function &F) override {
  2683. if (skipOptnoneFunction(F))
  2684. return false;
  2685. SE = &getAnalysis<ScalarEvolution>();
  2686. TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  2687. auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  2688. TLI = TLIP ? &TLIP->getTLI() : nullptr;
  2689. AA = &getAnalysis<AliasAnalysis>();
  2690. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  2691. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  2692. AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  2693. StoreRefs.clear();
  2694. bool Changed = false;
  2695. // If the target claims to have no vector registers don't attempt
  2696. // vectorization.
  2697. if (!TTI->getNumberOfRegisters(true))
  2698. return false;
  2699. // Use the vector register size specified by the target unless overridden
  2700. // by a command-line option.
  2701. // TODO: It would be better to limit the vectorization factor based on
  2702. // data type rather than just register size. For example, x86 AVX has
  2703. // 256-bit registers, but it does not support integer operations
  2704. // at that width (that requires AVX2).
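// For example, a target reporting 256-bit vector registers yields
// MaxVecRegSize = 256 here unless the command-line override below is set.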
  2705. if (MaxVectorRegSizeOption.getNumOccurrences())
  2706. MaxVecRegSize = MaxVectorRegSizeOption;
  2707. else
  2708. MaxVecRegSize = TTI->getRegisterBitWidth(true);
  2709. // Don't vectorize when the attribute NoImplicitFloat is used.
  2710. if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  2711. return false;
  2712. DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
2713. // Use the bottom-up SLP vectorizer to construct chains that start with
  2714. // store instructions.
  2715. BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
  2716. // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  2717. // delete instructions.
  2718. // Scan the blocks in the function in post order.
  2719. for (auto BB : post_order(&F.getEntryBlock())) {
  2720. // Vectorize trees that end at stores.
  2721. if (unsigned count = collectStores(BB, R)) {
  2722. (void)count;
  2723. DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
  2724. Changed |= vectorizeStoreChains(R);
  2725. }
  2726. // Vectorize trees that end at reductions.
  2727. Changed |= vectorizeChainsInBlock(BB, R);
  2728. }
  2729. if (Changed) {
  2730. R.optimizeGatherSequence();
  2731. DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  2732. DEBUG(verifyFunction(F));
  2733. }
  2734. return Changed;
  2735. }
  2736. void getAnalysisUsage(AnalysisUsage &AU) const override {
  2737. FunctionPass::getAnalysisUsage(AU);
  2738. AU.addRequired<AssumptionCacheTracker>();
  2739. AU.addRequired<ScalarEvolution>();
  2740. AU.addRequired<AliasAnalysis>();
  2741. AU.addRequired<TargetTransformInfoWrapperPass>();
  2742. AU.addRequired<LoopInfoWrapperPass>();
  2743. AU.addRequired<DominatorTreeWrapperPass>();
  2744. AU.addPreserved<LoopInfoWrapperPass>();
  2745. AU.addPreserved<DominatorTreeWrapperPass>();
  2746. AU.setPreservesCFG();
  2747. }
  2748. private:
  2749. /// \brief Collect memory references and sort them according to their base
  2750. /// object. We sort the stores to their base objects to reduce the cost of the
  2751. /// quadratic search on the stores. TODO: We can further reduce this cost
  2752. /// if we flush the chain creation every time we run into a memory barrier.
  2753. unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
  2754. /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
  2755. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
  2756. /// \brief Try to vectorize a list of operands.
2757. /// \param BuildVector A list of users to ignore for the purpose of
  2758. /// scheduling and that don't need extracting.
  2759. /// \returns true if a value was vectorized.
  2760. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  2761. ArrayRef<Value *> BuildVector = None,
  2762. bool allowReorder = false);
2763. /// \brief Try to vectorize a chain that may start at the operands of \p V.
  2764. bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
  2765. /// \brief Vectorize the stores that were collected in StoreRefs.
  2766. bool vectorizeStoreChains(BoUpSLP &R);
  2767. /// \brief Scan the basic block and look for patterns that are likely to start
  2768. /// a vectorization chain.
  2769. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
  2770. bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
  2771. BoUpSLP &R, unsigned VecRegSize);
  2772. bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
  2773. BoUpSLP &R);
  2774. private:
  2775. StoreListMap StoreRefs;
  2776. unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  2777. };
2778. /// \brief Check that the values in the slice of the VL array still exist in
  2779. /// the WeakVH array.
  2780. /// Vectorization of part of the VL array may cause later values in the VL array
  2781. /// to become invalid. We track when this has happened in the WeakVH array.
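/// A WeakVH follows RAUW and becomes null when its value is deleted, so if
/// any scalar in the slice was replaced or erased by vectorization, the
/// element-wise comparison below no longer matches and the slice is skipped.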
  2782. static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
  2783. unsigned SliceBegin, unsigned SliceSize) {
  2784. VL = VL.slice(SliceBegin, SliceSize);
  2785. VH = VH.slice(SliceBegin, SliceSize);
  2786. return !std::equal(VL.begin(), VL.end(), VH.begin());
  2787. }
  2788. bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
  2789. int CostThreshold, BoUpSLP &R,
  2790. unsigned VecRegSize) {
  2791. unsigned ChainLen = Chain.size();
  2792. DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
  2793. << "\n");
  2794. Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  2795. auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
  2796. unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  2797. unsigned VF = VecRegSize / Sz;
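// For example (sizes purely illustrative), VecRegSize = 128 with 32-bit
// store elements gives Sz = 32 and VF = 4, so the loop below tries groups of
// four consecutive stores at each offset.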
  2798. if (!isPowerOf2_32(Sz) || VF < 2)
  2799. return false;
  2800. // Keep track of values that were deleted by vectorizing in the loop below.
  2801. SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
  2802. bool Changed = false;
  2803. // Look for profitable vectorizable trees at all offsets, starting at zero.
  2804. for (unsigned i = 0, e = ChainLen; i < e; ++i) {
  2805. if (i + VF > e)
  2806. break;
  2807. // Check that a previous iteration of this loop did not delete the Value.
  2808. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
  2809. continue;
  2810. DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
  2811. << "\n");
  2812. ArrayRef<Value *> Operands = Chain.slice(i, VF);
  2813. R.buildTree(Operands);
  2814. int Cost = R.getTreeCost();
  2815. DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
  2816. if (Cost < CostThreshold) {
  2817. DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
  2818. R.vectorizeTree();
  2819. // Move to the next bundle.
  2820. i += VF - 1;
  2821. Changed = true;
  2822. }
  2823. }
  2824. return Changed;
  2825. }
  2826. bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
  2827. int costThreshold, BoUpSLP &R) {
  2828. SetVector<StoreInst *> Heads, Tails;
  2829. SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
  2830. // We may run into multiple chains that merge into a single chain. We mark the
  2831. // stores that we vectorized so that we don't visit the same store twice.
  2832. BoUpSLP::ValueSet VectorizedStores;
  2833. bool Changed = false;
  2834. // Do a quadratic search on all of the given stores and find
  2835. // all of the pairs of stores that follow each other.
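// For example (purely illustrative), stores to p[0], p[1] and p[2] record
// ConsecutiveChain[p[0]] = p[1] and ConsecutiveChain[p[1]] = p[2]; p[0] ends
// up only in Heads, p[2] only in Tails, and p[1] in both.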
  2836. for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
  2837. for (unsigned j = 0; j < e; ++j) {
  2838. if (i == j)
  2839. continue;
  2840. const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
  2841. if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
  2842. Tails.insert(Stores[j]);
  2843. Heads.insert(Stores[i]);
  2844. ConsecutiveChain[Stores[i]] = Stores[j];
  2845. }
  2846. }
  2847. }
  2848. // For stores that start but don't end a link in the chain:
  2849. for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
  2850. it != e; ++it) {
  2851. if (Tails.count(*it))
  2852. continue;
  2853. // We found a store instr that starts a chain. Now follow the chain and try
  2854. // to vectorize it.
  2855. BoUpSLP::ValueList Operands;
  2856. StoreInst *I = *it;
  2857. // Collect the chain into a list.
  2858. while (Tails.count(I) || Heads.count(I)) {
  2859. if (VectorizedStores.count(I))
  2860. break;
  2861. Operands.push_back(I);
  2862. // Move to the next value in the chain.
  2863. I = ConsecutiveChain[I];
  2864. }
  2865. // FIXME: Is division-by-2 the correct step? Should we assert that the
  2866. // register size is a power-of-2?
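// For example (sizes purely illustrative), with MaxVecRegSize = 256 and
// MinVecRegSize = 128 the chain is first tried at a 256-bit factor and, if
// that fails, again at 128 bits.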
  2867. for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
  2868. if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
  2869. // Mark the vectorized stores so that we don't vectorize them again.
  2870. VectorizedStores.insert(Operands.begin(), Operands.end());
  2871. Changed = true;
  2872. break;
  2873. }
  2874. }
  2875. }
  2876. return Changed;
  2877. }
  2878. unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
  2879. unsigned count = 0;
  2880. StoreRefs.clear();
  2881. const DataLayout &DL = BB->getModule()->getDataLayout();
  2882. for (Instruction &I : *BB) {
  2883. StoreInst *SI = dyn_cast<StoreInst>(&I);
  2884. if (!SI)
  2885. continue;
2886. // Don't touch volatile or atomic stores.
  2887. if (!SI->isSimple())
  2888. continue;
  2889. // Check that the pointer points to scalars.
  2890. Type *Ty = SI->getValueOperand()->getType();
  2891. if (!isValidElementType(Ty))
  2892. continue;
  2893. // Find the base pointer.
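// (Purely illustrative: stores to a[0], a[1], ... share the underlying
// object 'a' and therefore land in the same StoreRefs bucket, which
// vectorizeStores() later scans for consecutive accesses.)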
  2894. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
  2895. // Save the store locations.
  2896. StoreRefs[Ptr].push_back(SI);
  2897. count++;
  2898. }
  2899. return count;
  2900. }
  2901. bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  2902. if (!A || !B)
  2903. return false;
  2904. Value *VL[] = { A, B };
  2905. return tryToVectorizeList(VL, R, None, true);
  2906. }
  2907. bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  2908. ArrayRef<Value *> BuildVector,
  2909. bool allowReorder) {
  2910. if (VL.size() < 2)
  2911. return false;
  2912. DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
  2913. // Check that all of the parts are scalar instructions of the same type.
  2914. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  2915. if (!I0)
  2916. return false;
  2917. unsigned Opcode0 = I0->getOpcode();
  2918. const DataLayout &DL = I0->getModule()->getDataLayout();
  2919. Type *Ty0 = I0->getType();
  2920. unsigned Sz = DL.getTypeSizeInBits(Ty0);
  2921. // FIXME: Register size should be a parameter to this function, so we can
  2922. // try different vectorization factors.
  2923. unsigned VF = MinVecRegSize / Sz;
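// For example (sizes purely illustrative), MinVecRegSize = 128 with 32-bit
// operands gives VF = 4, so the list is processed in slices of up to four
// values below.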
  2924. for (Value *V : VL) {
  2925. Type *Ty = V->getType();
  2926. if (!isValidElementType(Ty))
  2927. return false;
  2928. Instruction *Inst = dyn_cast<Instruction>(V);
  2929. if (!Inst || Inst->getOpcode() != Opcode0)
  2930. return false;
  2931. }
  2932. bool Changed = false;
  2933. // Keep track of values that were deleted by vectorizing in the loop below.
  2934. SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
  2935. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  2936. unsigned OpsWidth = 0;
  2937. if (i + VF > e)
  2938. OpsWidth = e - i;
  2939. else
  2940. OpsWidth = VF;
  2941. if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
  2942. break;
  2943. // Check that a previous iteration of this loop did not delete the Value.
  2944. if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
  2945. continue;
  2946. DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
  2947. << "\n");
  2948. ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
  2949. ArrayRef<Value *> BuildVectorSlice;
  2950. if (!BuildVector.empty())
  2951. BuildVectorSlice = BuildVector.slice(i, OpsWidth);
  2952. R.buildTree(Ops, BuildVectorSlice);
2953. // TODO: Check whether we can allow reordering for cases other than
2954. // tryToVectorizePair().
  2955. if (allowReorder && R.shouldReorder()) {
  2956. assert(Ops.size() == 2);
  2957. assert(BuildVectorSlice.empty());
  2958. Value *ReorderedOps[] = { Ops[1], Ops[0] };
  2959. R.buildTree(ReorderedOps, None);
  2960. }
  2961. int Cost = R.getTreeCost();
  2962. if (Cost < -SLPCostThreshold) {
  2963. DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
  2964. Value *VectorizedRoot = R.vectorizeTree();
  2965. // Reconstruct the build vector by extracting the vectorized root. This
  2966. // way we handle the case where some elements of the vector are undefined.
2967. // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
  2968. if (!BuildVectorSlice.empty()) {
  2969. // The insert point is the last build vector instruction. The vectorized
  2970. // root will precede it. This guarantees that we get an instruction. The
  2971. // vectorized tree could have been constant folded.
  2972. Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
  2973. unsigned VecIdx = 0;
  2974. for (auto &V : BuildVectorSlice) {
  2975. IRBuilder<true, NoFolder> Builder(
  2976. ++BasicBlock::iterator(InsertAfter));
  2977. InsertElementInst *IE = cast<InsertElementInst>(V);
  2978. Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
  2979. VectorizedRoot, Builder.getInt32(VecIdx++)));
  2980. IE->setOperand(1, Extract);
  2981. IE->removeFromParent();
  2982. IE->insertAfter(Extract);
  2983. InsertAfter = IE;
  2984. }
  2985. }
  2986. // Move to the next bundle.
  2987. i += VF - 1;
  2988. Changed = true;
  2989. }
  2990. }
  2991. return Changed;
  2992. }
  2993. bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  2994. if (!V)
  2995. return false;
  2996. // Try to vectorize V.
  2997. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
  2998. return true;
  2999. BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  3000. BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  3001. // Try to skip B.
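// For example, with V = A + (B0 * B1), pairing A with B0 or with B1 may
// still form a vectorizable pair even though (A, B) itself did not.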
  3002. if (B && B->hasOneUse()) {
  3003. BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
  3004. BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
  3005. if (tryToVectorizePair(A, B0, R)) {
  3006. return true;
  3007. }
  3008. if (tryToVectorizePair(A, B1, R)) {
  3009. return true;
  3010. }
  3011. }
  3012. // Try to skip A.
  3013. if (A && A->hasOneUse()) {
  3014. BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
  3015. BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
  3016. if (tryToVectorizePair(A0, B, R)) {
  3017. return true;
  3018. }
  3019. if (tryToVectorizePair(A1, B, R)) {
  3020. return true;
  3021. }
  3022. }
3023. return false;
  3024. }
  3025. /// \brief Generate a shuffle mask to be used in a reduction tree.
  3026. ///
  3027. /// \param VecLen The length of the vector to be reduced.
  3028. /// \param NumEltsToRdx The number of elements that should be reduced in the
  3029. /// vector.
  3030. /// \param IsPairwise Whether the reduction is a pairwise or splitting
  3031. /// reduction. A pairwise reduction will generate a mask of
  3032. /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3033. /// <2,3,undef,undef> for a vector of 4 and NumEltsToRdx = 2.
  3034. /// \param IsLeft True will generate a mask of even elements, odd otherwise.
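/// For example (purely illustrative), with VecLen = 8 and NumEltsToRdx = 4 a
/// pairwise "left" mask is <0,2,4,6,undef,undef,undef,undef> and a splitting
/// mask is <4,5,6,7,undef,undef,undef,undef>.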
  3035. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
  3036. bool IsPairwise, bool IsLeft,
  3037. IRBuilder<> &Builder) {
  3038. assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
  3039. SmallVector<Constant *, 32> ShuffleMask(
  3040. VecLen, UndefValue::get(Builder.getInt32Ty()));
  3041. if (IsPairwise)
  3042. // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
  3043. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3044. ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  3045. else
  3046. // Move the upper half of the vector to the lower half.
  3047. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3048. ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
  3049. return ConstantVector::get(ShuffleMask);
  3050. }
  3051. /// Model horizontal reductions.
  3052. ///
  3053. /// A horizontal reduction is a tree of reduction operations (currently add and
3054. /// fadd) that has operations that can be put into a vector as its leaves.
  3055. /// For example, this tree:
  3056. ///
  3057. /// mul mul mul mul
  3058. /// \ / \ /
  3059. /// + +
  3060. /// \ /
  3061. /// +
  3062. /// This tree has "mul" as its reduced values and "+" as its reduction
  3063. /// operations. A reduction might be feeding into a store or a binary operation
  3064. /// feeding a phi.
  3065. /// ...
  3066. /// \ /
  3067. /// +
  3068. /// |
  3069. /// phi +=
  3070. ///
  3071. /// Or:
  3072. /// ...
  3073. /// \ /
  3074. /// +
  3075. /// |
  3076. /// *p =
  3077. ///
  3078. class HorizontalReduction {
  3079. SmallVector<Value *, 16> ReductionOps;
  3080. SmallVector<Value *, 32> ReducedVals;
  3081. BinaryOperator *ReductionRoot;
  3082. PHINode *ReductionPHI;
  3083. /// The opcode of the reduction.
  3084. unsigned ReductionOpcode;
  3085. /// The opcode of the values we perform a reduction on.
  3086. unsigned ReducedValueOpcode;
  3087. /// The width of one full horizontal reduction operation.
  3088. unsigned ReduxWidth;
  3089. /// Should we model this reduction as a pairwise reduction tree or a tree that
  3090. /// splits the vector in halves and adds those halves.
  3091. bool IsPairwiseReduction;
  3092. public:
  3093. HorizontalReduction()
  3094. : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
  3095. ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
  3096. /// \brief Try to find a reduction tree.
  3097. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
  3098. assert((!Phi ||
  3099. std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
  3100. "Thi phi needs to use the binary operator");
3101. // We could have an initial reduction that is not an add.
  3102. // r *= v1 + v2 + v3 + v4
  3103. // In such a case start looking for a tree rooted in the first '+'.
  3104. if (Phi) {
  3105. if (B->getOperand(0) == Phi) {
  3106. Phi = nullptr;
  3107. B = dyn_cast<BinaryOperator>(B->getOperand(1));
  3108. } else if (B->getOperand(1) == Phi) {
  3109. Phi = nullptr;
  3110. B = dyn_cast<BinaryOperator>(B->getOperand(0));
  3111. }
  3112. }
  3113. if (!B)
  3114. return false;
  3115. Type *Ty = B->getType();
  3116. if (!isValidElementType(Ty))
  3117. return false;
  3118. const DataLayout &DL = B->getModule()->getDataLayout();
  3119. ReductionOpcode = B->getOpcode();
  3120. ReducedValueOpcode = 0;
  3121. // FIXME: Register size should be a parameter to this function, so we can
  3122. // try different vectorization factors.
  3123. ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
  3124. ReductionRoot = B;
  3125. ReductionPHI = Phi;
  3126. if (ReduxWidth < 4)
  3127. return false;
  3128. // We currently only support adds.
  3129. if (ReductionOpcode != Instruction::Add &&
  3130. ReductionOpcode != Instruction::FAdd)
  3131. return false;
3132. // Post-order traverse the reduction tree starting at B. We only handle true
  3133. // trees containing only binary operators.
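// For example (purely illustrative), for ((a*b + c*d) + (e*f + g*h)) the
// traversal collects the four multiplies as ReducedVals and the three adds
// as ReductionOps.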
  3134. SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
  3135. Stack.push_back(std::make_pair(B, 0));
  3136. while (!Stack.empty()) {
  3137. BinaryOperator *TreeN = Stack.back().first;
  3138. unsigned EdgeToVist = Stack.back().second++;
  3139. bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
  3140. // Only handle trees in the current basic block.
  3141. if (TreeN->getParent() != B->getParent())
  3142. return false;
  3143. // Each tree node needs to have one user except for the ultimate
  3144. // reduction.
  3145. if (!TreeN->hasOneUse() && TreeN != B)
  3146. return false;
3147. // Postorder visit.
  3148. if (EdgeToVist == 2 || IsReducedValue) {
  3149. if (IsReducedValue) {
  3150. // Make sure that the opcodes of the operations that we are going to
  3151. // reduce match.
  3152. if (!ReducedValueOpcode)
  3153. ReducedValueOpcode = TreeN->getOpcode();
  3154. else if (ReducedValueOpcode != TreeN->getOpcode())
  3155. return false;
  3156. ReducedVals.push_back(TreeN);
  3157. } else {
  3158. // We need to be able to reassociate the adds.
  3159. if (!TreeN->isAssociative())
  3160. return false;
  3161. ReductionOps.push_back(TreeN);
  3162. }
  3163. // Retract.
  3164. Stack.pop_back();
  3165. continue;
  3166. }
  3167. // Visit left or right.
  3168. Value *NextV = TreeN->getOperand(EdgeToVist);
  3169. BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
  3170. if (Next)
  3171. Stack.push_back(std::make_pair(Next, 0));
  3172. else if (NextV != Phi)
  3173. return false;
  3174. }
  3175. return true;
  3176. }
  3177. /// \brief Attempt to vectorize the tree found by
  3178. /// matchAssociativeReduction.
  3179. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
  3180. if (ReducedVals.empty())
  3181. return false;
  3182. unsigned NumReducedVals = ReducedVals.size();
  3183. if (NumReducedVals < ReduxWidth)
  3184. return false;
  3185. Value *VectorizedTree = nullptr;
  3186. IRBuilder<> Builder(ReductionRoot);
  3187. FastMathFlags Unsafe;
  3188. Unsafe.setUnsafeAlgebra();
  3189. Builder.SetFastMathFlags(Unsafe);
  3190. unsigned i = 0;
  3191. for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
  3192. V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
  3193. // Estimate cost.
  3194. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
  3195. if (Cost >= -SLPCostThreshold)
  3196. break;
  3197. DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
  3198. << ". (HorRdx)\n");
  3199. // Vectorize a tree.
  3200. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
  3201. Value *VectorizedRoot = V.vectorizeTree();
  3202. // Emit a reduction.
  3203. Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
  3204. if (VectorizedTree) {
  3205. Builder.SetCurrentDebugLocation(Loc);
  3206. VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
  3207. ReducedSubTree, "bin.rdx");
  3208. } else
  3209. VectorizedTree = ReducedSubTree;
  3210. }
  3211. if (VectorizedTree) {
  3212. // Finish the reduction.
  3213. for (; i < NumReducedVals; ++i) {
  3214. Builder.SetCurrentDebugLocation(
  3215. cast<Instruction>(ReducedVals[i])->getDebugLoc());
  3216. VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
  3217. ReducedVals[i]);
  3218. }
  3219. // Update users.
  3220. if (ReductionPHI) {
  3221. assert(ReductionRoot && "Need a reduction operation");
  3222. ReductionRoot->setOperand(0, VectorizedTree);
  3223. ReductionRoot->setOperand(1, ReductionPHI);
  3224. } else
  3225. ReductionRoot->replaceAllUsesWith(VectorizedTree);
  3226. }
  3227. return VectorizedTree != nullptr;
  3228. }
  3229. private:
3230. /// \brief Calculate the cost of a reduction.
  3231. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
  3232. Type *ScalarTy = FirstReducedVal->getType();
  3233. Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
  3234. int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
  3235. int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
  3236. IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
  3237. int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
  3238. int ScalarReduxCost =
  3239. ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
  3240. DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
  3241. << " for reduction that starts with " << *FirstReducedVal
  3242. << " (It is a "
  3243. << (IsPairwiseReduction ? "pairwise" : "splitting")
  3244. << " reduction)\n");
  3245. return VecReduxCost - ScalarReduxCost;
  3246. }
  3247. static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
  3248. Value *R, const Twine &Name = "") {
  3249. if (Opcode == Instruction::FAdd)
  3250. return Builder.CreateFAdd(L, R, Name);
  3251. return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  3252. }
  3253. /// \brief Emit a horizontal reduction of the vectorized value.
  3254. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
  3255. assert(VectorizedValue && "Need to have a vectorized tree node");
  3256. assert(isPowerOf2_32(ReduxWidth) &&
  3257. "We only handle power-of-two reductions for now");
  3258. Value *TmpVec = VectorizedValue;
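// For example, with ReduxWidth = 4 the loop below runs for i = 2 and then
// i = 1, halving the number of live lanes each time; the scalar result ends
// up in lane 0 and is extracted after the loop.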
  3259. for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
  3260. if (IsPairwiseReduction) {
  3261. Value *LeftMask =
  3262. createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
  3263. Value *RightMask =
  3264. createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
  3265. Value *LeftShuf = Builder.CreateShuffleVector(
  3266. TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
  3267. Value *RightShuf = Builder.CreateShuffleVector(
  3268. TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
  3269. "rdx.shuf.r");
  3270. TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
  3271. "bin.rdx");
  3272. } else {
  3273. Value *UpperHalf =
  3274. createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
  3275. Value *Shuf = Builder.CreateShuffleVector(
  3276. TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
  3277. TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
  3278. }
  3279. }
  3280. // The result is in the first element of the vector.
  3281. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  3282. }
  3283. };
  3284. /// \brief Recognize construction of vectors like
  3285. /// %ra = insertelement <4 x float> undef, float %s0, i32 0
  3286. /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
  3287. /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
  3288. /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
  3289. ///
  3290. /// Returns true if it matches
  3291. ///
  3292. static bool findBuildVector(InsertElementInst *FirstInsertElem,
  3293. SmallVectorImpl<Value *> &BuildVector,
  3294. SmallVectorImpl<Value *> &BuildVectorOpds) {
  3295. if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
  3296. return false;
  3297. InsertElementInst *IE = FirstInsertElem;
  3298. while (true) {
  3299. BuildVector.push_back(IE);
  3300. BuildVectorOpds.push_back(IE->getOperand(1));
  3301. if (IE->use_empty())
  3302. return false;
  3303. InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
  3304. if (!NextUse)
  3305. return true;
  3306. // If this isn't the final use, make sure the next insertelement is the only
3307. // use. It's OK if the final constructed vector is used multiple times.
  3308. if (!IE->hasOneUse())
  3309. return false;
  3310. IE = NextUse;
  3311. }
  3312. return false;
  3313. }
  3314. static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  3315. return V->getType() < V2->getType();
  3316. }
  3317. bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  3318. bool Changed = false;
  3319. SmallVector<Value *, 4> Incoming;
  3320. SmallSet<Value *, 16> VisitedInstrs;
  3321. bool HaveVectorizedPhiNodes = true;
  3322. while (HaveVectorizedPhiNodes) {
  3323. HaveVectorizedPhiNodes = false;
  3324. // Collect the incoming values from the PHIs.
  3325. Incoming.clear();
  3326. for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
  3327. ++instr) {
  3328. PHINode *P = dyn_cast<PHINode>(instr);
  3329. if (!P)
  3330. break;
  3331. if (!VisitedInstrs.count(P))
  3332. Incoming.push_back(P);
  3333. }
  3334. // Sort by type.
  3335. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3336. // Try to vectorize elements based on their type.
  3337. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
  3338. E = Incoming.end();
  3339. IncIt != E;) {
  3340. // Look for the next elements with the same type.
  3341. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
  3342. while (SameTypeIt != E &&
  3343. (*SameTypeIt)->getType() == (*IncIt)->getType()) {
  3344. VisitedInstrs.insert(*SameTypeIt);
  3345. ++SameTypeIt;
  3346. }
  3347. // Try to vectorize them.
  3348. unsigned NumElts = (SameTypeIt - IncIt);
  3349. DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
  3350. if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3351. // Success, start over because instructions might have been changed.
  3352. HaveVectorizedPhiNodes = true;
  3353. Changed = true;
  3354. break;
  3355. }
  3356. // Start over at the next instruction of a different type (or the end).
  3357. IncIt = SameTypeIt;
  3358. }
  3359. }
  3360. VisitedInstrs.clear();
  3361. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3362. // We may go through BB multiple times so skip the ones we have already checked.
  3363. if (!VisitedInstrs.insert(it).second)
  3364. continue;
  3365. if (isa<DbgInfoIntrinsic>(it))
  3366. continue;
  3367. // Try to vectorize reductions that use PHINodes.
  3368. if (PHINode *P = dyn_cast<PHINode>(it)) {
  3369. // Check that the PHI is a reduction PHI.
  3370. if (P->getNumIncomingValues() != 2)
  3371. return Changed;
  3372. Value *Rdx =
  3373. (P->getIncomingBlock(0) == BB
  3374. ? (P->getIncomingValue(0))
  3375. : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
  3376. : nullptr));
  3377. // Check if this is a Binary Operator.
  3378. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
  3379. if (!BI)
  3380. continue;
  3381. // Try to match and vectorize a horizontal reduction.
  3382. HorizontalReduction HorRdx;
  3383. if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
  3384. HorRdx.tryToReduce(R, TTI)) {
  3385. Changed = true;
  3386. it = BB->begin();
  3387. e = BB->end();
  3388. continue;
  3389. }
  3390. Value *Inst = BI->getOperand(0);
  3391. if (Inst == P)
  3392. Inst = BI->getOperand(1);
  3393. if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
  3394. // We would like to start over since some instructions are deleted
3395. // and the iterator may become an invalid value.
  3396. Changed = true;
  3397. it = BB->begin();
  3398. e = BB->end();
  3399. continue;
  3400. }
  3401. continue;
  3402. }
  3403. // Try to vectorize horizontal reductions feeding into a store.
  3404. if (ShouldStartVectorizeHorAtStore)
  3405. if (StoreInst *SI = dyn_cast<StoreInst>(it))
  3406. if (BinaryOperator *BinOp =
  3407. dyn_cast<BinaryOperator>(SI->getValueOperand())) {
  3408. HorizontalReduction HorRdx;
  3409. if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
  3410. HorRdx.tryToReduce(R, TTI)) ||
  3411. tryToVectorize(BinOp, R))) {
  3412. Changed = true;
  3413. it = BB->begin();
  3414. e = BB->end();
  3415. continue;
  3416. }
  3417. }
  3418. // Try to vectorize horizontal reductions feeding into a return.
  3419. if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
  3420. if (RI->getNumOperands() != 0)
  3421. if (BinaryOperator *BinOp =
  3422. dyn_cast<BinaryOperator>(RI->getOperand(0))) {
  3423. DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
  3424. if (tryToVectorizePair(BinOp->getOperand(0),
  3425. BinOp->getOperand(1), R)) {
  3426. Changed = true;
  3427. it = BB->begin();
  3428. e = BB->end();
  3429. continue;
  3430. }
  3431. }
  3432. // Try to vectorize trees that start at compare instructions.
  3433. if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
  3434. if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
  3435. Changed = true;
  3436. // We would like to start over since some instructions are deleted
3437. // and the iterator may become an invalid value.
  3438. it = BB->begin();
  3439. e = BB->end();
  3440. continue;
  3441. }
  3442. for (int i = 0; i < 2; ++i) {
  3443. if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
  3444. if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
  3445. Changed = true;
  3446. // We would like to start over since some instructions are deleted
3447. // and the iterator may become an invalid value.
  3448. it = BB->begin();
  3449. e = BB->end();
  3450. break;
  3451. }
  3452. }
  3453. }
  3454. continue;
  3455. }
  3456. // Try to vectorize trees that start at insertelement instructions.
  3457. if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
  3458. SmallVector<Value *, 16> BuildVector;
  3459. SmallVector<Value *, 16> BuildVectorOpds;
  3460. if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
  3461. continue;
  3462. // Vectorize starting with the build vector operands ignoring the
  3463. // BuildVector instructions for the purpose of scheduling and user
  3464. // extraction.
  3465. if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
  3466. Changed = true;
  3467. it = BB->begin();
  3468. e = BB->end();
  3469. }
  3470. continue;
  3471. }
  3472. }
  3473. return Changed;
  3474. }
  3475. bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  3476. bool Changed = false;
  3477. // Attempt to sort and vectorize each of the store-groups.
  3478. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
  3479. it != e; ++it) {
  3480. if (it->second.size() < 2)
  3481. continue;
  3482. DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
  3483. << it->second.size() << ".\n");
  3484. // Process the stores in chunks of 16.
  3485. // TODO: The limit of 16 inhibits greater vectorization factors.
  3486. // For example, AVX2 supports v32i8. Increasing this limit, however,
  3487. // may cause a significant compile-time increase.
  3488. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
  3489. unsigned Len = std::min<unsigned>(CE - CI, 16);
  3490. Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
  3491. -SLPCostThreshold, R);
  3492. }
  3493. }
  3494. return Changed;
  3495. }
  3496. } // end anonymous namespace
  3497. char SLPVectorizer::ID = 0;
  3498. static const char lv_name[] = "SLP Vectorizer";
  3499. INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
  3500. INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
  3501. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  3502. INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  3503. INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
  3504. INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  3505. INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
  3506. namespace llvm {
  3507. Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
  3508. }