aoptx86.pas 266 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234
  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe
  3. This unit contains the peephole optimizer.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
unit aoptx86;

{$i fpcdefs.inc}

{ NOTE(review): DEBUG_AOPTCPU is defined unconditionally here, which makes
  SPeepholeOptimization a non-empty shortstring and enables DebugMsg output
  in all builds.  Upstream this switch is normally tied to EXTDEBUG — confirm
  whether leaving it always-on is intentional. }
{$define DEBUG_AOPTCPU}

interface

uses
  globtype,
  cpubase,
  aasmtai,aasmcpu,
  cgbase,cgutils,
  aopt,aoptobj;

type
  { Flags set during the pre-optimization pass to mark which of the more
    expensive peephole checks are worth attempting later on. }
  TOptsToCheck = (
    aoc_MovAnd2Mov_3
  );
  { x86-specific peephole optimizer.  Extends the generic TAsmOptimizer with
    register-tracking helpers and per-opcode handlers for the pre, pass-1,
    pass-2 and post peephole passes. }
  TX86AsmOptimizer = class(TAsmOptimizer)
    { some optimizations are very expensive to check, so the
      pre opt pass can be used to set some flags, depending on the found
      instructions if it is worth to check a certain optimization }
    OptsToCheck : set of TOptsToCheck;

    { register read/write analysis helpers }
    function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
    function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
    function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
    function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;

    {
      In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
      the use of a register by allocs/dealloc, so it can ignore calls.

      In the following example, GetNextInstructionUsingReg will return the second movq,
      GetNextInstructionUsingRegTrackingUse won't.

      movq %rdi,%rax
      # Register rdi released
      # Register rdi allocated
      movq %rax,%rdi

      While in this example:

      movq %rdi,%rax
      call proc
      movq %rdi,%rax

      GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
      won't.
    }
    function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
  private
    function SkipSimpleInstructions(var hp1: tai): Boolean;
  protected
    class function IsMOVZXAcceptable: Boolean; static; inline;

    { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
    function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
    { checks whether reading the value in reg1 depends on the value of reg2. This
      is very similar to SuperRegisterEquals, except it takes into account that
      R_SUBH and R_SUBL are independent (e.g. reading from AL does not
      depend on the value in AH). }
    function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;

    { Replaces all references to AOldReg in a memory reference to ANewReg }
    class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an operand to ANewReg }
    class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an instruction to ANewReg,
      except where the register is being written }
    function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
    { Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
      or writes to a global symbol }
    class function IsRefSafe(const ref: PReference): Boolean; static; inline;
    { Returns true if the given MOV instruction can be safely converted to CMOV }
    class function CanBeCMOV(p : tai) : boolean; static;
    function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    { emits s (with p's context) when DEBUG_AOPTCPU is defined; no-op otherwise }
    procedure DebugMsg(const s : string; p : tai);inline;
    class function IsExitCode(p : tai) : boolean; static;
    class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
    procedure RemoveLastDeallocForFuncRes(p : tai);
    function DoSubAddOpt(var p : tai) : Boolean;

    { pre-peephole pass handlers, one per opcode family }
    function PrePeepholeOptSxx(var p : tai) : boolean;
    function PrePeepholeOptIMUL(var p : tai) : boolean;

    { pass-1 handlers }
    function OptPass1AND(var p : tai) : boolean;
    function OptPass1_V_MOVAP(var p : tai) : boolean;
    function OptPass1VOP(var p : tai) : boolean;
    function OptPass1MOV(var p : tai) : boolean;
    function OptPass1Movx(var p : tai) : boolean;
    function OptPass1MOVXX(var p : tai) : boolean;
    function OptPass1OP(var p : tai) : boolean;
    function OptPass1LEA(var p : tai) : boolean;
    function OptPass1Sub(var p : tai) : boolean;
    function OptPass1SHLSAL(var p : tai) : boolean;
    function OptPass1SETcc(var p : tai) : boolean;
    function OptPass1FSTP(var p : tai) : boolean;
    function OptPass1FLD(var p : tai) : boolean;
    function OptPass1Cmp(var p : tai) : boolean;
    function OptPass1PXor(var p : tai) : boolean;

    { pass-2 handlers }
    function OptPass2MOV(var p : tai) : boolean;
    function OptPass2Imul(var p : tai) : boolean;
    function OptPass2Jmp(var p : tai) : boolean;
    function OptPass2Jcc(var p : tai) : boolean;
    function OptPass2Lea(var p: tai): Boolean;
    function OptPass2SUB(var p: tai): Boolean;

    { post-peephole handlers }
    function PostPeepholeOptMov(var p : tai) : Boolean;
{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
    function PostPeepholeOptMovzx(var p : tai) : Boolean;
    function PostPeepholeOptXor(var p : tai) : Boolean;
{$endif}
    function PostPeepholeOptMOVSX(var p : tai) : boolean;
    function PostPeepholeOptCmp(var p : tai) : Boolean;
    function PostPeepholeOptTestOr(var p : tai) : Boolean;
    function PostPeepholeOptCall(var p : tai) : Boolean;
    function PostPeepholeOptLea(var p : tai) : Boolean;
    function PostPeepholeOptPush(var p: tai): Boolean;

    procedure ConvertJumpToRET(const p: tai; const ret_p: tai);

    { Processor-dependent reference optimisation }
    class procedure OptimizeRefs(var p: taicpu); static;
  end;
{ Helpers shared by the optimizer passes: true when instr is an instruction
  whose opcode matches one of the given opcodes and (when opsize is non-empty)
  whose operand size is in opsize. }
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;

{ true when the operand is exactly the given register / immediate / equal to oper2 }
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;

function RefsEqual(const r1, r2: treference): boolean;
function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
{ returns true, if ref is a reference using only the registers passed as base and index
  and having an offset }
function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  138. implementation
  139. uses
  140. cutils,verbose,
  141. systems,
  142. globals,
  143. cpuinfo,
  144. procinfo,
  145. paramgr,
  146. aasmbase,
  147. aoptbase,aoptutils,
  148. symconst,symsym,
  149. cgx86,
  150. itcpugas;
{$ifdef DEBUG_AOPTCPU}
  { Prefix prepended to every DebugMsg emitted by this unit. }
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  160. function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  161. begin
  162. result :=
  163. (instr.typ = ait_instruction) and
  164. (taicpu(instr).opcode = op) and
  165. ((opsize = []) or (taicpu(instr).opsize in opsize));
  166. end;
  167. function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  168. begin
  169. result :=
  170. (instr.typ = ait_instruction) and
  171. ((taicpu(instr).opcode = op1) or
  172. (taicpu(instr).opcode = op2)
  173. ) and
  174. ((opsize = []) or (taicpu(instr).opsize in opsize));
  175. end;
  176. function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  177. begin
  178. result :=
  179. (instr.typ = ait_instruction) and
  180. ((taicpu(instr).opcode = op1) or
  181. (taicpu(instr).opcode = op2) or
  182. (taicpu(instr).opcode = op3)
  183. ) and
  184. ((opsize = []) or (taicpu(instr).opsize in opsize));
  185. end;
  186. function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
  187. const opsize : topsizes) : boolean;
  188. var
  189. op : TAsmOp;
  190. begin
  191. result:=false;
  192. for op in ops do
  193. begin
  194. if (instr.typ = ait_instruction) and
  195. (taicpu(instr).opcode = op) and
  196. ((opsize = []) or (taicpu(instr).opsize in opsize)) then
  197. begin
  198. result:=true;
  199. exit;
  200. end;
  201. end;
  202. end;
  203. function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  204. begin
  205. result := (oper.typ = top_reg) and (oper.reg = reg);
  206. end;
  207. function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  208. begin
  209. result := (oper.typ = top_const) and (oper.val = a);
  210. end;
  211. function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
  212. begin
  213. result := oper1.typ = oper2.typ;
  214. if result then
  215. case oper1.typ of
  216. top_const:
  217. Result:=oper1.val = oper2.val;
  218. top_reg:
  219. Result:=oper1.reg = oper2.reg;
  220. top_ref:
  221. Result:=RefsEqual(oper1.ref^, oper2.ref^);
  222. else
  223. internalerror(2013102801);
  224. end
  225. end;
  226. function RefsEqual(const r1, r2: treference): boolean;
  227. begin
  228. RefsEqual :=
  229. (r1.offset = r2.offset) and
  230. (r1.segment = r2.segment) and (r1.base = r2.base) and
  231. (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
  232. (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
  233. (r1.relsymbol = r2.relsymbol) and
  234. (r1.volatility=[]) and
  235. (r2.volatility=[]);
  236. end;
  237. function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  238. begin
  239. Result:=(ref.offset=0) and
  240. (ref.scalefactor in [0,1]) and
  241. (ref.segment=NR_NO) and
  242. (ref.symbol=nil) and
  243. (ref.relsymbol=nil) and
  244. ((base=NR_INVALID) or
  245. (ref.base=base)) and
  246. ((index=NR_INVALID) or
  247. (ref.index=index)) and
  248. (ref.volatility=[]);
  249. end;
  250. function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  251. begin
  252. Result:=(ref.scalefactor in [0,1]) and
  253. (ref.segment=NR_NO) and
  254. (ref.symbol=nil) and
  255. (ref.relsymbol=nil) and
  256. ((base=NR_INVALID) or
  257. (ref.base=base)) and
  258. ((index=NR_INVALID) or
  259. (ref.index=index)) and
  260. (ref.volatility=[]);
  261. end;
  262. function InstrReadsFlags(p: tai): boolean;
  263. begin
  264. InstrReadsFlags := true;
  265. case p.typ of
  266. ait_instruction:
  267. if InsProp[taicpu(p).opcode].Ch*
  268. [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
  269. Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
  270. Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
  271. exit;
  272. ait_label:
  273. exit;
  274. else
  275. ;
  276. end;
  277. InstrReadsFlags := false;
  278. end;
function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { Advances Next from Current until an instruction mentioning reg is
      found, or a barrier is reached: end of list, a non-instruction entry,
      or a call/jump.  Below -O3 the search stops after the very next
      instruction.  Note that Result only tells whether GetNextInstruction
      succeeded; the caller must still test what Next actually is. }
    Next:=Current;
    repeat
      Result:=GetNextInstruction(Next,Next);
    until not (Result) or
          not(cs_opt_level3 in current_settings.optimizerswitches) or
          (Next.typ<>ait_instruction) or
          RegInInstruction(reg,Next) or
          is_calljmp(taicpu(Next).opcode);
  end;
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { Like GetNextInstructionUsingReg, but walks the tai list manually and
      returns True only when an instruction that uses reg was actually
      found (calls excepted).  Below -O3 it degrades to a plain
      GetNextInstruction. }
    if not(cs_opt_level3 in current_settings.optimizerswitches) then
      begin
        Result:=GetNextInstruction(Current,Next);
        exit;
      end;
    Next:=tai(Current.Next);
    Result:=false;
    while assigned(Next) do
      begin
        { Stop at barriers: any jump (calls are tolerated), a register
          allocation event for the tracked super register, or a label that
          cannot be skipped (i.e. may be a jump target). }
        if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
           ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
           ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
          exit
        else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
          begin
            Result:=true;
            exit;
          end;
        Next:=tai(Next.Next);
      end;
  end;
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
  begin
    { Thin wrapper: on x86 an instruction "loads from" reg exactly when it
      reads it in any way. }
    Result:=RegReadByInstruction(reg,hp);
  end;
function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
  var
    p: taicpu;
    opcount: longint;
  begin
    { Returns True if instruction hp reads (any part of) reg.  Explicit
      operands, the implicit accumulator operands of MUL/IMUL/DIV/IDIV and
      the change information from the instruction table are all taken into
      account; for the flags register the test is refined down to the
      individual flag bits. }
    RegReadByInstruction := false;
    if hp.typ <> ait_instruction then
      exit;
    p := taicpu(hp);
    case p.opcode of
      A_CALL:
        { Conservative: the callee may read any register. }
        regreadbyinstruction := true;
      A_IMUL:
        case p.ops of
          1:
            { The one-operand form implicitly reads the accumulator; AH is
              not part of that read for the byte-sized form. }
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
               (
                ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
               );
          2,3:
            regReadByInstruction :=
              reginop(reg,p.oper[0]^) or
              reginop(reg,p.oper[1]^);
          else
            InternalError(2019112801);
        end;
      A_MUL:
        begin
          { MUL implicitly reads the accumulator in addition to its
            explicit operand (same AH exception as one-operand IMUL). }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
             (
              ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
              ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
             );
        end;
      A_IDIV,A_DIV:
        begin
          { Division reads the accumulator, and EDX as well except for the
            byte-sized form. }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
             (
              (getregtype(reg)=R_INTREGISTER) and
              (
               (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
              )
             );
        end;
      else
        begin
          { LEA only computes with the address components of its operand;
            a segment register is not actually accessed. }
          if (p.opcode=A_LEA) and is_segment_reg(reg) then
            begin
              RegReadByInstruction := false;
              exit;
            end;
          { Any register appearing inside a memory reference is read in
            order to form the address, regardless of operand direction. }
          for opcount := 0 to p.ops-1 do
            if (p.oper[opCount]^.typ = top_ref) and
               RegInRef(reg,p.oper[opcount]^.ref^) then
              begin
                RegReadByInstruction := true;
                exit
              end;
          { special handling for SSE MOVSD }
          if (p.opcode=A_MOVSD) and (p.ops>0) then
            begin
              { ops=0 would be the string instruction MOVSD; the SSE form
                always has exactly two operands. }
              if p.ops<>2 then
                internalerror(2017042702);
              { The register-to-register form merges into the destination,
                so the destination register counts as read too. }
              regReadByInstruction := reginop(reg,p.oper[0]^) or
                (
                  (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
                );
              exit;
            end;
          with insprop[p.opcode] do
            begin
              { Implicit reads of fixed integer registers as recorded in
                the instruction table. }
              if getregtype(reg)=R_INTREGISTER then
                begin
                  case getsupreg(reg) of
                    RS_EAX:
                      if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ECX:
                      if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDX:
                      if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBX:
                      if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESP:
                      if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBP:
                      if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESI:
                      if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDI:
                      if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                  end;
                end;
              if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
                begin
                  { For conditional instructions, work out exactly which
                    flag bits the condition tests (only meaningful when a
                    single flag bit, not the whole register, is asked for). }
                  if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                    begin
                      case p.condition of
                        C_A,C_NBE,       { CF=0 and ZF=0 }
                        C_BE,C_NA:       { CF=1 or ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                        C_AE,C_NB,C_NC,  { CF=0 }
                        C_B,C_NAE,C_C:   { CF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                        C_NE,C_NZ,       { ZF=0 }
                        C_E,C_Z:         { ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                        C_G,C_NLE,       { ZF=0 and SF=OF }
                        C_LE,C_NG:       { ZF=1 or SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_GE,C_NL,       { SF=OF }
                        C_L,C_NGE:       { SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_NO,            { OF=0 }
                        C_O:             { OF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                        C_NP,C_PO,       { PF=0 }
                        C_P,C_PE:        { PF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                        C_NS,            { SF=0 }
                        C_S:             { SF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                        else
                          internalerror(2017042701);
                      end;
                      if RegReadByInstruction then
                        exit;
                    end;
                  { Otherwise consult the per-flag read information from
                    the instruction table. }
                  case getsubreg(reg) of
                    R_SUBW,R_SUBD,R_SUBQ:
                      RegReadByInstruction :=
                        [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                         Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                         Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                    R_SUBFLAGCARRY:
                      RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGPARITY:
                      RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGAUXILIARY:
                      RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGZERO:
                      RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGSIGN:
                      RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGOVERFLOW:
                      RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGINTERRUPT:
                      RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGDIRECTION:
                      RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    else
                      internalerror(2017042601);
                  end;
                  exit;
                end;
              { Instructions flagged Ch_NoReadIfEqualRegs do not depend on
                the old operand value when source and destination are the
                same register (e.g. a register zeroed by itself). }
              if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                 (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                 (p.oper[0]^.reg=p.oper[1]^.reg) then
                exit;
              { Finally: explicit operands that the table marks as read or
                modified. }
              if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
            end;
        end;
    end;
  end;
function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
  begin
    { Returns True if p1 reads or writes (any part of) Reg, using the
      per-opcode change information; falls back to the generic operand scan
      of the inherited implementation when the table gives no answer. }
    result:=false;
    if p1.typ<>ait_instruction then
      exit;
    if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    if (getregtype(reg)=R_INTREGISTER) and
       { the change information for xmm movsd is not correct }
       ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
      begin
        { Check the implicit fixed-register accesses from the table. }
        case getsupreg(reg) of
          { RS_EAX = RS_RAX on x86-64 }
          RS_EAX:
            result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ECX:
            result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDX:
            result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBX:
            result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESP:
            result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBP:
            result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESI:
            result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDI:
            result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
      begin
        { Whole-flags accesses cover every flag bit... }
        if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
          exit(true);
        { ...otherwise test only the specific flag bit asked for. }
        case getsubreg(reg) of
          R_SUBFLAGCARRY:
            Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGPARITY:
            Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGAUXILIARY:
            Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGZERO:
            Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGSIGN:
            Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGOVERFLOW:
            Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGINTERRUPT:
            Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGDIRECTION:
            Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    { Fall back to the generic operand-based check. }
    Result:=inherited RegInInstruction(Reg, p1);
  end;
function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
  begin
    { Returns True if p1 may change (any part of) Reg.  Flags are handled
      bit by bit via the instruction table; a handful of opcodes whose
      table entries are incomplete get explicit treatment. }
    Result := False;
    if p1.typ <> ait_instruction then
      exit;
    with insprop[taicpu(p1).opcode] do
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          case getsubreg(reg) of
            R_SUBW,R_SUBD,R_SUBQ:
              Result :=
                [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                 Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                 Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGCARRY:
              Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGPARITY:
              Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGAUXILIARY:
              Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGZERO:
              Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGSIGN:
              Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGOVERFLOW:
              Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGINTERRUPT:
              Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGDIRECTION:
              Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            else
              internalerror(2017042602);
          end;
          exit;
        end;
    case taicpu(p1).opcode of
      A_CALL:
        { We could potentially set Result to False if the register in
          question is non-volatile for the subroutine's calling convention,
          but this would require detecting the calling convention in use and
          also assuming that the routine doesn't contain malformed assembly
          language, for example... so it could only be done under -O4 as it
          would be considered a side-effect. [Kit] }
        Result := True;
      A_MOVSD:
        { special handling for SSE MOVSD: ops=0 is the string instruction,
          otherwise only a register destination is actually modified }
        if (taicpu(p1).ops>0) then
          begin
            if taicpu(p1).ops<>2 then
              internalerror(2017042703);
            Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
          end;
      { VMOVSS and VMOVSD have two and three operand flavours, this cannot
        be modelled by x86ins.dat so fix it here (FK)
      }
      A_VMOVSS,
      A_VMOVSD:
        begin
          Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
          exit;
        end;
      A_IMUL:
        { The destination is always the last operand, whichever form. }
        Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
      else
        ;
    end;
    if Result then
      exit;
    with insprop[taicpu(p1).opcode] do
      begin
        { Implicit writes of fixed integer registers from the table. }
        if getregtype(reg)=R_INTREGISTER then
          begin
            case getsupreg(reg) of
              RS_EAX:
                if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ECX:
                if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDX:
                if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBX:
                if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESP:
                if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBP:
                if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESI:
                if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDI:
                if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
            end;
          end;
        { Explicit operands that the table marks as written or modified. }
        if ([CH_RWOP1,CH_WOP1,CH_MOP1]*Ch<>[]) and reginop(reg,taicpu(p1).oper[0]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and reginop(reg,taicpu(p1).oper[1]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and reginop(reg,taicpu(p1).oper[2]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and reginop(reg,taicpu(p1).oper[3]^) then
          begin
            Result := true;
            exit
          end;
      end;
  end;
{$ifdef DEBUG_AOPTCPU}
  { Emits the debug message s as an assembler comment immediately before p. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
    begin
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
    end;

  { Integer-to-string helper for debug output. }
  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := tostr(i);
    end;

  { Register name in AT&T style, e.g. '%eax'. }
  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '%' + std_regname(r);
    end;

  { Debug output function - creates a string representation of an operator }
  function debug_operstr(oper: TOper): string;
    begin
      case oper.typ of
        top_const:
          Result := '$' + debug_tostr(oper.val);
        top_reg:
          Result := debug_regname(oper.reg);
        top_ref:
          begin
            { AT&T memory operand syntax: offset(base,index,scale) }
            if oper.ref^.offset <> 0 then
              Result := debug_tostr(oper.ref^.offset) + '('
            else
              Result := '(';
            if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
              begin
                Result := Result + debug_regname(oper.ref^.base);
                if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                  Result := Result + ',' + debug_regname(oper.ref^.index);
              end
            else
              if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                Result := Result + debug_regname(oper.ref^.index);
            if (oper.ref^.scalefactor > 1) then
              Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
            else
              Result := Result + ')';
          end;
        else
          Result := '[UNKNOWN]';
      end;
    end;

  { Opcode mnemonic for debug output. }
  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := std_op2str[opcode];
    end;

  { Operand size suffix (GAS style) for debug output. }
  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := gas_opsize2str[opsize];
    end;
{$else DEBUG_AOPTCPU}
  { Release-build stubs: the helpers collapse to no-ops/empty strings so
    the optimizer can remove the calling debug code entirely. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
    begin
    end;

  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := '';
    end;

  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '';
    end;

  function debug_operstr(oper: TOper): string; inline;
    begin
      Result := '';
    end;

  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := '';
    end;

  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := '';
    end;
{$endif DEBUG_AOPTCPU}
{ Decides whether emitting MOVZX is acceptable for the current target
  (it may be slower than alternative sequences on older 32-bit CPUs). }
class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
  begin
{$ifdef x86_64}
    { Always fine on x86-64 }
    Result := True;
{$else x86_64}
    Result :=
{$ifdef i8086}
      { MOVZX itself requires at least a 386 }
      (current_settings.cputype >= cpu_386) and
{$endif i8086}
      (
        { Always accept if optimising for size }
        (cs_opt_size in current_settings.optimizerswitches) or
        { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
        (current_settings.optimizecputype >= cpu_Pentium2)
      );
{$endif x86_64}
  end;
  839. function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  840. begin
  841. if not SuperRegistersEqual(reg1,reg2) then
  842. exit(false);
  843. if getregtype(reg1)<>R_INTREGISTER then
  844. exit(true); {because SuperRegisterEqual is true}
  845. case getsubreg(reg1) of
  846. { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
  847. higher, it preserves the high bits, so the new value depends on
  848. reg2's previous value. In other words, it is equivalent to doing:
  849. reg2 := (reg2 and $ffffff00) or byte(reg1); }
  850. R_SUBL:
  851. exit(getsubreg(reg2)=R_SUBL);
  852. { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
  853. higher, it actually does a:
  854. reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
  855. R_SUBH:
  856. exit(getsubreg(reg2)=R_SUBH);
  857. { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
  858. bits of reg2:
  859. reg2 := (reg2 and $ffff0000) or word(reg1); }
  860. R_SUBW:
  861. exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
  862. { a write to R_SUBD always overwrites every other subregister,
  863. because it clears the high 32 bits of R_SUBQ on x86_64 }
  864. R_SUBD,
  865. R_SUBQ:
  866. exit(true);
  867. else
  868. internalerror(2017042801);
  869. end;
  870. end;
  871. function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  872. begin
  873. if not SuperRegistersEqual(reg1,reg2) then
  874. exit(false);
  875. if getregtype(reg1)<>R_INTREGISTER then
  876. exit(true); {because SuperRegisterEqual is true}
  877. case getsubreg(reg1) of
  878. R_SUBL:
  879. exit(getsubreg(reg2)<>R_SUBH);
  880. R_SUBH:
  881. exit(getsubreg(reg2)<>R_SUBL);
  882. R_SUBW,
  883. R_SUBD,
  884. R_SUBQ:
  885. exit(true);
  886. else
  887. internalerror(2017042802);
  888. end;
  889. end;
function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  var
    hp1 : tai;
    l : TCGInt;
  begin
    result:=false;
    { changes the code sequence
        shr/sar const1, x
        shl     const2, x
      to
      either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
    if GetNextInstruction(p, hp1) and
       MatchInstruction(hp1,A_SHL,[]) and
       (taicpu(p).oper[0]^.typ = top_const) and
       (taicpu(hp1).oper[0]^.typ = top_const) and
       (taicpu(hp1).opsize = taicpu(p).opsize) and
       (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
       OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
      begin
        if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
           not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 > const2:
              shift right by (const1-const2), then AND away the const2 low
              bits the original shl would have cleared.  Note p's constant
              must be replaced before hp1's is read below. }
            taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
            taicpu(hp1).opcode := A_AND;
            l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
            { The mask is the complement of l, truncated to the operand size. }
            case taicpu(p).opsize Of
              S_B: taicpu(hp1).loadConst(0,l Xor $ff);
              S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
              S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050703)
            end;
          end
        else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
                not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 < const2:
              first AND away the const1 low bits (p becomes the AND, and is
              executed first), then shift left by (const2-const1). }
            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050702)
            end;
          end
        else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 = const2:
              the shifts cancel except for clearing the low const1 bits, so
              a single AND suffices and the shl can be removed. }
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050701)
            end;
            asml.remove(hp1);
            hp1.free;
          end;
      end;
  end;
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  var
    opsize : topsize;
    hp1 : tai;
    tmpref : treference;
    ShiftValue : Cardinal;
    BaseValue : TCGInt;
  begin
    result:=false;
    opsize:=taicpu(p).opsize;
    { changes certain "imul const, %reg"'s to lea sequences }
    if (MatchOpType(taicpu(p),top_const,top_reg) or
        MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
       (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
      if (taicpu(p).oper[0]^.val = 1) then
        if (taicpu(p).ops = 2) then
          { remove "imul $1, reg" }
          begin
            DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
            Result := RemoveCurrentP(p);
          end
        else
          { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
          begin
            hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
            p.free;
            p := hp1;
          end
      else if ((taicpu(p).ops <= 2) or
               (taicpu(p).oper[2]^.typ = Top_Reg)) and
              not(cs_opt_size in current_settings.optimizerswitches) and
              { do not touch the instruction when a following Jcc tests the
                overflow condition imul would have set }
              (not(GetNextInstruction(p, hp1)) or
               not((tai(hp1).typ = ait_instruction) and
                   ((taicpu(hp1).opcode=A_Jcc) and
                    (taicpu(hp1).condition in [C_O,C_NO])))) then
        begin
          {
            imul X, reg1, reg2 to
              lea (reg1,reg1,Y), reg2
              shl ZZ,reg2
            imul XX, reg1 to
              lea (reg1,reg1,YY), reg1
              shl ZZ,reg1
            This optimization makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
            it does not exist as a separate optimization target in FPC though.
            This optimization can be applied as long as only two bits are set in the constant and those two bits are separated by
            at most two zeros
          }
          reference_reset(tmpref,1,[]);
          if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
            begin
              { Split the constant into (BaseValue shl ShiftValue); the two
                set bits at distance <=3 guarantee BaseValue is 3, 5 or 9,
                exactly the multipliers lea can encode via scale 2, 4, 8. }
              ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
              BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
              TmpRef.base := taicpu(p).oper[1]^.reg;
              TmpRef.index := taicpu(p).oper[1]^.reg;
              if not(BaseValue in [3,5,9]) then
                Internalerror(2018110101);
              TmpRef.ScaleFactor := BaseValue-1;
              if (taicpu(p).ops = 2) then
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
              else
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
              AsmL.InsertAfter(hp1,p);
              DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
              taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
              RemoveCurrentP(p, hp1);
              if ShiftValue>0 then
                AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
            end;
        end;
  end;
{ Returns True if the instruction hp loads a completely new value into
  reg: the register is overwritten in its entirety and the written value
  does not depend on reg's previous contents.  For the flags register
  (and its individual flag sub-registers) the decision is made from the
  instruction's change set in the instruction property table rather than
  from its operands. }
function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
var
  p: taicpu;
begin
  if not assigned(hp) or
    (hp.typ <> ait_instruction) then
    begin
      Result := false;
      exit;
    end;
  p := taicpu(hp);
  if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
    with insprop[p.opcode] do
      begin
        case getsubreg(reg) of
          { The whole flags register only counts as newly written if
            every individual flag does }
          R_SUBW,R_SUBD,R_SUBQ:
            Result:=
              RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
              RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
              RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
              RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
              RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
              RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
          { An individual flag counts as newly written if the instruction
            writes it in any form (cleared, set, modified or undefined) }
          R_SUBFLAGCARRY:
            Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGPARITY:
            Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGAUXILIARY:
            Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGZERO:
            Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGSIGN:
            Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGOVERFLOW:
            Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGINTERRUPT:
            Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
          R_SUBFLAGDIRECTION:
            Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
          else
            begin
              writeln(getsubreg(reg));
              internalerror(2017050501);
            end;
        end;
        exit;
      end;
  Result :=
    { Plain load instructions: the destination must overwrite reg
      entirely and the source must not depend on reg }
    (((p.opcode = A_MOV) or
      (p.opcode = A_MOVZX) or
      (p.opcode = A_MOVSX) or
      (p.opcode = A_LEA) or
      (p.opcode = A_VMOVSS) or
      (p.opcode = A_VMOVSD) or
      (p.opcode = A_VMOVAPD) or
      (p.opcode = A_VMOVAPS) or
      (p.opcode = A_VMOVQ) or
      (p.opcode = A_MOVSS) or
      (p.opcode = A_MOVSD) or
      (p.opcode = A_MOVQ) or
      (p.opcode = A_MOVAPD) or
      (p.opcode = A_MOVAPS) or
{$ifndef x86_64}
      (p.opcode = A_LDS) or
      (p.opcode = A_LES) or
{$endif not x86_64}
      (p.opcode = A_LFS) or
      (p.opcode = A_LGS) or
      (p.opcode = A_LSS)) and
     (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
     (p.oper[1]^.typ = top_reg) and
     (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
     ((p.oper[0]^.typ = top_const) or
      ((p.oper[0]^.typ = top_reg) and
       not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
      ((p.oper[0]^.typ = top_ref) and
       not RegInRef(reg,p.oper[0]^.ref^)))) or
    { POP always overwrites its destination register }
    ((p.opcode = A_POP) and
     (Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg))) or
    { 3-operand IMUL writes only to its last operand }
    ((p.opcode = A_IMUL) and
     (p.ops=3) and
     (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
     (((p.oper[1]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg))) or
      ((p.oper[1]^.typ=top_ref) and not(RegInRef(reg,p.oper[1]^.ref^))))) or
    { 1-operand MUL/IMUL writes its implicit result registers; the
      explicit operand must not depend on reg either }
    ((((p.opcode = A_IMUL) or
       (p.opcode = A_MUL)) and
      (p.ops=1)) and
     (((p.oper[0]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
      ((p.oper[0]^.typ=top_ref) and not(RegInRef(reg,p.oper[0]^.ref^)))) and
     (((p.opsize=S_B) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
      ((p.opsize=S_W) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
      ((p.opsize=S_L) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg))
{$ifdef x86_64}
      or ((p.opsize=S_Q) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg))
{$endif x86_64}
     )) or
    { sign-extension instructions overwrite their implicit destination }
    ((p.opcode = A_CWD) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
    ((p.opcode = A_CDQ) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)) or
{$ifdef x86_64}
    ((p.opcode = A_CQO) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)) or
{$endif x86_64}
    ((p.opcode = A_CBW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
    { far-pointer loads overwrite the corresponding segment register }
{$ifndef x86_64}
    ((p.opcode = A_LDS) and (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
    ((p.opcode = A_LES) and (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$endif not x86_64}
    ((p.opcode = A_LFS) and (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
    ((p.opcode = A_LGS) and (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
    ((p.opcode = A_LSS) and (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
    { further instructions with implicit destination registers }
{$ifndef x86_64}
    ((p.opcode = A_AAM) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
{$endif not x86_64}
    ((p.opcode = A_LAHF) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
    ((p.opcode = A_LODSB) and Reg1WriteOverwritesReg2Entirely(NR_AL,reg)) or
    ((p.opcode = A_LODSW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg)) or
    ((p.opcode = A_LODSD) and Reg1WriteOverwritesReg2Entirely(NR_EAX,reg)) or
{$ifdef x86_64}
    ((p.opcode = A_LODSQ) and Reg1WriteOverwritesReg2Entirely(NR_RAX,reg)) or
{$endif x86_64}
    ((p.opcode = A_SETcc) and (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
    (((p.opcode = A_FSTSW) or
      (p.opcode = A_FNSTSW)) and
     (p.oper[0]^.typ=top_reg) and
     Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
    { XOR/SUB/SBB of a register with itself produces a value that does
      not depend on the register's previous contents (SBB still depends
      on the carry flag, but not on reg itself) }
    (((p.opcode = A_XOR) or (p.opcode = A_SUB) or (p.opcode = A_SBB)) and
     (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
     (p.oper[0]^.reg=p.oper[1]^.reg) and
     Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg));
end;
{ Returns True if p starts a routine's exit sequence: a plain RET,
  LEAVE+RET, a stack-pointer adjusting LEA+RET, or a frame-pointer
  restore (MOV or LEA into the stack pointer, then a POP of the frame
  pointer) followed by RET. }
class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
var
  hp2,hp3 : tai;
begin
  { some x86-64 issue a NOP before the real exit code }
  if MatchInstruction(p,A_NOP,[]) then
    GetNextInstruction(p,p);
  result:=assigned(p) and (p.typ=ait_instruction) and
    { plain RET }
    ((taicpu(p).opcode = A_RET) or
     { LEAVE followed by RET }
     ((taicpu(p).opcode=A_LEAVE) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_RET,[S_NO])
     ) or
     { "lea x(%esp),%esp" followed by RET }
     (((taicpu(p).opcode=A_LEA) and
       MatchOpType(taicpu(p),top_ref,top_reg) and
       (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
       (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
      ) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_RET,[S_NO])
     ) or
     { "mov %ebp,%esp" or "lea x(%ebp),%esp", then "pop %ebp",
       then RET }
     ((((taicpu(p).opcode=A_MOV) and
        MatchOpType(taicpu(p),top_reg,top_reg) and
        (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
        (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
       ((taicpu(p).opcode=A_LEA) and
        MatchOpType(taicpu(p),top_ref,top_reg) and
        (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
        (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
       )
      ) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
      MatchOpType(taicpu(hp2),top_reg) and
      (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
      GetNextInstruction(hp2,hp3) and
      MatchInstruction(hp3,A_RET,[S_NO])
     )
    );
end;
  1207. class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  1208. begin
  1209. isFoldableArithOp := False;
  1210. case hp1.opcode of
  1211. A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
  1212. isFoldableArithOp :=
  1213. ((taicpu(hp1).oper[0]^.typ = top_const) or
  1214. ((taicpu(hp1).oper[0]^.typ = top_reg) and
  1215. (taicpu(hp1).oper[0]^.reg <> reg))) and
  1216. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1217. (taicpu(hp1).oper[1]^.reg = reg);
  1218. A_INC,A_DEC,A_NEG,A_NOT:
  1219. isFoldableArithOp :=
  1220. (taicpu(hp1).oper[0]^.typ = top_reg) and
  1221. (taicpu(hp1).oper[0]^.reg = reg);
  1222. else
  1223. ;
  1224. end;
  1225. end;
{ Removes the last deallocation marker of the function-result
  register(s) that precedes p, so that they remain allocated through
  the exit code.  Which registers are affected depends on the routine's
  return type. }
procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);

  { Walks backwards from p and removes the nearest "dealloc" regalloc
    entry for the given integer super-register; the search stops as
    soon as an instruction that uses the register is encountered. }
  procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
  var
    hp2: tai;
  begin
    hp2 := p;
    repeat
      hp2 := tai(hp2.previous);
      if assigned(hp2) and
        (hp2.typ = ait_regalloc) and
        (tai_regalloc(hp2).ratype=ra_dealloc) and
        (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
        (getsupreg(tai_regalloc(hp2).reg) = supreg) then
        begin
          asml.remove(hp2);
          hp2.free;
          break;
        end;
    until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
  end;

begin
  case current_procinfo.procdef.returndef.typ of
    { these return types use (E)AX for the result }
    arraydef,recorddef,pointerdef,
    stringdef,enumdef,procdef,objectdef,errordef,
    filedef,setdef,procvardef,
    classrefdef,forwarddef:
      DoRemoveLastDeallocForFuncRes(RS_EAX);
    orddef:
      if current_procinfo.procdef.returndef.size <> 0 then
        begin
          DoRemoveLastDeallocForFuncRes(RS_EAX);
          { for int64/qword }
          if current_procinfo.procdef.returndef.size = 8 then
            DoRemoveLastDeallocForFuncRes(RS_EDX);
        end;
    else
      ;
  end;
end;
{ Pass-1 optimisations for (V)MOVAPS/(V)MOVAPD register-to-register
  moves: removes no-op moves, merges consecutive moves, folds a move
  pair around VFMA-style operations, and folds a move pair around
  scalar ADDS*/SUBS*/MULS*/DIVS* operations. }
function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
var
  hp1,hp2 : tai;
begin
  result:=false;
  if MatchOpType(taicpu(p),top_reg,top_reg) then
    begin
      { vmova* reg1,reg1
        =>
        <nop> }
      if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
        begin
          RemoveCurrentP(p);
          result:=true;
          exit;
        end
      else if GetNextInstruction(p,hp1) then
        begin
          if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
            MatchOpType(taicpu(hp1),top_reg,top_reg) and
            MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
            begin
              { vmova* reg1,reg2
                vmova* reg2,reg3
                dealloc reg2
                =>
                vmova* reg1,reg3 }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                  taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                  asml.Remove(hp1);
                  hp1.Free;
                  result:=true;
                  exit;
                end
              { special case:
                vmova* reg1,reg2
                vmova* reg2,reg1
                =>
                vmova* reg1,reg2 }
              else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
                begin
                  DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                  asml.Remove(hp1);
                  hp1.Free;
                  result:=true;
                  exit;
                end
            end
          { an aligned packed move feeding a scalar move of the same
            precision can be replaced by the scalar move alone }
          else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
            MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
            ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
            MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
            ) and
            MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
            begin
              { vmova* reg1,reg2
                vmovs* reg2,<op>
                dealloc reg2
                =>
                vmovs* reg1,reg3 }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                  taicpu(p).opcode:=taicpu(hp1).opcode;
                  taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                  asml.Remove(hp1);
                  hp1.Free;
                  result:=true;
                  exit;
                end
            end;
        end;
      if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
        begin
          { vmova* reg1,reg2
            vfma* ...,...,reg2
            vmova* reg2,reg1
            =>
            vfma* ...,...,reg1 (if reg2 is not used afterwards) }
          if MatchInstruction(hp1,[A_VFMADDPD,
            A_VFMADD132PD, A_VFMADD132PS, A_VFMADD132SD, A_VFMADD132SS,
            A_VFMADD213PD, A_VFMADD213PS, A_VFMADD213SD, A_VFMADD213SS,
            A_VFMADD231PD, A_VFMADD231PS, A_VFMADD231SD, A_VFMADD231SS,
            A_VFMADDSUB132PD, A_VFMADDSUB132PS,
            A_VFMADDSUB213PD, A_VFMADDSUB213PS,
            A_VFMADDSUB231PD, A_VFMADDSUB231PS,
            A_VFMSUB132PD, A_VFMSUB132PS, A_VFMSUB132SD, A_VFMSUB132SS,
            A_VFMSUB213PD, A_VFMSUB213PS, A_VFMSUB213SD, A_VFMSUB213SS,
            A_VFMSUB231PD, A_VFMSUB231PS, A_VFMSUB231SD, A_VFMSUB231SS,
            A_VFMSUBADD132PD, A_VFMSUBADD132PS,
            A_VFMSUBADD213PD, A_VFMSUBADD213PS,
            A_VFMSUBADD231PD, A_VFMSUBADD231PS,
            A_VFNMADD132PD, A_VFNMADD132PS, A_VFNMADD132SD, A_VFNMADD132SS,
            A_VFNMADD213PD, A_VFNMADD213PS, A_VFNMADD213SD, A_VFNMADD213SS,
            A_VFNMADD231PD, A_VFNMADD231PS, A_VFNMADD231SD, A_VFNMADD231SS,
            A_VFNMSUB132PD, A_VFNMSUB132PS, A_VFNMSUB132SD, A_VFNMSUB132SS,
            A_VFNMSUB213PD, A_VFNMSUB213PS, A_VFNMSUB213SD, A_VFNMSUB213SS,
            A_VFNMSUB231PD, A_VFNMSUB231PS, A_VFNMSUB231SD, A_VFNMSUB231SS],[S_NO]) and
            { we mix single and double operations here because we assume that the compiler
              generates vmovapd only after double operations and vmovaps only after single operations }
            MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
            GetNextInstruction(hp1,hp2) and
            MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
            MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                begin
                  taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                  RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
                  asml.Remove(hp2);
                  hp2.Free;
                end;
            end
          else if (hp1.typ = ait_instruction) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2,taicpu(p).opcode,[]) and
            OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
            MatchOpType(taicpu(hp2),top_reg,top_reg) and
            MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
            (((taicpu(p).opcode=A_MOVAPS) and
              ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
               (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
             ((taicpu(p).opcode=A_MOVAPD) and
              ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
               (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
            ) then
            { change
              movapX reg,reg2
              addsX/subsX/... reg3, reg2
              movapX reg2,reg
              to
              addsX/subsX/... reg3,reg
            }
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                    debug_op2str(taicpu(p).opcode)+' '+
                    debug_op2str(taicpu(hp1).opcode)+' '+
                    debug_op2str(taicpu(hp2).opcode)+') done',p);
                  { we cannot eliminate the first move if
                    the operations uses the same register for source and dest }
                  if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                    RemoveCurrentP(p, nil);
                  p:=hp1;
                  taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                  asml.remove(hp2);
                  hp2.Free;
                  result:=true;
                end;
            end;
        end;
    end;
end;
{ Pass-1 optimisation for three-operand AVX arithmetic: folds a
  following register-to-register VMOVAP* into the operation's
  destination operand when the intermediate register dies. }
function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
    V<Op>X %mreg1,%mreg2,%mreg3
    VMovX %mreg3,%mreg4
    dealloc %mreg3
    by
    V<Op>X %mreg1,%mreg2,%mreg4
    ?
  }
  if GetNextInstruction(p,hp1) and
    { we mix single and double operations here because we assume that the compiler
      generates vmovapd only after double operations and vmovaps only after single operations }
    MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
    MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { %mreg3 (the source of the VMovX) must no longer be in use
        after hp1 }
      if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
        begin
          { write the result of the V<Op>X directly into %mreg4 }
          taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
          asml.Remove(hp1);
          hp1.Free;
          result:=true;
        end;
    end;
end;
  1500. { Replaces all references to AOldReg in a memory reference to ANewReg }
  1501. class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  1502. var
  1503. OldSupReg: TSuperRegister;
  1504. OldSubReg, MemSubReg: TSubRegister;
  1505. begin
  1506. Result := False;
  1507. { For safety reasons, only check for exact register matches }
  1508. { Check base register }
  1509. if (ref.base = AOldReg) then
  1510. begin
  1511. ref.base := ANewReg;
  1512. Result := True;
  1513. end;
  1514. { Check index register }
  1515. if (ref.index = AOldReg) then
  1516. begin
  1517. ref.index := ANewReg;
  1518. Result := True;
  1519. end;
  1520. end;
  1521. { Replaces all references to AOldReg in an operand to ANewReg }
  1522. class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  1523. var
  1524. OldSupReg, NewSupReg: TSuperRegister;
  1525. OldSubReg, NewSubReg, MemSubReg: TSubRegister;
  1526. OldRegType: TRegisterType;
  1527. ThisOper: POper;
  1528. begin
  1529. ThisOper := p.oper[OperIdx]; { Faster to access overall }
  1530. Result := False;
  1531. if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
  1532. InternalError(2020011801);
  1533. OldSupReg := getsupreg(AOldReg);
  1534. OldSubReg := getsubreg(AOldReg);
  1535. OldRegType := getregtype(AOldReg);
  1536. NewSupReg := getsupreg(ANewReg);
  1537. NewSubReg := getsubreg(ANewReg);
  1538. if OldRegType <> getregtype(ANewReg) then
  1539. InternalError(2020011802);
  1540. if OldSubReg <> NewSubReg then
  1541. InternalError(2020011803);
  1542. case ThisOper^.typ of
  1543. top_reg:
  1544. if (
  1545. (ThisOper^.reg = AOldReg) or
  1546. (
  1547. (OldRegType = R_INTREGISTER) and
  1548. (getsupreg(ThisOper^.reg) = OldSupReg) and
  1549. (getregtype(ThisOper^.reg) = R_INTREGISTER) and
  1550. (
  1551. (getsubreg(ThisOper^.reg) <= OldSubReg)
  1552. {$ifndef x86_64}
  1553. and (
  1554. { Under i386 and i8086, ESI, EDI, EBP and ESP
  1555. don't have an 8-bit representation }
  1556. (getsubreg(ThisOper^.reg) >= R_SUBW) or
  1557. not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
  1558. )
  1559. {$endif x86_64}
  1560. )
  1561. )
  1562. ) then
  1563. begin
  1564. ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));;
  1565. Result := True;
  1566. end;
  1567. top_ref:
  1568. if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
  1569. Result := True;
  1570. else
  1571. ;
  1572. end;
  1573. end;
  1574. { Replaces all references to AOldReg in an instruction to ANewReg }
  1575. function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  1576. const
  1577. ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
  1578. var
  1579. OperIdx: Integer;
  1580. begin
  1581. Result := False;
  1582. for OperIdx := 0 to p.ops - 1 do
  1583. if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) and
  1584. { The shift and rotate instructions can only use CL }
  1585. not (
  1586. (OperIdx = 0) and
  1587. { This second condition just helps to avoid unnecessarily
  1588. calling MatchInstruction for 10 different opcodes }
  1589. (p.oper[0]^.reg = NR_CL) and
  1590. MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
  1591. ) then
  1592. Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  1593. end;
  1594. class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean; inline;
  1595. begin
  1596. Result :=
  1597. (ref^.index = NR_NO) and
  1598. (
  1599. {$ifdef x86_64}
  1600. (
  1601. (ref^.base = NR_RIP) and
  1602. (ref^.refaddr in [addr_pic, addr_pic_no_got])
  1603. ) or
  1604. {$endif x86_64}
  1605. (ref^.base = NR_STACK_POINTER_REG) or
  1606. (ref^.base = current_procinfo.framepointer)
  1607. );
  1608. end;
  1609. function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  1610. var
  1611. CurrentReg, ReplaceReg: TRegister;
  1612. SubReg: TSubRegister;
  1613. begin
  1614. Result := False;
  1615. ReplaceReg := taicpu(p_mov).oper[0]^.reg;
  1616. CurrentReg := taicpu(p_mov).oper[1]^.reg;
  1617. case hp.opcode of
  1618. A_FSTSW, A_FNSTSW,
  1619. A_IN, A_INS, A_OUT, A_OUTS,
  1620. A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
  1621. { These routines have explicit operands, but they are restricted in
  1622. what they can be (e.g. IN and OUT can only read from AL, AX or
  1623. EAX. }
  1624. Exit;
  1625. A_IMUL:
  1626. begin
  1627. { The 1-operand version writes to implicit registers
  1628. The 2-operand version reads from the first operator, and reads
  1629. from and writes to the second (equivalent to Ch_ROp1, ChRWOp2).
  1630. the 3-operand version reads from a register that it doesn't write to
  1631. }
  1632. case hp.ops of
  1633. 1:
  1634. if (
  1635. (
  1636. (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
  1637. ) or
  1638. not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
  1639. ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
  1640. begin
  1641. Result := True;
  1642. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
  1643. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1644. end;
  1645. 2:
  1646. { Only modify the first parameter }
  1647. if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
  1648. begin
  1649. Result := True;
  1650. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
  1651. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1652. end;
  1653. 3:
  1654. { Only modify the second parameter }
  1655. if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
  1656. begin
  1657. Result := True;
  1658. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
  1659. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1660. end;
  1661. else
  1662. InternalError(2020012901);
  1663. end;
  1664. end;
  1665. else
  1666. if (hp.ops > 0) and
  1667. ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
  1668. begin
  1669. Result := True;
  1670. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
  1671. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1672. end;
  1673. end;
  1674. end;
  1675. function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  1676. var
  1677. hp1, hp2, hp3: tai;
  1678. procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
  1679. begin
  1680. if taicpu(hp1).opcode = signed_movop then
  1681. begin
  1682. if taicpu(p).oper[0]^.val > max_value shr 1 then
  1683. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
  1684. end
  1685. else
  1686. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
  1687. end;
  1688. var
  1689. GetNextInstruction_p, TempRegUsed: Boolean;
  1690. PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
  1691. NewSize: topsize;
  1692. CurrentReg: TRegister;
  1693. begin
  1694. Result:=false;
  1695. GetNextInstruction_p:=GetNextInstruction(p, hp1);
  1696. { remove mov reg1,reg1? }
  1697. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^)
  1698. then
  1699. begin
  1700. DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
  1701. { take care of the register (de)allocs following p }
  1702. RemoveCurrentP(p, hp1);
  1703. Result:=true;
  1704. exit;
  1705. end;
  1706. { All the next optimisations require a next instruction }
  1707. if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
  1708. Exit;
  1709. { Look for:
  1710. mov %reg1,%reg2
  1711. ??? %reg2,r/m
  1712. Change to:
  1713. mov %reg1,%reg2
  1714. ??? %reg1,r/m
  1715. }
  1716. if MatchOpType(taicpu(p), top_reg, top_reg) then
  1717. begin
  1718. CurrentReg := taicpu(p).oper[1]^.reg;
  1719. if RegReadByInstruction(CurrentReg, hp1) and
  1720. DeepMOVOpt(taicpu(p), taicpu(hp1)) then
  1721. begin
  1722. TransferUsedRegs(TmpUsedRegs);
  1723. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  1724. if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
  1725. { Just in case something didn't get modified (e.g. an
  1726. implicit register) }
  1727. not RegReadByInstruction(CurrentReg, hp1) then
  1728. begin
  1729. { We can remove the original MOV }
  1730. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
  1731. Asml.Remove(p);
  1732. p.Free;
  1733. p := hp1;
  1734. { TmpUsedRegs contains the results of "UpdateUsedRegs(tai(p.Next))" already,
  1735. so just restore it to UsedRegs instead of calculating it again }
  1736. RestoreUsedRegs(TmpUsedRegs);
  1737. Result := True;
  1738. Exit;
  1739. end;
  1740. { If we know a MOV instruction has become a null operation, we might as well
  1741. get rid of it now to save time. }
  1742. if (taicpu(hp1).opcode = A_MOV) and
  1743. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1744. SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
  1745. { Just being a register is enough to confirm it's a null operation }
  1746. (taicpu(hp1).oper[0]^.typ = top_reg) then
  1747. begin
  1748. Result := True;
  1749. { Speed-up to reduce a pipeline stall... if we had something like...
  1750. movl %eax,%edx
  1751. movw %dx,%ax
  1752. ... the second instruction would change to movw %ax,%ax, but
  1753. given that it is now %ax that's active rather than %eax,
  1754. penalties might occur due to a partial register write, so instead,
  1755. change it to a MOVZX instruction when optimising for speed.
  1756. }
  1757. if not (cs_opt_size in current_settings.optimizerswitches) and
  1758. IsMOVZXAcceptable and
  1759. (taicpu(hp1).opsize < taicpu(p).opsize)
  1760. {$ifdef x86_64}
  1761. { operations already implicitly set the upper 64 bits to zero }
  1762. and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
  1763. {$endif x86_64}
  1764. then
  1765. begin
  1766. CurrentReg := taicpu(hp1).oper[1]^.reg;
  1767. DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
  1768. case taicpu(p).opsize of
  1769. S_W:
  1770. if taicpu(hp1).opsize = S_B then
  1771. taicpu(hp1).opsize := S_BL
  1772. else
  1773. InternalError(2020012911);
  1774. S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
  1775. case taicpu(hp1).opsize of
  1776. S_B:
  1777. taicpu(hp1).opsize := S_BL;
  1778. S_W:
  1779. taicpu(hp1).opsize := S_WL;
  1780. else
  1781. InternalError(2020012912);
  1782. end;
  1783. else
  1784. InternalError(2020012910);
  1785. end;
  1786. taicpu(hp1).opcode := A_MOVZX;
  1787. taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
  1788. end
  1789. else
  1790. begin
  1791. GetNextInstruction_p := GetNextInstruction(hp1, hp2);
  1792. DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
  1793. asml.remove(hp1);
  1794. hp1.free;
  1795. { The instruction after what was hp1 is now the immediate next instruction,
  1796. so we can continue to make optimisations if it's present }
  1797. if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
  1798. Exit;
  1799. hp1 := hp2;
  1800. end;
  1801. end;
  1802. end;
  1803. end;
  1804. { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
  1805. overwrites the original destination register. e.g.
  1806. movl ###,%reg2d
  1807. movslq ###,%reg2q (### doesn't have to be the same as the first one)
  1808. In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
  1809. }
  1810. if (taicpu(p).oper[1]^.typ = top_reg) and
  1811. MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
  1812. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1813. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
  1814. begin
  1815. if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
  1816. begin
  1817. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  1818. case taicpu(p).oper[0]^.typ of
  1819. top_const:
  1820. { We have something like:
  1821. movb $x, %regb
  1822. movzbl %regb,%regd
  1823. Change to:
  1824. movl $x, %regd
  1825. }
  1826. begin
  1827. case taicpu(hp1).opsize of
  1828. S_BW:
  1829. begin
  1830. convert_mov_value(A_MOVSX, $FF);
  1831. setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
  1832. taicpu(p).opsize := S_W;
  1833. end;
  1834. S_BL:
  1835. begin
  1836. convert_mov_value(A_MOVSX, $FF);
  1837. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1838. taicpu(p).opsize := S_L;
  1839. end;
  1840. S_WL:
  1841. begin
  1842. convert_mov_value(A_MOVSX, $FFFF);
  1843. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1844. taicpu(p).opsize := S_L;
  1845. end;
  1846. {$ifdef x86_64}
  1847. S_BQ:
  1848. begin
  1849. convert_mov_value(A_MOVSX, $FF);
  1850. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1851. taicpu(p).opsize := S_Q;
  1852. end;
  1853. S_WQ:
  1854. begin
  1855. convert_mov_value(A_MOVSX, $FFFF);
  1856. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1857. taicpu(p).opsize := S_Q;
  1858. end;
  1859. S_LQ:
  1860. begin
  1861. convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
  1862. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1863. taicpu(p).opsize := S_Q;
  1864. end;
  1865. {$endif x86_64}
  1866. else
  1867. { If hp1 was a MOV instruction, it should have been
  1868. optimised already }
  1869. InternalError(2020021001);
  1870. end;
  1871. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
  1872. asml.Remove(hp1);
  1873. hp1.Free;
  1874. Result := True;
  1875. Exit;
  1876. end;
  1877. top_ref:
  1878. { We have something like:
  1879. movb mem, %regb
  1880. movzbl %regb,%regd
  1881. Change to:
  1882. movzbl mem, %regd
  1883. }
  1884. if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
  1885. begin
  1886. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
  1887. taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
  1888. RemoveCurrentP(p, hp1);
  1889. Result:=True;
  1890. Exit;
  1891. end;
  1892. else
  1893. if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
  1894. { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
  1895. Exit;
  1896. end;
  1897. end
  1898. { The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
  1899. and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
  1900. optimised }
  1901. else
  1902. begin
  1903. DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
  1904. RemoveCurrentP(p, hp1);
  1905. Result := True;
  1906. Exit;
  1907. end;
  1908. end;
  1909. if (taicpu(hp1).opcode = A_AND) and
  1910. (taicpu(p).oper[1]^.typ = top_reg) and
  1911. MatchOpType(taicpu(hp1),top_const,top_reg) then
  1912. begin
  1913. if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
  1914. begin
  1915. case taicpu(p).opsize of
  1916. S_L:
  1917. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  1918. begin
  1919. { Optimize out:
  1920. mov x, %reg
  1921. and ffffffffh, %reg
  1922. }
  1923. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
  1924. asml.remove(hp1);
  1925. hp1.free;
  1926. Result:=true;
  1927. exit;
  1928. end;
  1929. S_Q: { TODO: Confirm if this is even possible }
  1930. if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
  1931. begin
  1932. { Optimize out:
  1933. mov x, %reg
  1934. and ffffffffffffffffh, %reg
  1935. }
  1936. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
  1937. asml.remove(hp1);
  1938. hp1.free;
  1939. Result:=true;
  1940. exit;
  1941. end;
  1942. else
  1943. ;
  1944. end;
  1945. if ((taicpu(p).oper[0]^.typ=top_reg) or
  1946. ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
  1947. GetNextInstruction(hp1,hp2) and
  1948. MatchInstruction(hp2,A_TEST,[taicpu(p).opsize]) and
  1949. MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) and
  1950. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) and
  1951. GetNextInstruction(hp2,hp3) and
  1952. MatchInstruction(hp3,A_Jcc,A_Setcc,[S_NO]) and
  1953. (taicpu(hp3).condition in [C_E,C_NE]) then
  1954. begin
  1955. TransferUsedRegs(TmpUsedRegs);
  1956. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  1957. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  1958. if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
  1959. begin
  1960. DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
  1961. taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
  1962. taicpu(hp1).opcode:=A_TEST;
  1963. asml.Remove(hp2);
  1964. hp2.free;
  1965. RemoveCurrentP(p, hp1);
  1966. Result:=true;
  1967. exit;
  1968. end;
  1969. end;
  1970. end
  1971. else if IsMOVZXAcceptable and
  1972. (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
  1973. (taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
  1974. (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  1975. then
  1976. begin
  1977. InputVal := debug_operstr(taicpu(p).oper[0]^);
  1978. MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
  1979. case taicpu(p).opsize of
  1980. S_B:
  1981. if (taicpu(hp1).oper[0]^.val = $ff) then
  1982. begin
  1983. { Convert:
  1984. movb x, %regl movb x, %regl
  1985. andw ffh, %regw andl ffh, %regd
  1986. To:
  1987. movzbw x, %regd movzbl x, %regd
  1988. (Identical registers, just different sizes)
  1989. }
  1990. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
  1991. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
  1992. case taicpu(hp1).opsize of
  1993. S_W: NewSize := S_BW;
  1994. S_L: NewSize := S_BL;
  1995. {$ifdef x86_64}
  1996. S_Q: NewSize := S_BQ;
  1997. {$endif x86_64}
  1998. else
  1999. InternalError(2018011510);
  2000. end;
  2001. end
  2002. else
  2003. NewSize := S_NO;
  2004. S_W:
  2005. if (taicpu(hp1).oper[0]^.val = $ffff) then
  2006. begin
  2007. { Convert:
  2008. movw x, %regw
  2009. andl ffffh, %regd
  2010. To:
  2011. movzwl x, %regd
  2012. (Identical registers, just different sizes)
  2013. }
  2014. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
  2015. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
  2016. case taicpu(hp1).opsize of
  2017. S_L: NewSize := S_WL;
  2018. {$ifdef x86_64}
  2019. S_Q: NewSize := S_WQ;
  2020. {$endif x86_64}
  2021. else
  2022. InternalError(2018011511);
  2023. end;
  2024. end
  2025. else
  2026. NewSize := S_NO;
  2027. else
  2028. NewSize := S_NO;
  2029. end;
  2030. if NewSize <> S_NO then
  2031. begin
  2032. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
  2033. { The actual optimization }
  2034. taicpu(p).opcode := A_MOVZX;
  2035. taicpu(p).changeopsize(NewSize);
  2036. taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
  2037. { Safeguard if "and" is followed by a conditional command }
  2038. TransferUsedRegs(TmpUsedRegs);
  2039. UpdateUsedRegs(TmpUsedRegs,tai(p.next));
  2040. if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  2041. begin
  2042. { At this point, the "and" command is effectively equivalent to
  2043. "test %reg,%reg". This will be handled separately by the
  2044. Peephole Optimizer. [Kit] }
  2045. DebugMsg(SPeepholeOptimization + PreMessage +
  2046. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2047. end
  2048. else
  2049. begin
  2050. DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
  2051. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2052. asml.Remove(hp1);
  2053. hp1.Free;
  2054. end;
  2055. Result := True;
  2056. Exit;
  2057. end;
  2058. end;
  2059. end;
  2060. { Next instruction is also a MOV ? }
  2061. if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
  2062. begin
  2063. if (taicpu(p).oper[1]^.typ = top_reg) and
  2064. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2065. begin
  2066. CurrentReg := taicpu(p).oper[1]^.reg;
  2067. TransferUsedRegs(TmpUsedRegs);
  2068. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2069. { we have
  2070. mov x, %treg
  2071. mov %treg, y
  2072. }
  2073. if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
  2074. if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
  2075. { we've got
  2076. mov x, %treg
  2077. mov %treg, y
  2078. with %treg is not used after }
  2079. case taicpu(p).oper[0]^.typ Of
  2080. { top_reg is covered by DeepMOVOpt }
  2081. top_const:
  2082. begin
  2083. { change
  2084. mov const, %treg
  2085. mov %treg, y
  2086. to
  2087. mov const, y
  2088. }
  2089. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  2090. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2091. begin
  2092. if taicpu(hp1).oper[1]^.typ=top_reg then
  2093. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2094. taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
  2095. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
  2096. asml.remove(hp1);
  2097. hp1.free;
  2098. Result:=true;
  2099. Exit;
  2100. end;
  2101. end;
  2102. top_ref:
  2103. if (taicpu(hp1).oper[1]^.typ = top_reg) then
  2104. begin
  2105. { change
  2106. mov mem, %treg
  2107. mov %treg, %reg
  2108. to
  2109. mov mem, %reg"
  2110. }
  2111. taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
  2112. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
  2113. asml.remove(hp1);
  2114. hp1.free;
  2115. Result:=true;
  2116. Exit;
  2117. end;
  2118. else
  2119. ;
  2120. end
  2121. else
  2122. { %treg is used afterwards, but all eventualities
  2123. other than the first MOV instruction being a constant
  2124. are covered by DeepMOVOpt, so only check for that }
  2125. if (taicpu(p).oper[0]^.typ = top_const) and
  2126. (
  2127. { For MOV operations, a size saving is only made if the register/const is byte-sized }
  2128. not (cs_opt_size in current_settings.optimizerswitches) or
  2129. (taicpu(hp1).opsize = S_B)
  2130. ) and
  2131. (
  2132. (taicpu(hp1).oper[1]^.typ = top_reg) or
  2133. ((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
  2134. ) then
  2135. begin
  2136. DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
  2137. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  2138. end;
  2139. end;
  2140. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2141. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2142. { mov reg1, mem1 or mov mem1, reg1
  2143. mov mem2, reg2 mov reg2, mem2}
  2144. begin
  2145. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2146. { mov reg1, mem1 or mov mem1, reg1
  2147. mov mem2, reg1 mov reg2, mem1}
  2148. begin
  2149. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2150. { Removes the second statement from
  2151. mov reg1, mem1/reg2
  2152. mov mem1/reg2, reg1 }
  2153. begin
  2154. if taicpu(p).oper[0]^.typ=top_reg then
  2155. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2156. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
  2157. asml.remove(hp1);
  2158. hp1.free;
  2159. Result:=true;
  2160. exit;
  2161. end
  2162. else
  2163. begin
  2164. TransferUsedRegs(TmpUsedRegs);
  2165. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2166. if (taicpu(p).oper[1]^.typ = top_ref) and
  2167. { mov reg1, mem1
  2168. mov mem2, reg1 }
  2169. (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
  2170. GetNextInstruction(hp1, hp2) and
  2171. MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
  2172. OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2173. OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
  2174. not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
  2175. { change to
  2176. mov reg1, mem1 mov reg1, mem1
  2177. mov mem2, reg1 cmp reg1, mem2
  2178. cmp mem1, reg1
  2179. }
  2180. begin
  2181. asml.remove(hp2);
  2182. hp2.free;
  2183. taicpu(hp1).opcode := A_CMP;
  2184. taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
  2185. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2186. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2187. DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
  2188. end;
  2189. end;
  2190. end
  2191. else if (taicpu(p).oper[1]^.typ=top_ref) and
  2192. OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2193. begin
  2194. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2195. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2196. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
  2197. end
  2198. else
  2199. begin
  2200. TransferUsedRegs(TmpUsedRegs);
  2201. if GetNextInstruction(hp1, hp2) and
  2202. MatchOpType(taicpu(p),top_ref,top_reg) and
  2203. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2204. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2205. MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
  2206. MatchOpType(taicpu(hp2),top_ref,top_reg) and
  2207. RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2208. if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
  2209. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
  2210. { mov mem1, %reg1
  2211. mov %reg1, mem2
  2212. mov mem2, reg2
  2213. to:
  2214. mov mem1, reg2
  2215. mov reg2, mem2}
  2216. begin
  2217. AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
  2218. DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
  2219. taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
  2220. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  2221. asml.remove(hp2);
  2222. hp2.free;
  2223. end
  2224. {$ifdef i386}
  2225. { this is enabled for i386 only, as the rules to create the reg sets below
  2226. are too complicated for x86-64, so this makes this code too error prone
  2227. on x86-64
  2228. }
  2229. else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
  2230. not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
  2231. not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
  2232. { mov mem1, reg1 mov mem1, reg1
  2233. mov reg1, mem2 mov reg1, mem2
  2234. mov mem2, reg2 mov mem2, reg1
  2235. to: to:
  2236. mov mem1, reg1 mov mem1, reg1
  2237. mov mem1, reg2 mov reg1, mem2
  2238. mov reg1, mem2
  2239. or (if mem1 depends on reg1
  2240. and/or if mem2 depends on reg2)
  2241. to:
  2242. mov mem1, reg1
  2243. mov reg1, mem2
  2244. mov reg1, reg2
  2245. }
  2246. begin
  2247. taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
  2248. taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
  2249. taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
  2250. taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
  2251. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2252. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
  2253. (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2254. AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
  2255. if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  2256. (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2257. AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
  2258. end
  2259. else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
  2260. begin
  2261. taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
  2262. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2263. end
  2264. else
  2265. begin
  2266. asml.remove(hp2);
  2267. hp2.free;
  2268. end
  2269. {$endif i386}
  2270. ;
  2271. end;
  2272. end;
  2273. (* { movl [mem1],reg1
  2274. movl [mem1],reg2
  2275. to
  2276. movl [mem1],reg1
  2277. movl reg1,reg2
  2278. }
  2279. else if (taicpu(p).oper[0]^.typ = top_ref) and
  2280. (taicpu(p).oper[1]^.typ = top_reg) and
  2281. (taicpu(hp1).oper[0]^.typ = top_ref) and
  2282. (taicpu(hp1).oper[1]^.typ = top_reg) and
  2283. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2284. RefsEqual(TReference(taicpu(p).oper[0]^^),taicpu(hp1).oper[0]^^.ref^) and
  2285. (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^^.ref^.base) and
  2286. (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^^.ref^.index) then
  2287. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg)
  2288. else*)
  2289. { movl const1,[mem1]
  2290. movl [mem1],reg1
  2291. to
  2292. movl const1,reg1
  2293. movl reg1,[mem1]
  2294. }
  2295. if MatchOpType(Taicpu(p),top_const,top_ref) and
  2296. MatchOpType(Taicpu(hp1),top_ref,top_reg) and
  2297. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2298. RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
  2299. not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
  2300. begin
  2301. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2302. taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
  2303. taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
  2304. taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
  2305. taicpu(hp1).fileinfo := taicpu(p).fileinfo;
  2306. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
  2307. Result:=true;
  2308. exit;
  2309. end;
  2310. { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
  2311. end;
  2312. { search further than the next instruction for a mov }
  2313. if
  2314. { check as much as possible before the expensive GetNextInstructionUsingReg call }
  2315. (taicpu(p).oper[1]^.typ = top_reg) and
  2316. (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
  2317. not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) and
  2318. { we work with hp2 here, so hp1 can be still used later on when
  2319. checking for GetNextInstruction_p }
  2320. { GetNextInstructionUsingReg only searches one instruction ahead unless -O3 is specified }
  2321. GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[1]^.reg) and
  2322. MatchInstruction(hp2,A_MOV,[]) and
  2323. MatchOperand(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2324. ((taicpu(p).oper[0]^.typ=top_const) or
  2325. ((taicpu(p).oper[0]^.typ=top_reg) and
  2326. not(RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp2))
  2327. )
  2328. ) then
  2329. begin
  2330. { we have
  2331. mov x, %treg
  2332. mov %treg, y
  2333. }
  2334. TransferUsedRegs(TmpUsedRegs);
  2335. TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next));
  2336. { We don't need to call UpdateUsedRegs for every instruction between
  2337. p and hp2 because the register we're concerned about will not
  2338. become deallocated (otherwise GetNextInstructionUsingReg would
  2339. have stopped at an earlier instruction). [Kit] }
  2340. TempRegUsed :=
  2341. RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or
  2342. RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1);
  2343. case taicpu(p).oper[0]^.typ Of
  2344. top_reg:
  2345. begin
  2346. { change
  2347. mov %reg, %treg
  2348. mov %treg, y
  2349. to
  2350. mov %reg, y
  2351. }
  2352. CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
  2353. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2354. if taicpu(hp2).oper[1]^.reg = CurrentReg then
  2355. begin
  2356. { %reg = y - remove hp2 completely (doing it here instead of relying on
  2357. the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
  2358. if TempRegUsed then
  2359. begin
  2360. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2);
  2361. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2362. asml.remove(hp2);
  2363. hp2.Free;
  2364. end
  2365. else
  2366. begin
  2367. asml.remove(hp2);
  2368. hp2.Free;
  2369. { We can remove the original MOV too }
  2370. DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
  2371. RemoveCurrentP(p, hp1);
  2372. Result:=true;
  2373. Exit;
  2374. end;
  2375. end
  2376. else
  2377. begin
  2378. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2379. taicpu(hp2).loadReg(0, CurrentReg);
  2380. if TempRegUsed then
  2381. begin
  2382. { Don't remove the first instruction if the temporary register is in use }
  2383. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2);
  2384. { No need to set Result to True. If there's another instruction later on
  2385. that can be optimised, it will be detected when the main Pass 1 loop
  2386. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2387. end
  2388. else
  2389. begin
  2390. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
  2391. RemoveCurrentP(p, hp1);
  2392. Result:=true;
  2393. Exit;
  2394. end;
  2395. end;
  2396. end;
  2397. top_const:
  2398. if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
  2399. begin
  2400. { change
  2401. mov const, %treg
  2402. mov %treg, y
  2403. to
  2404. mov const, y
  2405. }
  2406. if (taicpu(hp2).oper[1]^.typ=top_reg) or
  2407. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2408. begin
  2409. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2410. taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
  2411. if TempRegUsed then
  2412. begin
  2413. { Don't remove the first instruction if the temporary register is in use }
  2414. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
  2415. { No need to set Result to True. If there's another instruction later on
  2416. that can be optimised, it will be detected when the main Pass 1 loop
  2417. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2418. end
  2419. else
  2420. begin
  2421. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
  2422. RemoveCurrentP(p, hp1);
  2423. Result:=true;
  2424. Exit;
  2425. end;
  2426. end;
  2427. end;
  2428. else
  2429. Internalerror(2019103001);
  2430. end;
  2431. end;
  2432. if (aoc_MovAnd2Mov_3 in OptsToCheck) and
  2433. (taicpu(p).oper[1]^.typ = top_reg) and
  2434. (taicpu(p).opsize = S_L) and
  2435. GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
  2436. (taicpu(hp2).opcode = A_AND) and
  2437. (MatchOpType(taicpu(hp2),top_const,top_reg) or
  2438. (MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2439. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
  2440. ) then
  2441. begin
  2442. if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
  2443. begin
  2444. if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
  2445. ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
  2446. begin
  2447. { Optimize out:
  2448. mov x, %reg
  2449. and ffffffffh, %reg
  2450. }
  2451. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
  2452. asml.remove(hp2);
  2453. hp2.free;
  2454. Result:=true;
  2455. exit;
  2456. end;
  2457. end;
  2458. end;
  2459. { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
  2460. x >= RetOffset) as it doesn't do anything (it writes either to a
  2461. parameter or to the temporary storage room for the function
  2462. result)
  2463. }
  2464. if IsExitCode(hp1) and
  2465. (taicpu(p).oper[1]^.typ = top_ref) and
  2466. (taicpu(p).oper[1]^.ref^.index = NR_NO) and
  2467. (
  2468. (
  2469. (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
  2470. not (
  2471. assigned(current_procinfo.procdef.funcretsym) and
  2472. (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
  2473. )
  2474. ) or
  2475. { Also discard writes to the stack that are below the base pointer,
  2476. as this is temporary storage rather than a function result on the
  2477. stack, say. }
  2478. (
  2479. (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
  2480. (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
  2481. )
  2482. ) then
  2483. begin
  2484. asml.remove(p);
  2485. p.free;
  2486. p:=hp1;
  2487. DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
  2488. RemoveLastDeallocForFuncRes(p);
  2489. Result:=true;
  2490. exit;
  2491. end;
  2492. if MatchOpType(taicpu(p),top_reg,top_ref) and
  2493. MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) and
  2494. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2495. RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2496. begin
  2497. { change
  2498. mov reg1, mem1
  2499. test/cmp x, mem1
  2500. to
  2501. mov reg1, mem1
  2502. test/cmp x, reg1
  2503. }
  2504. taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
  2505. DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
  2506. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2507. exit;
  2508. end;
  2509. if (taicpu(p).oper[1]^.typ = top_reg) and
  2510. (hp1.typ = ait_instruction) and
  2511. GetNextInstruction(hp1, hp2) and
  2512. MatchInstruction(hp2,A_MOV,[]) and
  2513. (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
  2514. (IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg) or
  2515. ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
  2516. IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ)))
  2517. ) then
  2518. begin
  2519. if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
  2520. (taicpu(hp2).oper[0]^.typ=top_reg) then
  2521. { change movsX/movzX reg/ref, reg2
  2522. add/sub/or/... reg3/$const, reg2
  2523. mov reg2 reg/ref
  2524. dealloc reg2
  2525. to
  2526. add/sub/or/... reg3/$const, reg/ref }
  2527. begin
  2528. TransferUsedRegs(TmpUsedRegs);
  2529. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2530. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2531. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2532. begin
  2533. { by example:
  2534. movswl %si,%eax movswl %si,%eax p
  2535. decl %eax addl %edx,%eax hp1
  2536. movw %ax,%si movw %ax,%si hp2
  2537. ->
  2538. movswl %si,%eax movswl %si,%eax p
  2539. decw %eax addw %edx,%eax hp1
  2540. movw %ax,%si movw %ax,%si hp2
  2541. }
  2542. DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
  2543. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2544. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2545. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2546. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2547. {
  2548. ->
  2549. movswl %si,%eax movswl %si,%eax p
  2550. decw %si addw %dx,%si hp1
  2551. movw %ax,%si movw %ax,%si hp2
  2552. }
  2553. case taicpu(hp1).ops of
  2554. 1:
  2555. begin
  2556. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2557. if taicpu(hp1).oper[0]^.typ=top_reg then
  2558. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2559. end;
  2560. 2:
  2561. begin
  2562. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2563. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2564. (taicpu(hp1).opcode<>A_SHL) and
  2565. (taicpu(hp1).opcode<>A_SHR) and
  2566. (taicpu(hp1).opcode<>A_SAR) then
  2567. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2568. end;
  2569. else
  2570. internalerror(2008042701);
  2571. end;
  2572. {
  2573. ->
  2574. decw %si addw %dx,%si p
  2575. }
  2576. asml.remove(hp2);
  2577. hp2.Free;
  2578. RemoveCurrentP(p, hp1);
  2579. Result:=True;
  2580. Exit;
  2581. end;
  2582. end;
  2583. if MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2584. not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
  2585. ((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
  2586. { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
  2587. ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
  2588. )
  2589. {$ifdef i386}
  2590. { byte registers of esi, edi, ebp, esp are not available on i386 }
  2591. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2592. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2593. {$endif i386}
  2594. then
  2595. { change movsX/movzX reg/ref, reg2
  2596. add/sub/or/... regX/$const, reg2
  2597. mov reg2, reg3
  2598. dealloc reg2
  2599. to
  2600. movsX/movzX reg/ref, reg3
  2601. add/sub/or/... reg3/$const, reg3
  2602. }
  2603. begin
  2604. TransferUsedRegs(TmpUsedRegs);
  2605. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2606. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2607. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2608. begin
  2609. { by example:
  2610. movswl %si,%eax movswl %si,%eax p
  2611. decl %eax addl %edx,%eax hp1
  2612. movw %ax,%si movw %ax,%si hp2
  2613. ->
  2614. movswl %si,%eax movswl %si,%eax p
  2615. decw %eax addw %edx,%eax hp1
  2616. movw %ax,%si movw %ax,%si hp2
  2617. }
  2618. DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
  2619. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2620. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2621. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2622. { limit size of constants as well to avoid assembler errors, but
  2623. check opsize to avoid overflow when left shifting the 1 }
  2624. if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
  2625. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
  2626. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2627. taicpu(p).changeopsize(taicpu(hp2).opsize);
  2628. if taicpu(p).oper[0]^.typ=top_reg then
  2629. setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2630. taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
  2631. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
  2632. {
  2633. ->
  2634. movswl %si,%eax movswl %si,%eax p
  2635. decw %si addw %dx,%si hp1
  2636. movw %ax,%si movw %ax,%si hp2
  2637. }
  2638. case taicpu(hp1).ops of
  2639. 1:
  2640. begin
  2641. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2642. if taicpu(hp1).oper[0]^.typ=top_reg then
  2643. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2644. end;
  2645. 2:
  2646. begin
  2647. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2648. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2649. (taicpu(hp1).opcode<>A_SHL) and
  2650. (taicpu(hp1).opcode<>A_SHR) and
  2651. (taicpu(hp1).opcode<>A_SAR) then
  2652. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2653. end;
  2654. else
  2655. internalerror(2018111801);
  2656. end;
  2657. {
  2658. ->
  2659. decw %si addw %dx,%si p
  2660. }
  2661. asml.remove(hp2);
  2662. hp2.Free;
  2663. end;
  2664. end;
  2665. end;
  2666. if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
  2667. GetNextInstruction(hp1, hp2) and
  2668. MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
  2669. MatchOperand(Taicpu(p).oper[0]^,0) and
  2670. (Taicpu(p).oper[1]^.typ = top_reg) and
  2671. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
  2672. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
  2673. { mov reg1,0
  2674. bts reg1,operand1 --> mov reg1,operand2
  2675. or reg1,operand2 bts reg1,operand1}
  2676. begin
  2677. Taicpu(hp2).opcode:=A_MOV;
  2678. asml.remove(hp1);
  2679. insertllitem(hp2,hp2.next,hp1);
  2680. asml.remove(p);
  2681. p.free;
  2682. p:=hp1;
  2683. Result:=true;
  2684. exit;
  2685. end;
  2686. if MatchInstruction(hp1,A_LEA,[S_L]) and
  2687. MatchOpType(Taicpu(p),top_ref,top_reg) and
  2688. ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
  2689. (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
  2690. ) or
  2691. (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
  2692. (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
  2693. )
  2694. ) then
  2695. { mov reg1,ref
  2696. lea reg2,[reg1,reg2]
  2697. to
  2698. add reg2,ref}
  2699. begin
  2700. TransferUsedRegs(TmpUsedRegs);
  2701. { reg1 may not be used afterwards }
  2702. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
  2703. begin
  2704. Taicpu(hp1).opcode:=A_ADD;
  2705. Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
  2706. DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
  2707. asml.remove(p);
  2708. p.free;
  2709. p:=hp1;
  2710. result:=true;
  2711. exit;
  2712. end;
  2713. end;
  2714. end;
{ Pass-1 peephole optimisation for two-operand "movXX" style instructions
  (e.g. vector/FP moves). Looks for a pair of identical-opcode, identical-size
  moves that copy a value somewhere and straight back again:

      movXX op1, op2
      movXX op2, op1

  The second instruction is always redundant and is removed
  (MovXXMovXX2MovXX). If, additionally, op2 is a register that is not used
  after the pair, the first instruction is dead as well and both are removed
  (MovXXMovXX2Nop). Returns True when anything was changed; p is advanced
  past any instruction that was freed. }
function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
  var
    hp1 : tai;  { the instruction following p }
  begin
    Result:=false;
    { Only plain two-operand moves are of interest }
    if taicpu(p).ops <> 2 then
      exit;
    if GetNextInstruction(p,hp1) and
      { next instruction must be the same opcode with the same operand size }
      MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
      (taicpu(hp1).ops = 2) then
      begin
        if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
          (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
          { movXX reg1, mem1     or     movXX mem1, reg1
            movXX mem2, reg2            movXX reg2, mem2 }
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
              { movXX reg1, mem1     or     movXX mem1, reg1
                movXX mem2, reg1            movXX reg2, mem1 }
              begin
                if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                  begin
                    { Removes the second statement from
                      movXX reg1, mem1/reg2
                      movXX mem1/reg2, reg1
                    }
                    if taicpu(p).oper[0]^.typ=top_reg then
                      { keep reg1 marked as allocated across the pair, since
                        its value is still live after hp1 is deleted }
                      AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                    { Removes the second statement from
                      movXX mem1/reg1, reg2
                      movXX reg2, mem1/reg1
                    }
                    if (taicpu(p).oper[1]^.typ=top_reg) and
                      not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
                      { reg2 is dead afterwards, so the first move is dead
                        too: drop p and advance it past hp1 (which is removed
                        below for both branches) }
                      begin
                        asml.remove(p);
                        p.free;
                        GetNextInstruction(hp1,p);
                        DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
                      end
                    else
                      DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
                    { in either case the second (round-trip) move goes away }
                    asml.remove(hp1);
                    hp1.free;
                    Result:=true;
                    exit;
                  end
              end;
          end;
      end;
  end;
  2766. function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
  2767. var
  2768. hp1 : tai;
  2769. begin
  2770. result:=false;
  2771. { replace
  2772. <Op>X %mreg1,%mreg2 // Op in [ADD,MUL]
  2773. MovX %mreg2,%mreg1
  2774. dealloc %mreg2
  2775. by
  2776. <Op>X %mreg2,%mreg1
  2777. ?
  2778. }
  2779. if GetNextInstruction(p,hp1) and
  2780. { we mix single and double opperations here because we assume that the compiler
  2781. generates vmovapd only after double operations and vmovaps only after single operations }
  2782. MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
  2783. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2784. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
  2785. (taicpu(p).oper[0]^.typ=top_reg) then
  2786. begin
  2787. TransferUsedRegs(TmpUsedRegs);
  2788. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2789. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2790. begin
  2791. taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
  2792. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2793. DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
  2794. asml.Remove(hp1);
  2795. hp1.Free;
  2796. result:=true;
  2797. end;
  2798. end;
  2799. end;
    function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
      { Pass-1 peephole for LEA. Applies, in order:
          - drop useless segment prefixes
          - lea (%reg1),%reg2        -> mov %reg1,%reg2 / nop / inc/dec/add/sub
          - lea x,%reg1; mov %reg1,%reg2 -> lea x,%reg2 (if %reg1 dies)
          - lea;lea on the same register -> one combined lea
          - lea <ref>,%reg1; <op> ...(%reg1)... -> <op> ...<ref>...
        Returns True if p was replaced/removed. }
      var
        hp1, hp2, hp3: tai;
        l : ASizeInt;
        ref: Integer;
        saveref: treference;
      begin
        Result:=false;
        { removes seg register prefixes from LEA operations, as they
          don't do anything}
        taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
        { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
        if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
           (taicpu(p).oper[0]^.ref^.index = NR_NO) and
           { do not mess with leas acessing the stack pointer }
           (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
           (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
          begin
            if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
               (taicpu(p).oper[0]^.ref^.offset = 0) then
              begin
                hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
                  taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous,p.next, hp1);
                DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
                p.free;
                p:=hp1;
                Result:=true;
                exit;
              end
            else if (taicpu(p).oper[0]^.ref^.offset = 0) then
              begin
                { "lea (%reg1),%reg1" does nothing at all }
                DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
                RemoveCurrentP(p);
                Result:=true;
                exit;
              end
            { continue to use lea to adjust the stack pointer,
              it is the recommended way, but only if not optimizing for size }
            else if (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
               (cs_opt_size in current_settings.optimizerswitches) then
              with taicpu(p).oper[0]^.ref^ do
                if (base = taicpu(p).oper[1]^.reg) then
                  begin
                    { "lea x(%reg1),%reg1" is a plain constant adjustment }
                    l:=offset;
                    if (l=1) and UseIncDec then
                      begin
                        taicpu(p).opcode:=A_INC;
                        taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                        taicpu(p).ops:=1;
                        DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
                      end
                    else if (l=-1) and UseIncDec then
                      begin
                        taicpu(p).opcode:=A_DEC;
                        taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                        taicpu(p).ops:=1;
                        DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
                      end
                    else
                      begin
                        { -(-2147483648) does not fit into a longint, keep ADD then }
                        if (l<0) and (l<>-2147483648) then
                          begin
                            taicpu(p).opcode:=A_SUB;
                            taicpu(p).loadConst(0,-l);
                            DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
                          end
                        else
                          begin
                            taicpu(p).opcode:=A_ADD;
                            taicpu(p).loadConst(0,l);
                            DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
                          end;
                      end;
                    Result:=true;
                    exit;
                  end;
          end;
        { lea x,%reg1; mov %reg1,%reg2 -> lea x,%reg2, if %reg1 dies with the mov }
        if GetNextInstruction(p,hp1) and
           MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
           MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
           MatchOpType(Taicpu(hp1),top_reg,top_reg) and
           (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
                asml.Remove(hp1);
                hp1.Free;
                result:=true;
              end;
          end;
        { changes
            lea offset1(regX), reg1
            lea offset2(reg1), reg1
          to
            lea offset1+offset2(regX), reg1 }
        { for now, we do not mess with the stack pointer, thought it might be usefull to remove
          unneeded lea sequences on the stack pointer, it needs to be tested in detail }
        if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
           GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
           MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
           MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
           (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
           (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
           (taicpu(p).oper[0]^.ref^.symbol=nil) and
           { one of three combinable shapes: reg1 is hp1's base (matching
             index/scale), reg1 is hp1's index, or reg1 is hp1's unscaled base
             while p has no base of its own }
           (((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
             (taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) and
             (taicpu(p).oper[0]^.ref^.index=NR_NO) and
             (taicpu(p).oper[0]^.ref^.index=taicpu(hp1).oper[0]^.ref^.index) and
             (taicpu(p).oper[0]^.ref^.scalefactor=taicpu(hp1).oper[0]^.ref^.scalefactor)
            ) or
            ((taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg) and
             (taicpu(p).oper[0]^.ref^.index=NR_NO)
            ) or
            ((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
             (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) and
             (taicpu(p).oper[0]^.ref^.base=NR_NO) and
             not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1)))
           ) and
           { the registers feeding the first lea must keep their value up to hp1 }
           not(RegUsedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1)) and
           (taicpu(p).oper[0]^.ref^.relsymbol=taicpu(hp1).oper[0]^.ref^.relsymbol) and
           (taicpu(p).oper[0]^.ref^.segment=taicpu(hp1).oper[0]^.ref^.segment) and
           (taicpu(p).oper[0]^.ref^.symbol=taicpu(hp1).oper[0]^.ref^.symbol) then
          begin
            DebugMsg(SPeepholeOptimization + 'LeaLea2Lea done',p);
            if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
              begin
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                { the first lea's offset is scaled because reg1 is used as
                  (scaled) index in the second lea }
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                { if the register is used as index and base, we have to increase for base as well
                  and adapt base }
                if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                  begin
                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                  end;
              end
            else
              begin
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
              end;
            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
              begin
                { move hp1's (unscaled) index to the base slot so p's scaled
                  index and scale factor can take its place }
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
              end;
            RemoveCurrentP(p);
            result:=true;
            exit;
          end;
        { changes
            lea <ref1>, reg1
            <op> ...,<ref. with reg1>,...
          to
            <op> ...,<ref1>,... }
        if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
           (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
           GetNextInstruction(p,hp1) and
           (hp1.typ=ait_instruction) and
           not(MatchInstruction(hp1,A_LEA,[])) then
          begin
            { find a reference which uses reg1 }
            if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
              ref:=0
            else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
              ref:=1
            else
              ref:=-1;
            if (ref<>-1) and
               { reg1 must be either the base or the index }
               ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
              begin
                { reg1 can be removed from the reference }
                saveref:=taicpu(hp1).oper[ref]^.ref^;
                if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
                else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
                else
                  Internalerror(2019111201);
                { check if the can insert all data of the lea into the second instruction }
                if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                   ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
                   ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
                   ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
                   ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
                   ((taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                   (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
    {$ifdef x86_64}
                   and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
                   and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                        ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                       )
    {$endif x86_64}
                   then
                  begin
                    { reg1 might not used by the second instruction after it is remove from the reference }
                    if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                      begin
                        TransferUsedRegs(TmpUsedRegs);
                        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                        { reg1 is not updated so it might not be used afterwards }
                        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                            if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                            if taicpu(p).oper[0]^.ref^.symbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                            if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                            if not(taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
                              taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                            inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                            RemoveCurrentP(p, hp1);
                            result:=true;
                            exit;
                          end
                      end;
                  end;
                { recover }
                taicpu(hp1).oper[ref]^.ref^:=saveref;
              end;
          end;
      end;
    function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
      { Folds the instruction immediately preceding "sub const,reg" at p
        (a matching dec/sub/add on the same register and size) into p's
        constant. Returns True only when p itself was removed because the
        combined constant became 0 - the caller must then restart at p. }
      var
        hp1 : tai;
      begin
        DoSubAddOpt := False;
        if GetLastInstruction(p, hp1) and
           (hp1.typ = ait_instruction) and
           (taicpu(hp1).opsize = taicpu(p).opsize) then
          case taicpu(hp1).opcode Of
            A_DEC:
              { dec reg; sub const,reg -> sub const+1,reg }
              if (taicpu(hp1).oper[0]^.typ = top_reg) and
                MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
                  asml.remove(hp1);
                  hp1.free;
                end;
            A_SUB:
              { sub const1,reg; sub const2,reg -> sub const1+const2,reg }
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                begin
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
                  asml.remove(hp1);
                  hp1.free;
                end;
            A_ADD:
              begin
                { add const1,reg; sub const2,reg -> sub const2-const1,reg }
                if MatchOpType(taicpu(hp1),top_const,top_reg) and
                  MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                  begin
                    taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
                    asml.remove(hp1);
                    hp1.free;
                    { the combined constant may be 0: then the sub is a no-op
                      and is removed too; p is repositioned to the previous
                      instruction (or the next one at the list head) }
                    if (taicpu(p).oper[0]^.val = 0) then
                      begin
                        hp1 := tai(p.next);
                        asml.remove(p);
                        p.free;
                        if not GetLastInstruction(hp1, p) then
                          p := hp1;
                        DoSubAddOpt := True;
                      end
                  end;
              end;
            else
              ;
          end;
      end;
  3081. function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  3082. {$ifdef i386}
  3083. var
  3084. hp1 : tai;
  3085. {$endif i386}
  3086. begin
  3087. Result:=false;
  3088. { * change "subl $2, %esp; pushw x" to "pushl x"}
  3089. { * change "sub/add const1, reg" or "dec reg" followed by
  3090. "sub const2, reg" to one "sub ..., reg" }
  3091. if MatchOpType(taicpu(p),top_const,top_reg) then
  3092. begin
  3093. {$ifdef i386}
  3094. if (taicpu(p).oper[0]^.val = 2) and
  3095. (taicpu(p).oper[1]^.reg = NR_ESP) and
  3096. { Don't do the sub/push optimization if the sub }
  3097. { comes from setting up the stack frame (JM) }
  3098. (not(GetLastInstruction(p,hp1)) or
  3099. not(MatchInstruction(hp1,A_MOV,[S_L]) and
  3100. MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
  3101. MatchOperand(taicpu(hp1).oper[0]^,NR_EBP))) then
  3102. begin
  3103. hp1 := tai(p.next);
  3104. while Assigned(hp1) and
  3105. (tai(hp1).typ in [ait_instruction]+SkipInstr) and
  3106. not RegReadByInstruction(NR_ESP,hp1) and
  3107. not RegModifiedByInstruction(NR_ESP,hp1) do
  3108. hp1 := tai(hp1.next);
  3109. if Assigned(hp1) and
  3110. MatchInstruction(hp1,A_PUSH,[S_W]) then
  3111. begin
  3112. taicpu(hp1).changeopsize(S_L);
  3113. if taicpu(hp1).oper[0]^.typ=top_reg then
  3114. setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
  3115. hp1 := tai(p.next);
  3116. asml.remove(p);
  3117. p.free;
  3118. p := hp1;
  3119. Result:=true;
  3120. exit;
  3121. end;
  3122. end;
  3123. {$endif i386}
  3124. if DoSubAddOpt(p) then
  3125. Result:=true;
  3126. end;
  3127. end;
    function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
      { Pass-1 peephole for shl/sal by a small constant: gathers following
        add/sub/inc/dec/lea instructions on the same register into a single
        lea with a scale factor (and, pre-Pentium2, rewrites small shifts
        into add/lea for pairing reasons). }
      var
        TmpBool1,TmpBool2 : Boolean;
        tmpref : treference;
        hp1,hp2: tai;
      begin
        Result:=false;
        if MatchOpType(taicpu(p),top_const,top_reg) and
           (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
           (taicpu(p).oper[0]^.val <= 3) then
          { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
          begin
            { should we check the next instruction? }
            TmpBool1 := True;
            { have we found an add/sub which could be
              integrated in the lea? }
            TmpBool2 := False;
            reference_reset(tmpref,2,[]);
            TmpRef.index := taicpu(p).oper[1]^.reg;
            { a shift by 1..3 corresponds to a scale factor of 2/4/8 }
            TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
            while TmpBool1 and
                  GetNextInstruction(p, hp1) and
                  (tai(hp1).typ = ait_instruction) and
                  ((((taicpu(hp1).opcode = A_ADD) or
                     (taicpu(hp1).opcode = A_SUB)) and
                    (taicpu(hp1).oper[1]^.typ = Top_Reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
                   (((taicpu(hp1).opcode = A_INC) or
                     (taicpu(hp1).opcode = A_DEC)) and
                    (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                    (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
                   ((taicpu(hp1).opcode = A_LEA) and
                    (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
                  { lea does not set the flags the folded instructions would
                    have set, so nothing may read them afterwards }
                  (not GetNextInstruction(hp1,hp2) or
                   not instrReadsFlags(hp2)) Do
              begin
                TmpBool1 := False;
                if taicpu(hp1).opcode=A_LEA then
                  begin
                    { fold a follow-up lea into the reference under construction,
                      provided the combined scale stays legal (<=8) }
                    if (TmpRef.base = NR_NO) and
                       (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                       (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                       (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
                       ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                        (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                      begin
                        TmpBool1 := True;
                        TmpBool2 := True;
                        inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                        if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                          tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                        TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                        asml.remove(hp1);
                        hp1.free;
                      end
                  end
                else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
                  begin
                    { fold add/sub of a constant into the offset }
                    TmpBool1 := True;
                    TmpBool2 := True;
                    case taicpu(hp1).opcode of
                      A_ADD:
                        inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                      A_SUB:
                        dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                      else
                        internalerror(2019050536);
                    end;
                    asml.remove(hp1);
                    hp1.free;
                  end
                else
                  { fold "add %reg" as the base register (only if the base slot
                    is still free), or inc/dec as offset +/- 1 }
                  if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                     (((taicpu(hp1).opcode = A_ADD) and
                       (TmpRef.base = NR_NO)) or
                      (taicpu(hp1).opcode = A_INC) or
                      (taicpu(hp1).opcode = A_DEC)) then
                    begin
                      TmpBool1 := True;
                      TmpBool2 := True;
                      case taicpu(hp1).opcode of
                        A_ADD:
                          TmpRef.base := taicpu(hp1).oper[0]^.reg;
                        A_INC:
                          inc(TmpRef.offset);
                        A_DEC:
                          dec(TmpRef.offset);
                        else
                          internalerror(2019050535);
                      end;
                      asml.remove(hp1);
                      hp1.free;
                    end;
              end;
            { emit the combined lea if anything was folded, or (on 32 bit,
              pre-Pentium2, not optimizing for size) even for the bare shift }
            if TmpBool2
    {$ifndef x86_64}
               or
               ((current_settings.optimizecputype < cpu_Pentium2) and
                (taicpu(p).oper[0]^.val <= 3) and
                not(cs_opt_size in current_settings.optimizerswitches))
    {$endif x86_64}
              then
              begin
                if not(TmpBool2) and
                   (taicpu(p).oper[0]^.val=1) then
                  begin
                    { "shl $1,%reg" on its own is best done as "add %reg,%reg" }
                    hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                      taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
                  end
                else
                  hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
                    taicpu(p).oper[1]^.reg);
                DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end;
          end
    {$ifndef x86_64}
        else if (current_settings.optimizecputype < cpu_Pentium2) and
           MatchOpType(taicpu(p),top_const,top_reg) then
          begin
            { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
              but faster on a 486, and Tairable in both U and V pipes on the Pentium
              (unlike shl, which is only Tairable in the U pipe) }
            if taicpu(p).oper[0]^.val=1 then
              begin
                hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                  taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end
            { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
              "shl $3, %reg" to "lea (,%reg,8), %reg }
            else if (taicpu(p).opsize = S_L) and
               (taicpu(p).oper[0]^.val<= 3) then
              begin
                reference_reset(tmpref,2,[]);
                TmpRef.index := taicpu(p).oper[1]^.reg;
                TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
                hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end;
          end
    {$endif x86_64}
          ;
      end;
    function TX86AsmOptimizer.OptPass1SETcc(var p: tai): boolean;
      { Pass-1 peephole for SETcc: collapses
          set(C) %reg; test %reg,%reg / cmp $0,%reg; j(e|ne) label
        into a single conditional jump on (C) or its inverse. }
      var
        hp1,hp2,next: tai; SetC, JumpC: TAsmCond; Unconditional: Boolean;
      begin
        Result:=false;
        if MatchOpType(taicpu(p),top_reg) and
           GetNextInstruction(p, hp1) and
           { the middle instruction must test exactly the byte the setcc wrote }
           ((MatchInstruction(hp1, A_TEST, [S_B]) and
             MatchOpType(taicpu(hp1),top_reg,top_reg) and
             (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg)) or
            (MatchInstruction(hp1, A_CMP, [S_B]) and
             MatchOpType(taicpu(hp1),top_const,top_reg) and
             (taicpu(hp1).oper[0]^.val=0))
           ) and
           (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
           GetNextInstruction(hp1, hp2) and
           MatchInstruction(hp2, A_Jcc, []) then
          { Change from:             To:
              set(C) %reg              j(~C) label
              test   %reg,%reg/cmp $0,%reg
              je     label

              set(C) %reg              j(C)  label
              test   %reg,%reg/cmp $0,%reg
              jne    label
          }
          begin
            next := tai(p.Next);
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, next);
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
            JumpC := taicpu(hp2).condition;
            Unconditional := False;
            { je branches when the setcc wrote 0, i.e. when (C) is false ->
              invert; jne branches when (C) is true -> keep }
            if conditions_equal(JumpC, C_E) then
              SetC := inverse_cond(taicpu(p).condition)
            else if conditions_equal(JumpC, C_NE) then
              SetC := taicpu(p).condition
            else
              { We've got something weird here (and inefficent) }
              begin
                DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                SetC := C_NONE;
                { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                if condition_in(C_AE, JumpC) then
                  Unconditional := True
                else
                  { Not sure what to do with this jump - drop out }
                  Exit;
              end;
            { the test/cmp is no longer needed }
            asml.Remove(hp1);
            hp1.Free;
            if Unconditional then
              MakeUnconditional(taicpu(hp2))
            else
              begin
                if SetC = C_NONE then
                  InternalError(2018061401);
                taicpu(hp2).SetCondition(SetC);
              end;
            { the setcc itself can only go if its target register dies here }
            if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then
              begin
                asml.Remove(p);
                UpdateUsedRegs(next);
                p.Free;
                Result := True;
                p := hp2;
              end;
            DebugMsg(SPeepholeOptimization + 'SETcc/TESTCmp/Jcc -> Jcc',p);
          end;
      end;
  3348. function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
  3349. { returns true if a "continue" should be done after this optimization }
  3350. var
  3351. hp1, hp2: tai;
  3352. begin
  3353. Result := false;
  3354. if MatchOpType(taicpu(p),top_ref) and
  3355. GetNextInstruction(p, hp1) and
  3356. (hp1.typ = ait_instruction) and
  3357. (((taicpu(hp1).opcode = A_FLD) and
  3358. (taicpu(p).opcode = A_FSTP)) or
  3359. ((taicpu(p).opcode = A_FISTP) and
  3360. (taicpu(hp1).opcode = A_FILD))) and
  3361. MatchOpType(taicpu(hp1),top_ref) and
  3362. (taicpu(hp1).opsize = taicpu(p).opsize) and
  3363. RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
  3364. begin
  3365. { replacing fstp f;fld f by fst f is only valid for extended because of rounding }
  3366. if (taicpu(p).opsize=S_FX) and
  3367. GetNextInstruction(hp1, hp2) and
  3368. (hp2.typ = ait_instruction) and
  3369. IsExitCode(hp2) and
  3370. (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
  3371. not(assigned(current_procinfo.procdef.funcretsym) and
  3372. (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
  3373. (taicpu(p).oper[0]^.ref^.index = NR_NO) then
  3374. begin
  3375. asml.remove(p);
  3376. asml.remove(hp1);
  3377. p.free;
  3378. hp1.free;
  3379. p := hp2;
  3380. RemoveLastDeallocForFuncRes(p);
  3381. Result := true;
  3382. end
  3383. (* can't be done because the store operation rounds
  3384. else
  3385. { fst can't store an extended value! }
  3386. if (taicpu(p).opsize <> S_FX) and
  3387. (taicpu(p).opsize <> S_IQ) then
  3388. begin
  3389. if (taicpu(p).opcode = A_FSTP) then
  3390. taicpu(p).opcode := A_FST
  3391. else taicpu(p).opcode := A_FIST;
  3392. asml.remove(hp1);
  3393. hp1.free;
  3394. end
  3395. *)
  3396. end;
  3397. end;
    function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
      { Pass-1 peephole for FLD: merges an fld (of a register or of a memory
        operand) with a following popping x87 arithmetic instruction into the
        non-popping form, avoiding the extra stack slot. }
      var
        hp1, hp2: tai;
      begin
        result:=false;
        if MatchOpType(taicpu(p),top_reg) and
           GetNextInstruction(p, hp1) and
           (hp1.typ = Ait_Instruction) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
           (taicpu(hp1).oper[0]^.reg = NR_ST) and
           (taicpu(hp1).oper[1]^.reg = NR_ST1) then
          { change to
              fld      reg               fxxx reg,st
              fxxxp    st, st1 (hp1)
            Remark: non commutative operations must be reversed!
          }
          begin
            case taicpu(hp1).opcode Of
              A_FMULP,A_FADDP,
              A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
                begin
                  { popping -> non-popping form; sub/div swap operand order
                    once the fld is gone, hence the plain <-> R exchange }
                  case taicpu(hp1).opcode Of
                    A_FADDP: taicpu(hp1).opcode := A_FADD;
                    A_FMULP: taicpu(hp1).opcode := A_FMUL;
                    A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
                    A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
                    A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
                    A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
                    else
                      internalerror(2019050534);
                  end;
                  taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
                  taicpu(hp1).oper[1]^.reg := NR_ST;
                  asml.remove(p);
                  p.free;
                  p := hp1;
                  Result:=true;
                  exit;
                end;
              else
                ;
            end;
          end
        else
          if MatchOpType(taicpu(p),top_ref) and
             GetNextInstruction(p, hp2) and
             (hp2.typ = Ait_Instruction) and
             MatchOpType(taicpu(hp2),top_reg,top_reg) and
             (taicpu(p).opsize in [S_FS, S_FL]) and
             (taicpu(hp2).oper[0]^.reg = NR_ST) and
             (taicpu(hp2).oper[1]^.reg = NR_ST1) then
            { does the instruction before the fld load/store the same memory
              location? then the reload can be avoided }
            if GetLastInstruction(p, hp1) and
               MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
               MatchOpType(taicpu(hp1),top_ref) and
               RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
              if ((taicpu(hp2).opcode = A_FMULP) or
                  (taicpu(hp2).opcode = A_FADDP)) then
                { change to
                    fld/fst   mem1  (hp1)      fld/fst   mem1
                    fld       mem1  (p)        fadd/
                    faddp/                     fmul     st, st
                    fmulp  st, st1 (hp2) }
                begin
                  asml.remove(p);
                  p.free;
                  p := hp1;
                  if (taicpu(hp2).opcode = A_FADDP) then
                    taicpu(hp2).opcode := A_FADD
                  else
                    taicpu(hp2).opcode := A_FMUL;
                  taicpu(hp2).oper[1]^.reg := NR_ST;
                end
              else
                { change to
                    fld/fst  mem1 (hp1)   fld/fst  mem1
                    fld      mem1 (p)     fld      st}
                begin
                  taicpu(p).changeopsize(S_FL);
                  taicpu(p).loadreg(0,NR_ST);
                end
            else
              begin
                case taicpu(hp2).opcode Of
                  A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
                    { change to
                        fld/fst mem1 (hp1)      fld/fst    mem1
                        fld     mem2 (p)        fxxx       mem2
                        fxxxp   st, st1 (hp2) }
                    begin
                      { as above, non commutative operations are reversed }
                      case taicpu(hp2).opcode Of
                        A_FADDP: taicpu(p).opcode := A_FADD;
                        A_FMULP: taicpu(p).opcode := A_FMUL;
                        A_FSUBP: taicpu(p).opcode := A_FSUBR;
                        A_FSUBRP: taicpu(p).opcode := A_FSUB;
                        A_FDIVP: taicpu(p).opcode := A_FDIVR;
                        A_FDIVRP: taicpu(p).opcode := A_FDIV;
                        else
                          internalerror(2019050533);
                      end;
                      asml.remove(hp2);
                      hp2.free;
                    end
                  else
                    ;
                end
              end
      end;
  3505. function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
  3506. var
  3507. v: TCGInt;
  3508. hp1, hp2: tai;
  3509. begin
  3510. Result:=false;
  3511. if taicpu(p).oper[0]^.typ = top_const then
  3512. begin
  3513. { Though GetNextInstruction can be factored out, it is an expensive
  3514. call, so delay calling it until we have first checked cheaper
  3515. conditions that are independent of it. }
  3516. if (taicpu(p).oper[0]^.val = 0) and
  3517. (taicpu(p).oper[1]^.typ = top_reg) and
  3518. GetNextInstruction(p, hp1) and
  3519. MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
  3520. begin
  3521. hp2 := p;
  3522. { When dealing with "cmp $0,%reg", only ZF and SF contain
  3523. anything meaningful once it's converted to "test %reg,%reg";
  3524. additionally, some jumps will always (or never) branch, so
  3525. evaluate every jump immediately following the
  3526. comparison, optimising the conditions if possible.
  3527. Similarly with SETcc... those that are always set to 0 or 1
  3528. are changed to MOV instructions }
  3529. while GetNextInstruction(hp2, hp1) and
  3530. MatchInstruction(hp1,A_Jcc,A_SETcc,[]) do
  3531. begin
  3532. case taicpu(hp1).condition of
  3533. C_B, C_C, C_NAE, C_O:
  3534. { For B/NAE:
  3535. Will never branch since an unsigned integer can never be below zero
  3536. For C/O:
  3537. Result cannot overflow because 0 is being subtracted
  3538. }
  3539. begin
  3540. if taicpu(hp1).opcode = A_Jcc then
  3541. begin
  3542. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
  3543. TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
  3544. AsmL.Remove(hp1);
  3545. hp1.Free;
  3546. { Since hp1 was deleted, hp2 must not be updated }
  3547. Continue;
  3548. end
  3549. else
  3550. begin
  3551. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
  3552. { Convert "set(c) %reg" instruction to "movb 0,%reg" }
  3553. taicpu(hp1).opcode := A_MOV;
  3554. taicpu(hp1).ops := 2;
  3555. taicpu(hp1).condition := C_None;
  3556. taicpu(hp1).opsize := S_B;
  3557. taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
  3558. taicpu(hp1).loadconst(0, 0);
  3559. end;
  3560. end;
  3561. C_BE, C_NA:
  3562. begin
  3563. { Will only branch if equal to zero }
  3564. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
  3565. taicpu(hp1).condition := C_E;
  3566. end;
  3567. C_A, C_NBE:
  3568. begin
  3569. { Will only branch if not equal to zero }
  3570. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
  3571. taicpu(hp1).condition := C_NE;
  3572. end;
  3573. C_AE, C_NB, C_NC, C_NO:
  3574. begin
  3575. { Will always branch }
  3576. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
  3577. if taicpu(hp1).opcode = A_Jcc then
  3578. begin
  3579. MakeUnconditional(taicpu(hp1));
  3580. { Any jumps/set that follow will now be dead code }
  3581. RemoveDeadCodeAfterJump(taicpu(hp1));
  3582. Break;
  3583. end
  3584. else
  3585. begin
  3586. { Convert "set(c) %reg" instruction to "movb 1,%reg" }
  3587. taicpu(hp1).opcode := A_MOV;
  3588. taicpu(hp1).ops := 2;
  3589. taicpu(hp1).condition := C_None;
  3590. taicpu(hp1).opsize := S_B;
  3591. taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
  3592. taicpu(hp1).loadconst(0, 1);
  3593. end;
  3594. end;
  3595. C_None:
  3596. InternalError(2020012201);
  3597. C_P, C_PE, C_NP, C_PO:
  3598. { We can't handle parity checks and they should never be generated
  3599. after a general-purpose CMP (it's used in some floating-point
  3600. comparisons that don't use CMP) }
  3601. InternalError(2020012202);
  3602. else
  3603. { Zero/Equality, Sign, their complements and all of the
  3604. signed comparisons do not need to be converted };
  3605. end;
  3606. hp2 := hp1;
  3607. end;
  3608. { Convert the instruction to a TEST }
  3609. taicpu(p).opcode := A_TEST;
  3610. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  3611. Result := True;
  3612. Exit;
  3613. end
  3614. else if (taicpu(p).oper[0]^.val = 1) and
  3615. GetNextInstruction(p, hp1) and
  3616. MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
  3617. (taicpu(hp1).condition in [C_L, C_NGE]) then
  3618. begin
  3619. { Convert; To:
  3620. cmp $1,r/m cmp $0,r/m
  3621. jl @lbl jle @lbl
  3622. }
  3623. DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
  3624. taicpu(p).oper[0]^.val := 0;
  3625. taicpu(hp1).condition := C_LE;
  3626. { If the instruction is now "cmp $0,%reg", convert it to a
  3627. TEST (and effectively do the work of the "cmp $0,%reg" in
  3628. the block above)
  3629. If it's a reference, we can get away with not setting
Result to True because we haven't evaluated the jump
  3631. in this pass yet.
  3632. }
  3633. if (taicpu(p).oper[1]^.typ = top_reg) then
  3634. begin
  3635. taicpu(p).opcode := A_TEST;
  3636. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  3637. Result := True;
  3638. end;
  3639. Exit;
  3640. end
  3641. else if (taicpu(p).oper[1]^.typ = top_reg) then
  3642. begin
  3643. { cmp register,$8000 neg register
  3644. je target --> jo target
  3645. .... only if register is deallocated before jump.}
  3646. case Taicpu(p).opsize of
  3647. S_B: v:=$80;
  3648. S_W: v:=$8000;
  3649. S_L: v:=qword($80000000);
  3650. { S_Q will never happen: cmp with 64 bit constants is not possible }
  3651. S_Q:
  3652. Exit;
  3653. else
  3654. internalerror(2013112905);
  3655. end;
  3656. if (taicpu(p).oper[0]^.val=v) and
  3657. GetNextInstruction(p, hp1) and
  3658. MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
  3659. (Taicpu(hp1).condition in [C_E,C_NE]) then
  3660. begin
  3661. TransferUsedRegs(TmpUsedRegs);
  3662. UpdateUsedRegs(TmpUsedRegs,tai(p.next));
  3663. if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
  3664. begin
  3665. DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
  3666. Taicpu(p).opcode:=A_NEG;
  3667. Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
  3668. Taicpu(p).clearop(1);
  3669. Taicpu(p).ops:=1;
  3670. if Taicpu(hp1).condition=C_E then
  3671. Taicpu(hp1).condition:=C_O
  3672. else
  3673. Taicpu(hp1).condition:=C_NO;
  3674. Result:=true;
  3675. exit;
  3676. end;
  3677. end;
  3678. end;
  3679. end;
  3680. end;
  3681. function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
  3682. var
  3683. hp1: tai;
  3684. begin
  3685. {
  3686. remove the second (v)pxor from
  3687. (v)pxor reg,reg
  3688. ...
  3689. (v)pxor reg,reg
  3690. }
  3691. Result:=false;
  3692. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
  3693. MatchOpType(taicpu(p),top_reg,top_reg) and
  3694. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
  3695. MatchInstruction(taicpu(hp1),taicpu(p).opcode,[taicpu(p).opsize]) and
  3696. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
  3697. MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
  3698. begin
  3699. DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
  3700. asml.Remove(hp1);
  3701. hp1.Free;
  3702. Result:=true;
  3703. Exit;
  3704. end;
  3705. end;
{ Pass-2 optimisations triggered by a MOV instruction. Handles, in order:
    Mov+Jmp (re-running OptPass2JMP/OptPass1MOV), Mov+Add/Sub -> Lea,
    Mov+Movzx/sx source forwarding, Mov/Mov/Mov -> XChg,
    Mov+Sar $31/$63 -> Cdq/Cqo (plus the i386 Abs() pattern),
    Mov/Mov+Sar -> Mov+Cltd, arithmetic-op folding into a memory operand,
    and (x86_64 only) the 32-bit "(x + y) shr 1" addition/carry pattern. }
function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

  { True when replacing three MOVs with an XCHG is considered a win on the
    current target (always when optimising for size). }
  function IsXCHGAcceptable: Boolean; inline;
    begin
      { Always accept if optimising for size }
      Result := (cs_opt_size in current_settings.optimizerswitches) or
        (
{$ifdef x86_64}
          { XCHG takes 3 cycles on AMD Athlon64 }
          (current_settings.optimizecputype >= cpu_core_i)
{$else x86_64}
          { From the Pentium M onwards, XCHG only has a latency of 2 rather
            than 3, so it becomes a saving compared to three MOVs with two of
            them able to execute simultaneously. [Kit] }
          (current_settings.optimizecputype >= cpu_PentiumM)
{$endif x86_64}
        );
    end;

  var
    NewRef: TReference;
    hp1,hp2,hp3: tai;
{$ifndef x86_64}
    hp4: tai;
    OperIdx: Integer;
{$endif x86_64}
  begin
    Result:=false;
    if not GetNextInstruction(p, hp1) then
      Exit;
    if MatchInstruction(hp1, A_JMP, [S_NO]) then
      begin
        { Sometimes the MOVs that OptPass2JMP produces can be improved
          further, but we can't just put this jump optimisation in pass 1
          because it tends to perform worse when conditional jumps are
          nearby (e.g. when converting CMOV instructions). [Kit] }
        if OptPass2JMP(hp1) then
          { call OptPass1MOV once to potentially merge any MOVs that were created }
          Result := OptPass1MOV(p)
          { OptPass2MOV will now exit but will be called again if OptPass1MOV
            returned True and the instruction is still a MOV, thus checking
            the optimisations below }
          { If OptPass2JMP returned False, no optimisations were done to
            the jump and there are no further optimisations that can be done
            to the MOV instruction on this pass }
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
      MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp1),top_const,top_reg) and
      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
      { be lazy, checking separately for sub would be slightly better }
      (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
      begin
        { Change:
            movl/q %reg1,%reg2      movl/q %reg1,%reg2
            addl/q $x,%reg2         subl/q $x,%reg2
          To:
            leal/q x(%reg1),%reg2   leal/q -x(%reg1),%reg2
        }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
        if not GetNextInstruction(hp1, hp2) or
          (
            { The FLAGS register isn't always tracked properly, so do not
              perform this optimisation if a conditional statement follows }
            not RegReadByInstruction(NR_DEFAULTFLAGS, hp2) and
            not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)
          ) then
          begin
            reference_reset(NewRef, 1, []);
            NewRef.base := taicpu(p).oper[0]^.reg;
            NewRef.scalefactor := 1;
            if taicpu(hp1).opcode = A_ADD then
              begin
                DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
                NewRef.offset := taicpu(hp1).oper[0]^.val;
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
                NewRef.offset := -taicpu(hp1).oper[0]^.val;
              end;
            taicpu(p).opcode := A_LEA;
            taicpu(p).loadref(0, NewRef);
            Asml.Remove(hp1);
            hp1.Free;
            Result := True;
            Exit;
          end;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
{$ifdef x86_64}
      MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
{$else x86_64}
      MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
{$endif x86_64}
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
      { mov reg1, reg2                 mov reg1, reg2
        movzx/sx reg2, reg3      to    movzx/sx reg1, reg3 }
      begin
        taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
        DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
        { Don't remove the MOV command without first checking that reg2 isn't used afterwards,
          or unless supreg(reg3) = supreg(reg2)). [Kit] }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
          not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
        then
          begin
            asml.remove(p);
            p.free;
            p := hp1;
            Result:=true;
          end;
        exit;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      IsXCHGAcceptable and
      { skip byte-sized (S_B) operands }
      (taicpu(p).opsize <> S_B) and
      MatchInstruction(hp1, A_MOV, []) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2, A_MOV, []) and
      { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
      begin
        { mov %reg1,%reg2
          mov %reg3,%reg1        ->  xchg %reg3,%reg1
          mov %reg2,%reg3
          (%reg2 not used afterwards)
          Note that xchg takes 3 cycles to execute, and generally mov's take
          only one cycle apiece, but the first two mov's can be executed in
          parallel, only taking 2 cycles overall.  Older processors should
          therefore only optimise for size. [Kit]
        }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
        if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
            AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
            taicpu(hp1).opcode := A_XCHG;
            asml.Remove(p);
            asml.Remove(hp2);
            p.Free;
            hp2.Free;
            p := hp1;
            Result := True;
            Exit;
          end;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      MatchInstruction(hp1, A_SAR, []) then
      begin
        if MatchOperand(taicpu(hp1).oper[0]^, 31) then
          begin
            { the use of %edx also covers the opsize being S_L }
            if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
              begin
                { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
                if (taicpu(p).oper[0]^.reg = NR_EAX) and
                  (taicpu(p).oper[1]^.reg = NR_EDX) then
                  begin
                    { Change:
                        movl %eax,%edx
                        sarl $31,%edx
                      To:
                        cltd
                    }
                    DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
                    Asml.Remove(hp1);
                    hp1.Free;
                    taicpu(p).opcode := A_CDQ;
                    taicpu(p).opsize := S_NO;
                    taicpu(p).clearop(1);
                    taicpu(p).clearop(0);
                    taicpu(p).ops:=0;
                    Result := True;
                  end
                else if (cs_opt_size in current_settings.optimizerswitches) and
                  (taicpu(p).oper[0]^.reg = NR_EDX) and
                  (taicpu(p).oper[1]^.reg = NR_EAX) then
                  begin
                    { Change:
                        movl %edx,%eax
                        sarl $31,%edx
                      To:
                        movl %edx,%eax
                        cltd
                      Note that this creates a dependency between the two instructions,
                      so only perform if optimising for size.
                    }
                    DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
                    taicpu(hp1).opcode := A_CDQ;
                    taicpu(hp1).opsize := S_NO;
                    taicpu(hp1).clearop(1);
                    taicpu(hp1).clearop(0);
                    taicpu(hp1).ops:=0;
                  end;
{$ifndef x86_64}
              end
            { Don't bother if CMOV is supported, because a more optimal
              sequence would have been generated for the Abs() intrinsic }
            else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
              { the use of %eax also covers the opsize being S_L }
              MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
              (taicpu(p).oper[0]^.reg = NR_EAX) and
              (taicpu(p).oper[1]^.reg = NR_EDX) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_XOR, [S_L]) and
              MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
              MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
              GetNextInstruction(hp2, hp3) and
              MatchInstruction(hp3, A_SUB, [S_L]) and
              MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
              MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
              begin
                { Change:
                    movl %eax,%edx
                    sarl $31,%eax
                    xorl %eax,%edx
                    subl %eax,%edx
                    (Instruction that uses %edx)
                    (%eax deallocated)
                    (%edx deallocated)
                  To:
                    cltd
                    xorl %edx,%eax  <-- Note the registers have swapped
                    subl %edx,%eax
                    (Instruction that uses %eax) <-- %eax rather than %edx
                }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
                  begin
                    if GetNextInstruction(hp3, hp4) and
                      not RegModifiedByInstruction(NR_EDX, hp4) and
                      not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
                        taicpu(p).opcode := A_CDQ;
                        taicpu(p).clearop(1);
                        taicpu(p).clearop(0);
                        taicpu(p).ops:=0;
                        AsmL.Remove(hp1);
                        hp1.Free;
                        taicpu(hp2).loadreg(0, NR_EDX);
                        taicpu(hp2).loadreg(1, NR_EAX);
                        taicpu(hp3).loadreg(0, NR_EDX);
                        taicpu(hp3).loadreg(1, NR_EAX);
                        AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
                        { Convert references in the following instruction (hp4) from %edx to %eax }
                        for OperIdx := 0 to taicpu(hp4).ops - 1 do
                          with taicpu(hp4).oper[OperIdx]^ do
                            case typ of
                              top_reg:
                                if reg = NR_EDX then
                                  reg := NR_EAX;
                              top_ref:
                                begin
                                  if ref^.base = NR_EDX then
                                    ref^.base := NR_EAX;
                                  if ref^.index = NR_EDX then
                                    ref^.index := NR_EAX;
                                end;
                              else
                                ;
                            end;
                      end;
                  end;
{$else x86_64}
              end;
          end
        else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
          { the use of %rdx also covers the opsize being S_Q }
          MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
          begin
            { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
            if (taicpu(p).oper[0]^.reg = NR_RAX) and
              (taicpu(p).oper[1]^.reg = NR_RDX) then
              begin
                { Change:
                    movq %rax,%rdx
                    sarq $63,%rdx
                  To:
                    cqto
                }
                DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
                Asml.Remove(hp1);
                hp1.Free;
                taicpu(p).opcode := A_CQO;
                taicpu(p).opsize := S_NO;
                taicpu(p).clearop(1);
                taicpu(p).clearop(0);
                taicpu(p).ops:=0;
                Result := True;
              end
            else if (cs_opt_size in current_settings.optimizerswitches) and
              (taicpu(p).oper[0]^.reg = NR_RDX) and
              (taicpu(p).oper[1]^.reg = NR_RAX) then
              begin
                { Change:
                    movq %rdx,%rax
                    sarq $63,%rdx
                  To:
                    movq %rdx,%rax
                    cqto
                  Note that this creates a dependency between the two instructions,
                  so only perform if optimising for size.
                }
                DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
                taicpu(hp1).opcode := A_CQO;
                taicpu(hp1).opsize := S_NO;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
{$endif x86_64}
              end;
          end;
      end
    else if MatchInstruction(hp1, A_MOV, []) and
      (taicpu(hp1).oper[1]^.typ = top_reg) then
      { Though "GetNextInstruction" could be factored out, along with
        the instructions that depend on hp2, it is an expensive call that
        should be delayed for as long as possible, hence we do cheaper
        checks first that are likely to be False. [Kit] }
      begin
        if MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
          (
            (
              (taicpu(hp1).oper[1]^.reg = NR_EAX) and
              (
                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
              )
            ) or
            (
              (taicpu(hp1).oper[1]^.reg = NR_EDX) and
              (
                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
              )
            )
          ) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_SAR, []) and
          MatchOperand(taicpu(hp2).oper[0]^, 31) then
          begin
            if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
              begin
                { Change:
                    movl r/m,%edx         movl r/m,%eax          movl r/m,%edx        movl r/m,%eax
                    movl %edx,%eax   or   movl %eax,%edx    or   movl r/m,%eax   or   movl r/m,%edx
                    sarl $31,%edx         sarl $31,%edx          sarl $31,%edx        sarl $31,%edx
                  To:
                    movl r/m,%eax   <- Note the change in register
                    cltd
                }
                DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
                AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
                taicpu(p).loadreg(1, NR_EAX);
                taicpu(hp1).opcode := A_CDQ;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
                AsmL.Remove(hp2);
                hp2.Free;
(*
{$ifdef x86_64}
              end
            else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
              { This code sequence does not get generated - however it might become useful
                if and when 128-bit signed integer types make an appearance, so the code
                is kept here for when it is eventually needed. [Kit] }
              (
                (
                  (taicpu(hp1).oper[1]^.reg = NR_RAX) and
                  (
                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                    MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
                  )
                ) or
                (
                  (taicpu(hp1).oper[1]^.reg = NR_RDX) and
                  (
                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                    MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
                  )
                )
              ) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_SAR, [S_Q]) and
              MatchOperand(taicpu(hp2).oper[0]^, 63) and
              MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
              begin
                { Change:
                    movq r/m,%rdx         movq r/m,%rax          movq r/m,%rdx        movq r/m,%rax
                    movq %rdx,%rax   or   movq %rax,%rdx    or   movq r/m,%rax   or   movq r/m,%rdx
                    sarq $63,%rdx         sarq $63,%rdx          sarq $63,%rdx        sarq $63,%rdx
                  To:
                    movq r/m,%rax   <- Note the change in register
                    cqto
                }
                DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
                AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
                taicpu(p).loadreg(1, NR_RAX);
                taicpu(hp1).opcode := A_CQO;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
                AsmL.Remove(hp2);
                hp2.Free;
{$endif x86_64}
*)
              end;
          end;
      end
    else if (taicpu(p).oper[0]^.typ = top_ref) and
      (hp1.typ = ait_instruction) and
      { while the GetNextInstruction(hp1,hp2) call could be factored out,
        doing it separately in both branches allows to do the cheap checks
        with low probability earlier }
      ((IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
        GetNextInstruction(hp1,hp2) and
        MatchInstruction(hp2,A_MOV,[])
       ) or
       ((taicpu(hp1).opcode=A_LEA) and
        GetNextInstruction(hp1,hp2) and
        MatchInstruction(hp2,A_MOV,[]) and
        ((MatchReference(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
          (taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg)
         ) or
         (MatchReference(taicpu(hp1).oper[0]^.ref^,NR_INVALID,
          taicpu(p).oper[1]^.reg) and
          (taicpu(hp1).oper[0]^.ref^.base<>taicpu(p).oper[1]^.reg)) or
         (MatchReferenceWithOffset(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_NO)) or
         (MatchReferenceWithOffset(taicpu(hp1).oper[0]^.ref^,NR_NO,taicpu(p).oper[1]^.reg))
        ) and
        ((MatchOperand(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^)) or not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)))
       )
      ) and
      MatchOperand(taicpu(hp1).oper[taicpu(hp1).ops-1]^,taicpu(hp2).oper[0]^) and
      (taicpu(hp2).oper[1]^.typ = top_ref) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs,tai(p.next));
        UpdateUsedRegs(TmpUsedRegs,tai(hp1.next));
        if (RefsEqual(taicpu(hp2).oper[1]^.ref^,taicpu(p).oper[0]^.ref^) and
          not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,TmpUsedRegs))) then
          { change   mov            (ref), reg
                     add/sub/or/... reg2/$const, reg
                     mov            reg, (ref)
                     # release reg
            to       add/sub/or/... reg2/$const, (ref)    }
          begin
            case taicpu(hp1).opcode of
              A_INC,A_DEC,A_NOT,A_NEG :
                taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
              A_LEA :
                begin
                  { turn the LEA into an ADD of whichever of index/base is
                    not the loaded register, falling back to the offset }
                  taicpu(hp1).opcode:=A_ADD;
                  if (taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg) and (taicpu(hp1).oper[0]^.ref^.index<>NR_NO) then
                    taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.index)
                  else if (taicpu(hp1).oper[0]^.ref^.base<>taicpu(p).oper[1]^.reg) and (taicpu(hp1).oper[0]^.ref^.base<>NR_NO) then
                    taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.base)
                  else
                    taicpu(hp1).loadconst(0,taicpu(hp1).oper[0]^.ref^.offset);
                  taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
                  DebugMsg(SPeepholeOptimization + 'FoldLea done',hp1);
                end
              else
                taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
            end;
            asml.remove(p);
            asml.remove(hp2);
            p.free;
            hp2.free;
            p := hp1
          end;
        Exit;
{$ifdef x86_64}
      end
    else if (taicpu(p).opsize = S_L) and
      (taicpu(p).oper[1]^.typ = top_reg) and
      (
        MatchInstruction(hp1, A_MOV,[]) and
        (taicpu(hp1).opsize = S_L) and
        (taicpu(hp1).oper[1]^.typ = top_reg)
      ) and (
        GetNextInstruction(hp1, hp2) and
        (tai(hp2).typ=ait_instruction) and
        (taicpu(hp2).opsize = S_Q) and
        (
          (
            MatchInstruction(hp2, A_ADD,[]) and
            (taicpu(hp2).opsize = S_Q) and
            (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
            (
              (
                (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            )
          ) or (
            MatchInstruction(hp2, A_LEA,[]) and
            (taicpu(hp2).oper[0]^.ref^.offset = 0) and
            (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
            (
              (
                (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            ) and (
              (
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            )
          )
        )
      ) and (
        GetNextInstruction(hp2, hp3) and
        MatchInstruction(hp3, A_SHR,[]) and
        (taicpu(hp3).opsize = S_Q) and
        (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
        (taicpu(hp3).oper[0]^.val = 1) and
        (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
      ) then
      begin
        { Change   movl x,    reg1d         movl x,    reg1d
                   movl y,    reg2d         movl y,    reg2d
                   addq reg2q,reg1q   or    leaq (reg1q,reg2q),reg1q
                   shrq $1,   reg1q         shrq $1,   reg1q
          ( reg1d and reg2d can be switched around in the first two instructions )
          To       movl x,    reg1d
                   addl y,    reg1d
                   rcrl $1,   reg1d
          This corresponds to the common expression (x + y) shr 1, where
          x and y are Cardinals (replacing "shr 1" with "div 2" produces
          smaller code, but won't account for x + y causing an overflow). [Kit]
        }
        if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
          { Change first MOV command to have the same register as the final output }
          taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
        else
          taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
        { Change second MOV command to an ADD command. This is easier than
          converting the existing command because it means we don't have to
          touch 'y', which might be a complicated reference, and also the
          fact that the third command might either be ADD or LEA. [Kit] }
        taicpu(hp1).opcode := A_ADD;
        { Delete old ADD/LEA instruction }
        asml.remove(hp2);
        hp2.free;
        { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
        taicpu(hp3).opcode := A_RCR;
        taicpu(hp3).changeopsize(S_L);
        setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{$endif x86_64}
      end;
  end;
{ Folds  "mov reg1,reg2; imul y,reg2"  into the three-operand form
  "imul y,reg1,reg2" and removes the MOV, when reg2 is not used afterwards
  (or the existing 3-operand form already writes back to reg2). }
function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    { The IMUL must have a constant (or full symbolic address) multiplier,
      a register destination and, for an existing 3-operand form, the same
      register as both middle and final operand }
    if (taicpu(p).ops >= 2) and
       ((taicpu(p).oper[0]^.typ = top_const) or
        ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
       (taicpu(p).oper[1]^.typ = top_reg) and
       ((taicpu(p).ops = 2) or
        ((taicpu(p).oper[2]^.typ = top_reg) and
         (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
       { The preceding instruction must be "mov reg1,reg2" writing the
         register that the IMUL operates on }
       GetLastInstruction(p,hp1) and
       MatchInstruction(hp1,A_MOV,[]) and
       MatchOpType(taicpu(hp1),top_reg,top_reg) and
       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
          ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
          { change
              mov reg1,reg2
              imul y,reg2  to  imul y,reg1,reg2 }
          begin
            { Note the order: oper[1] must be copied into oper[2] before it
              is overwritten with the MOV's source register }
            taicpu(p).ops := 3;
            taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
            taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
            DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
            asml.remove(hp1);
            hp1.free;
            result:=true;
          end;
      end;
  end;
  4316. procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
  4317. var
  4318. ThisLabel: TAsmLabel;
  4319. begin
  4320. ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
  4321. ThisLabel.decrefs;
  4322. taicpu(p).opcode := A_RET;
  4323. taicpu(p).is_jmp := false;
  4324. taicpu(p).ops := taicpu(ret_p).ops;
  4325. case taicpu(ret_p).ops of
  4326. 0:
  4327. taicpu(p).clearop(0);
  4328. 1:
  4329. taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
  4330. else
  4331. internalerror(2016041301);
  4332. end;
  4333. { If the original label is now dead, it might turn out that the label
  4334. immediately follows p. As a result, everything beyond it, which will
  4335. be just some final register configuration and a RET instruction, is
  4336. now dead code. [Kit] }
  4337. { NOTE: This is much faster than introducing a OptPass2RET routine and
  4338. running RemoveDeadCodeAfterJump for each RET instruction, because
  4339. this optimisation rarely happens and most RETs appear at the end of
  4340. routines where there is nothing that can be stripped. [Kit] }
  4341. if not ThisLabel.is_used then
  4342. RemoveDeadCodeAfterJump(p);
  4343. end;
{ Optimises an unconditional direct JMP whose destination label is followed
  by a RET (or a single MOV then a RET): the jump is replaced by a copy of
  the RET itself, optionally preceded by a duplicate of the MOV. }
function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
  var
    hp1, hp2, hp3: tai;
    OperIdx: Integer;
  begin
    result:=false;
    { Only plain direct jumps to a local assembler label qualify }
    if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
      begin
        { Find the label's location and skip to the first real instruction }
        hp1:=getlabelwithsym(tasmlabel(taicpu(p).oper[0]^.ref^.symbol));
        if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
          begin
            case taicpu(hp1).opcode of
              A_RET:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         ret
                  into
                         ret
                }
                begin
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end;
              A_MOV:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         mov ##, ##
                         ret
                  into
                         mov ##, ##
                         ret
                }
                { This optimisation tends to increase code size if the pass 1 MOV optimisations aren't
                  re-run, so only do this particular optimisation if optimising for speed or when
                  optimisations are very in-depth. [Kit] }
                if (current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size] then
                  begin
                    GetNextInstruction(hp1, hp2);
                    if not Assigned(hp2) then
                      Exit;
                    if (hp2.typ in [ait_label, ait_align]) then
                      SkipLabels(hp2,hp2);
                    { The MOV must be immediately followed by a RET }
                    if Assigned(hp2) and MatchInstruction(hp2, A_RET, [S_NO]) then
                      begin
                        { Duplicate the MOV instruction }
                        hp3:=tai(hp1.getcopy);
                        asml.InsertBefore(hp3, p);
                        { Make sure the compiler knows about any final registers written here }
                        for OperIdx := 0 to 1 do
                          with taicpu(hp3).oper[OperIdx]^ do
                            begin
                              case typ of
                                top_ref:
                                  begin
                                    if (ref^.base <> NR_NO) {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);
                                    if (ref^.index <> NR_NO) {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                                  end;
                                top_reg:
                                  AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                                else
                                  ;
                              end;
                            end;
                        { Now change the jump into a RET instruction }
                        ConvertJumpToRET(p, hp2);
                        result:=true;
                      end;
                  end;
              else
                ;
            end;
          end;
      end;
  end;
  4427. class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
  4428. begin
  4429. CanBeCMOV:=assigned(p) and
  4430. MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
  4431. { we can't use cmov ref,reg because
  4432. ref could be nil and cmov still throws an exception
  4433. if ref=nil but the mov isn't done (FK)
  4434. or ((taicpu(p).oper[0]^.typ = top_ref) and
  4435. (taicpu(p).oper[0]^.ref^.refaddr = addr_no))
  4436. }
  4437. (taicpu(p).oper[1]^.typ = top_reg) and
  4438. (
  4439. (taicpu(p).oper[0]^.typ = top_reg) or
  4440. { allow references, but only pure symbols or got rel. addressing with RIP as based,
  4441. it is not expected that this can cause a seg. violation }
  4442. (
  4443. (taicpu(p).oper[0]^.typ = top_ref) and
  4444. IsRefSafe(taicpu(p).oper[0]^.ref)
  4445. )
  4446. );
  4447. end;
    function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
      var
        hp1,hp2,hp3,hp4,hpmov2: tai;
        carryadd_opcode : TAsmOp;
        l : Longint;
        condition : TAsmCond;
        symbol: TAsmSymbol;
        reg: tsuperregister;
        regavailable: Boolean;
      begin
        { Pass-2 optimisations for a conditional jump (Jcc):
            * Jcc over a single INC/DEC/ADD 1/SUB 1 -> branchless CMC+ADC/SBB
              or plain ADC/SBB, depending on the carry condition's polarity;
            * Jcc over INC/DEC/ADD 1/SUB 1 -> SETcc + ADD/SUB via a free
              volatile register (not done when optimising for size);
            * Jcc followed by JMP, where the Jcc target is a RET ->
              inverted Jcc + RET;
            * Jcc over one or two groups of MOVs -> CMOVcc sequences
              (only when the target CPU supports CMOV). }
        result:=false;
        symbol:=nil;
        if GetNextInstruction(p,hp1) then
          begin
            symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
            if (hp1.typ=ait_instruction) and
               GetNextInstruction(hp1,hp2) and
               ((hp2.typ=ait_label) or
                { trick to skip align }
                ((hp2.typ=ait_align) and GetNextInstruction(hp2,hp2) and (hp2.typ=ait_label))
               ) and
               (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
              { jb @@1                            cmc
                inc/dec operand           -->     adc/sbb operand,0
                @@1:

                ... and ...

                jnb @@1
                inc/dec operand           -->     adc/sbb operand,0
                @@1: }
              begin
                carryadd_opcode:=A_NONE;
                if Taicpu(p).condition in [C_NAE,C_B,C_C] then
                  begin
                    { jump taken when carry set: invert the carry with CMC so
                      the ADC/SBB below only has an effect when the jump
                      would have fallen through }
                    if (Taicpu(hp1).opcode=A_INC) or
                       ((Taicpu(hp1).opcode=A_ADD) and
                        MatchOptype(Taicpu(hp1),top_const,top_reg) and
                        (Taicpu(hp1).oper[0]^.val=1)
                       ) then
                      carryadd_opcode:=A_ADC;
                    if (Taicpu(hp1).opcode=A_DEC) or
                       ((Taicpu(hp1).opcode=A_SUB) and
                        MatchOptype(Taicpu(hp1),top_const,top_reg) and
                        (Taicpu(hp1).oper[0]^.val=1)
                       ) then
                      carryadd_opcode:=A_SBB;
                    if carryadd_opcode<>A_NONE then
                      begin
                        { turn the conditional jump itself into CMC }
                        Taicpu(p).clearop(0);
                        Taicpu(p).ops:=0;
                        Taicpu(p).is_jmp:=false;
                        Taicpu(p).opcode:=A_CMC;
                        Taicpu(p).condition:=C_NONE;
                        DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                        { rewrite the INC/DEC/ADD/SUB as "adc/sbb $0,operand" }
                        Taicpu(hp1).ops:=2;
                        if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                          Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                        else
                          Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                        Taicpu(hp1).loadconst(0,0);
                        Taicpu(hp1).opcode:=carryadd_opcode;
                        result:=true;
                        exit;
                      end;
                  end
                else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
                  begin
                    { jump taken when carry clear: carry already has the right
                      polarity, no CMC needed and the jump can be removed }
                    if (Taicpu(hp1).opcode=A_INC) or
                       ((Taicpu(hp1).opcode=A_ADD) and
                        MatchOptype(Taicpu(hp1),top_const,top_reg) and
                        (Taicpu(hp1).oper[0]^.val=1)
                       ) then
                      carryadd_opcode:=A_ADC;
                    if (Taicpu(hp1).opcode=A_DEC) or
                       ((Taicpu(hp1).opcode=A_SUB) and
                        MatchOptype(Taicpu(hp1),top_const,top_reg) and
                        (Taicpu(hp1).oper[0]^.val=1)
                       ) then
                      carryadd_opcode:=A_SBB;
                    if carryadd_opcode<>A_NONE then
                      begin
                        Taicpu(hp1).ops:=2;
                        DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                        if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                          Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                        else
                          Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                        Taicpu(hp1).loadconst(0,0);
                        Taicpu(hp1).opcode:=carryadd_opcode;
                        { drop the now-superfluous conditional jump }
                        RemoveCurrentP(p, hp1);
                        result:=true;
                        exit;
                      end;
                  end
                {
                  jcc @@1                                setcc tmpreg
                  inc/dec/add/sub operand      ->        (movzx tmpreg)
                  @@1:                                   add/sub tmpreg,operand

                  While this increases code size slightly, it makes the code much faster if the
                  jump is unpredictable
                }
                else if not(cs_opt_size in current_settings.optimizerswitches) and
                  ((((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
                    (Taicpu(hp1).oper[0]^.typ=top_const) and
                    (Taicpu(hp1).oper[1]^.typ=top_reg) and
                    (Taicpu(hp1).oper[0]^.val=1)) or
                   ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
                  ) then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    { search for an available register which is volatile }
                    regavailable:=false;
                    for reg in tcpuregisterset do
                      begin
                        if (reg in paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption)) and
                           not(reg in TmpUsedRegs[R_INTREGISTER].GetUsedRegs) and
                           not(RegInInstruction(newreg(R_INTREGISTER,reg,R_SUBL),hp1))
    {$ifdef i386}
                           { on i386, only these registers have byte-sized sub registers }
                           and (reg in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
    {$endif i386}
                           then
                          begin
                            regavailable:=true;
                            break;
                          end;
                      end;
                    if regavailable then
                      begin
                        { replace the jump by "setcc tmpreg" with the inverted
                          condition: tmpreg becomes 1 exactly when the jump
                          would not have been taken }
                        Taicpu(p).clearop(0);
                        Taicpu(p).ops:=1;
                        Taicpu(p).is_jmp:=false;
                        Taicpu(p).opcode:=A_SETcc;
                        DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                        Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                        Taicpu(p).loadreg(0,newreg(R_INTREGISTER,reg,R_SUBL));
                        { widen tmpreg with MOVZX if the operand is larger
                          than a byte }
                        if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                          begin
                            case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                              R_SUBW:
                                hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,newreg(R_INTREGISTER,reg,R_SUBL),
                                  newreg(R_INTREGISTER,reg,R_SUBW));
                              R_SUBD,
                              R_SUBQ:
                                { a 32-bit movzx suffices for R_SUBQ as well,
                                  since writing the 32-bit register clears the
                                  upper half on x86-64 }
                                hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,newreg(R_INTREGISTER,reg,R_SUBL),
                                  newreg(R_INTREGISTER,reg,R_SUBD));
                              else
                                Internalerror(2020030601);
                            end;
                            taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                            asml.InsertAfter(hp2,p);
                          end;
                        { bring INC/DEC into the two-operand ADD/SUB form
                          first ... }
                        if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                          begin
                            Taicpu(hp1).ops:=2;
                            Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                          end;
                        { ... then make tmpreg its source operand }
                        Taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)));
                        AllocRegBetween(newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)),p,hp1,UsedRegs);
                      end;
                  end;
              end;
            { Detect the following:
                jmp<cond>       @Lbl1
                jmp             @Lbl2
                ...
              @Lbl1:
                ret

              Change to:

                jmp<inv_cond>   @Lbl2
                ret
            }
            if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
              begin
                hp2:=getlabelwithsym(TAsmLabel(symbol));
                if Assigned(hp2) and SkipLabels(hp2,hp2) and
                   MatchInstruction(hp2,A_RET,[S_NO]) then
                  begin
                    taicpu(p).condition := inverse_cond(taicpu(p).condition);
                    { Change label address to that of the unconditional jump }
                    taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
                    TAsmLabel(symbol).DecRefs;
                    { turn the unconditional jump into a copy of the RET }
                    taicpu(hp1).opcode := A_RET;
                    taicpu(hp1).is_jmp := false;
                    taicpu(hp1).ops := taicpu(hp2).ops;
                    DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                    case taicpu(hp2).ops of
                      0:
                        taicpu(hp1).clearop(0);
                      1:
                        { copy the RET's immediate operand as well }
                        taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                      else
                        internalerror(2016041302);
                    end;
                  end;
              end;
          end;
{$ifndef i8086}
        if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
          begin
            { check for
                jCC   xxx
                <several movs>
             xxx:
            }
            l:=0;
            GetNextInstruction(p, hp1);
            { count how many CMOV-convertible MOVs follow the jump }
            while assigned(hp1) and
                  CanBeCMOV(hp1) and
                  { stop on labels }
                  not(hp1.typ=ait_label) do
              begin
                inc(l);
                GetNextInstruction(hp1,hp1);
              end;
            if assigned(hp1) then
              begin
                if FindLabel(tasmlabel(symbol),hp1) then
                  begin
                    { the jump skips exactly those MOVs: convert them to
                      CMOVcc with the inverse condition }
                    if (l<=4) and (l>0) then
                      begin
                        condition:=inverse_cond(taicpu(p).condition);
                        GetNextInstruction(p,hp1);
                        repeat
                          if not Assigned(hp1) then
                            InternalError(2018062900);
                          taicpu(hp1).opcode:=A_CMOVcc;
                          taicpu(hp1).condition:=condition;
                          UpdateUsedRegs(hp1);
                          GetNextInstruction(hp1,hp1);
                        until not(CanBeCMOV(hp1));
                        { Remember what hp1 is in case there's multiple aligns to get rid of }
                        hp2 := hp1;
                        repeat
                          if not Assigned(hp2) then
                            InternalError(2018062910);
                          case hp2.typ of
                            ait_label:
                              { What we expected - break out of the loop (it won't be a dead label at the top of
                                a cluster because that was optimised at an earlier stage) }
                              Break;
                            ait_align:
                              { Go to the next entry until a label is found (may be multiple aligns before it) }
                              begin
                                hp2 := tai(hp2.Next);
                                Continue;
                              end;
                            else
                              begin
                                { Might be a comment or temporary allocation entry }
                                if not (hp2.typ in SkipInstr) then
                                  InternalError(2018062911);
                                hp2 := tai(hp2.Next);
                                Continue;
                              end;
                          end;
                        until False;
                        { Now we can safely decrement the reference count }
                        tasmlabel(symbol).decrefs;
                        DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
                        { Remove the original jump }
                        asml.Remove(p);
                        p.Free;
                        GetNextInstruction(hp2, p); { Instruction after the label }
                        { Remove the label if this is its final reference }
                        if (tasmlabel(symbol).getrefs=0) then
                          StripLabelFast(hp1);
                        if Assigned(p) then
                          begin
                            UpdateUsedRegs(p);
                            result:=true;
                          end;
                        exit;
                      end;
                  end
                else
                  begin
                    { check further for
                        jCC   xxx
                        <several movs 1>
                        jmp   yyy
                xxx:
                        <several movs 2>
                yyy:
                    }
                    { hp2 points to jmp yyy }
                    hp2:=hp1;
                    { skip hp1 to xxx (or an align right before it) }
                    GetNextInstruction(hp1, hp1);
                    if assigned(hp2) and
                       assigned(hp1) and
                       (l<=3) and
                       (hp2.typ=ait_instruction) and
                       (taicpu(hp2).is_jmp) and
                       (taicpu(hp2).condition=C_None) and
                       { real label and jump, no further references to the
                         label are allowed }
                       (tasmlabel(symbol).getrefs=1) and
                       FindLabel(tasmlabel(symbol),hp1) then
                      begin
                        l:=0;
                        { skip hp1 to <several moves 2> }
                        if (hp1.typ = ait_align) then
                          GetNextInstruction(hp1, hp1);
                        GetNextInstruction(hp1, hpmov2);
                        hp1 := hpmov2;
                        { count the second group of CMOV-convertible MOVs }
                        while assigned(hp1) and
                              CanBeCMOV(hp1) do
                          begin
                            inc(l);
                            GetNextInstruction(hp1, hp1);
                          end;
                        { hp1 points to yyy (or an align right before it) }
                        hp3 := hp1;
                        if assigned(hp1) and
                           FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
                          begin
                            { convert <several movs 1> with the inverted
                              condition ... }
                            condition:=inverse_cond(taicpu(p).condition);
                            GetNextInstruction(p,hp1);
                            repeat
                              taicpu(hp1).opcode:=A_CMOVcc;
                              taicpu(hp1).condition:=condition;
                              UpdateUsedRegs(hp1);
                              GetNextInstruction(hp1,hp1);
                            until not(assigned(hp1)) or
                                  not(CanBeCMOV(hp1));
                            { ... and <several movs 2> with the original one }
                            condition:=inverse_cond(condition);
                            hp1 := hpmov2;
                            { hp1 is now at <several movs 2> }
                            while Assigned(hp1) and CanBeCMOV(hp1) do
                              begin
                                taicpu(hp1).opcode:=A_CMOVcc;
                                taicpu(hp1).condition:=condition;
                                UpdateUsedRegs(hp1);
                                GetNextInstruction(hp1,hp1);
                              end;
                            hp1 := p;
                            { Get first instruction after label }
                            GetNextInstruction(hp3, p);
                            if assigned(p) and (hp3.typ = ait_align) then
                              GetNextInstruction(p, p);
                            { Don't dereference yet, as doing so will cause
                              GetNextInstruction to skip the label and
                              optional align marker. [Kit] }
                            GetNextInstruction(hp2, hp4);
                            DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
                            { remove jCC }
                            asml.remove(hp1);
                            hp1.free;
                            { Now we can safely decrement it }
                            tasmlabel(symbol).decrefs;
                            { Remove label xxx (it will have a ref of zero due to the initial check }
                            StripLabelFast(hp4);
                            { remove jmp }
                            symbol := taicpu(hp2).oper[0]^.ref^.symbol;
                            asml.remove(hp2);
                            hp2.free;
                            { As before, now we can safely decrement it }
                            tasmlabel(symbol).decrefs;
                            { Remove label yyy (and the optional alignment) if its reference falls to zero }
                            if tasmlabel(symbol).getrefs = 0 then
                              StripLabelFast(hp3);
                            if Assigned(p) then
                              begin
                                UpdateUsedRegs(p);
                                result:=true;
                              end;
                            exit;
                          end;
                      end;
                  end;
              end;
          end;
{$endif i8086}
      end;
    function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
      var
        hp1,hp2: tai;
        reg_and_hp1_is_instr: Boolean;
      begin
        { Pass-1 optimisations for MOVSX/MOVZX:
            * folds the extension into a following foldable arithmetic op
              plus store-back MOV;
            * merges "movx reg1,reg2; mov reg2,reg3" into "movx reg1,reg3"
              when reg2 dies;
            * for MOVZX only: removes AND masks made redundant by the
              zero-extension and rewrites some movzx forms into cheaper
              AND/MOV synonyms when MOVZX is not acceptable on the target. }
        result:=false;
        { common precondition shared by several branches below }
        reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
          GetNextInstruction(p,hp1) and
          (hp1.typ = ait_instruction);
        if reg_and_hp1_is_instr and
           IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
           GetNextInstruction(hp1,hp2) and
           MatchInstruction(hp2,A_MOV,[]) and
           (taicpu(hp2).oper[0]^.typ = top_reg) and
           OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
{$ifdef i386}
           { not all registers have byte size sub registers on i386 }
           ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
           (((taicpu(hp1).ops=2) and
             (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
            ((taicpu(hp1).ops=1) and
             (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
           not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
          begin
            { change   movsX/movzX    reg/ref, reg2
                       add/sub/or/... reg3/$const, reg2
                       mov            reg2 reg/ref
              to      add/sub/or/... reg3/$const, reg/ref }

            { by example:
                movswl  %si,%eax        movswl  %si,%eax      p
                decl    %eax            addl    %edx,%eax     hp1
                movw    %ax,%si         movw    %ax,%si       hp2
              ->
                movswl  %si,%eax        movswl  %si,%eax      p
                decw    %eax            addw    %edx,%eax     hp1
                movw    %ax,%si         movw    %ax,%si       hp2
            }
            { narrow the arithmetic op to the store-back size }
            taicpu(hp1).changeopsize(taicpu(hp2).opsize);
            {
              ->
                movswl  %si,%eax        movswl  %si,%eax      p
                decw    %si             addw    %dx,%si       hp1
                movw    %ax,%si         movw    %ax,%si       hp2
            }
            { retarget the arithmetic op at the original source operand }
            case taicpu(hp1).ops of
              1:
                taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
              2:
                begin
                  taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
                  if (taicpu(hp1).oper[0]^.typ = top_reg) then
                    setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                end;
              else
                internalerror(2008042701);
            end;
            {
              ->
                decw    %si             addw    %dx,%si       p
            }
            DebugMsg(SPeepholeOptimization + 'var3',p);
            { both the extension and the trailing mov are now superfluous }
            asml.remove(p);
            asml.remove(hp2);
            p.free;
            hp2.free;
            p:=hp1;
          end
        else if reg_and_hp1_is_instr and
           (taicpu(hp1).opcode = A_MOV) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
           (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
{$ifdef x86_64}
            { check for implicit extension to 64 bit }
            or
            ((taicpu(p).opsize in [S_BL,S_WL]) and
             (taicpu(hp1).opsize=S_Q) and
             SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
            )
{$endif x86_64}
           )
           then
          begin
            { change
                movx   %reg1,%reg2
                mov    %reg2,%reg3
                dealloc %reg2

                into

                movx   %reg,%reg3
            }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
{$ifdef x86_64}
                { when the mov did an implicit 32->64 bit extension, keep
                  writing the 32-bit form of the final destination }
                if (taicpu(p).opsize in [S_BL,S_WL]) and
                   (taicpu(hp1).opsize=S_Q) then
                  taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
                else
{$endif x86_64}
                  taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
                asml.remove(hp1);
                hp1.Free;
              end;
          end
        else if taicpu(p).opcode=A_MOVZX then
          begin
            { removes superfluous And's after movzx's }
            if reg_and_hp1_is_instr and
               (taicpu(hp1).opcode = A_AND) and
               MatchOpType(taicpu(hp1),top_const,top_reg) and
               (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
              begin
                { a mask keeping exactly the zero-extended bits is a no-op }
                case taicpu(p).opsize Of
                  S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val = $ff) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var4',p);
                        asml.remove(hp1);
                        hp1.free;
                      end;
                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val = $ffff) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var5',p);
                        asml.remove(hp1);
                        hp1.free;
                      end;
{$ifdef x86_64}
                  S_LQ:
                    if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                      begin
                        if (cs_asm_source in current_settings.globalswitches) then
                          asml.insertbefore(tai_comment.create(strpnew(SPeepholeOptimization + 'var6')),p);
                        asml.remove(hp1);
                        hp1.Free;
                      end;
{$endif x86_64}
                  else
                    ;
                end;
              end;
            { changes some movzx constructs to faster synonyms (all examples
              are given with eax/ax, but are also valid for other registers)}
            if MatchOpType(taicpu(p),top_reg,top_reg) then
              begin
                case taicpu(p).opsize of
                  { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
                    (the machine code is equivalent to movzbl %al,%eax), but the
                    code generator still generates that assembler instruction and
                    it is silently converted. This should probably be checked.
                    [Kit] }
                  S_BW:
                    begin
                      if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                         (
                           not IsMOVZXAcceptable
                           { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
                           or (
                             (cs_opt_size in current_settings.optimizerswitches) and
                             (taicpu(p).oper[1]^.reg = NR_AX)
                           )
                         ) then
                        {Change "movzbw %al, %ax" to "andw $0x0ffh, %ax"}
                        begin
                          DebugMsg(SPeepholeOptimization + 'var7',p);
                          taicpu(p).opcode := A_AND;
                          taicpu(p).changeopsize(S_W);
                          taicpu(p).loadConst(0,$ff);
                          Result := True;
                        end
                      else if not IsMOVZXAcceptable and
                         GetNextInstruction(p, hp1) and
                         (tai(hp1).typ = ait_instruction) and
                         (taicpu(hp1).opcode = A_AND) and
                         MatchOpType(taicpu(hp1),top_const,top_reg) and
                         (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                        { Change "movzbw %reg1, %reg2; andw $const, %reg2"
                          to "movw %reg1, reg2; andw $(const1 and $ff), %reg2"}
                        begin
                          DebugMsg(SPeepholeOptimization + 'var8',p);
                          taicpu(p).opcode := A_MOV;
                          taicpu(p).changeopsize(S_W);
                          setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
                          taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                          Result := True;
                        end;
                    end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax }
                  S_BL:
                    begin
                      if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                         (
                           not IsMOVZXAcceptable
                           { and $0xff,%eax has a smaller encoding but risks a partial write penalty }
                           or (
                             (cs_opt_size in current_settings.optimizerswitches) and
                             (taicpu(p).oper[1]^.reg = NR_EAX)
                           )
                         ) then
                        { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
                        begin
                          DebugMsg(SPeepholeOptimization + 'var9',p);
                          taicpu(p).opcode := A_AND;
                          taicpu(p).changeopsize(S_L);
                          taicpu(p).loadConst(0,$ff);
                          Result := True;
                        end
                      else if not IsMOVZXAcceptable and
                         GetNextInstruction(p, hp1) and
                         (tai(hp1).typ = ait_instruction) and
                         (taicpu(hp1).opcode = A_AND) and
                         MatchOpType(taicpu(hp1),top_const,top_reg) and
                         (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                        { Change "movzbl %reg1, %reg2; andl $const, %reg2"
                          to "movl %reg1, reg2; andl $(const1 and $ff), %reg2"}
                        begin
                          DebugMsg(SPeepholeOptimization + 'var10',p);
                          taicpu(p).opcode := A_MOV;
                          taicpu(p).changeopsize(S_L);
                          { do not use R_SUBWHOLE
                            as movl %rdx,%eax
                            is invalid in assembler PM }
                          setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                          taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                          Result := True;
                        end;
                    end;
{$endif i8086}
                  S_WL:
                    if not IsMOVZXAcceptable then
                      begin
                        if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                          { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
                          begin
                            DebugMsg(SPeepholeOptimization + 'var11',p);
                            taicpu(p).opcode := A_AND;
                            taicpu(p).changeopsize(S_L);
                            taicpu(p).loadConst(0,$ffff);
                            Result := True;
                          end
                        else if GetNextInstruction(p, hp1) and
                           (tai(hp1).typ = ait_instruction) and
                           (taicpu(hp1).opcode = A_AND) and
                           (taicpu(hp1).oper[0]^.typ = top_const) and
                           (taicpu(hp1).oper[1]^.typ = top_reg) and
                           (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                          { Change "movzwl %reg1, %reg2; andl $const, %reg2"
                            to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2"}
                          begin
                            DebugMsg(SPeepholeOptimization + 'var12',p);
                            taicpu(p).opcode := A_MOV;
                            taicpu(p).changeopsize(S_L);
                            { do not use R_SUBWHOLE
                              as movl %rdx,%eax
                              is invalid in assembler PM }
                            setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                            Result := True;
                          end;
                      end;
                  else
                    InternalError(2017050705);
                end;
              end
            else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
              begin
                { memory source: the movzx load is kept (conversion to a plain
                  mov is commented out below); only the following AND's size
                  and mask are adjusted }
                if GetNextInstruction(p, hp1) and
                   (tai(hp1).typ = ait_instruction) and
                   (taicpu(hp1).opcode = A_AND) and
                   MatchOpType(taicpu(hp1),top_const,top_reg) and
                   (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                  begin
                    //taicpu(p).opcode := A_MOV;
                    case taicpu(p).opsize Of
                      S_BL:
                        begin
                          DebugMsg(SPeepholeOptimization + 'var13',p);
                          taicpu(hp1).changeopsize(S_L);
                          taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        end;
                      S_WL:
                        begin
                          DebugMsg(SPeepholeOptimization + 'var14',p);
                          taicpu(hp1).changeopsize(S_L);
                          taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        end;
                      S_BW:
                        begin
                          DebugMsg(SPeepholeOptimization + 'var15',p);
                          taicpu(hp1).changeopsize(S_W);
                          taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        end;
                      else
                        Internalerror(2017050704)
                    end;
                    Result := True;
                  end;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
      var
        hp1 : tai;
        MaskLength : Cardinal;
      begin
        { Pass-1 optimisations for AND:
            * merges two consecutive ANDs on the same register;
            * removes a following MOVZX/MOVSX made redundant by the mask;
            * drops an AND whose unmasked bits are shifted out by a
              following SHL;
            * converts "and x,reg; jcc" into "test x,reg; jcc" when reg dies;
            * rewrites all-ones masks as "and reg,reg". }
        Result:=false;
        if GetNextInstruction(p, hp1) then
          begin
            if MatchOpType(taicpu(p),top_const,top_reg) and
               MatchInstruction(hp1,A_AND,[]) and
               MatchOpType(taicpu(hp1),top_const,top_reg) and
               (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
               { the second register must contain the first one, so compare their subreg types }
               (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
               (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
              { change
                  and const1, reg
                  and const2, reg
                to
                  and (const1 and const2), reg
              }
              begin
                taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
                DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
                { drop the first AND, keep the combined one }
                asml.remove(p);
                p.Free;
                p:=hp1;
                Result:=true;
                exit;
              end
            else if MatchOpType(taicpu(p),top_const,top_reg) and
               MatchInstruction(hp1,A_MOVZX,[]) and
               (taicpu(hp1).oper[0]^.typ = top_reg) and
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
               (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
               (((taicpu(p).opsize=S_W) and
                 (taicpu(hp1).opsize=S_BW)) or
                ((taicpu(p).opsize=S_L) and
                 (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
                or
                ((taicpu(p).opsize=S_Q) and
                 (taicpu(hp1).opsize in [S_BQ,S_WQ]))
{$endif x86_64}
               ) then
              begin
                { the movzx is superfluous when the AND constant already fits
                  in the source size of the extension }
                if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                    ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
                   ) or
                   (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                    ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
                   then
                  begin
                    { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                      32-bit register to a 64-bit register, or even a version called MOVZXD, so
                      code that tests for the presence of AND 0xffffffff followed by MOVZX is
                      wasted, and is indicative of a compiler bug if it were triggered. [Kit]

                      NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                    }
                    DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                    asml.remove(hp1);
                    hp1.free;
                    Exit;
                  end;
              end
            else if MatchOpType(taicpu(p),top_const,top_reg) and
               MatchInstruction(hp1,A_SHL,[]) and
               MatchOpType(taicpu(hp1),top_const,top_reg) and
               (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
              begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
                { get length of potential and mask }
                MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;

                { really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
                if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
                   { unmasked part shifted out? }
                   ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
                    { the AND is redundant; the SHL discards the bits anyway }
                    RemoveCurrentP(p, hp1);
                    Result:=true;
                    exit;
                  end;
              end
            else if MatchOpType(taicpu(p),top_const,top_reg) and
               MatchInstruction(hp1,A_MOVSX{$ifdef x86_64},A_MOVSXD{$endif x86_64},[]) and
               (taicpu(hp1).oper[0]^.typ = top_reg) and
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
               (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
               (((taicpu(p).opsize=S_W) and
                 (taicpu(hp1).opsize=S_BW)) or
                ((taicpu(p).opsize=S_L) and
                 (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
                or
                ((taicpu(p).opsize=S_Q) and
                 (taicpu(hp1).opsize in [S_BQ,S_WQ,S_LQ]))
{$endif x86_64}
               ) then
              begin
                { the movsx is superfluous when the AND cleared the sign bit
                  of the extension's source size, so sign- and zero-extension
                  give the same result }
                if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                    ((taicpu(p).oper[0]^.val and $7f)=taicpu(p).oper[0]^.val)
                   ) or
                   (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                    ((taicpu(p).oper[0]^.val and $7fff)=taicpu(p).oper[0]^.val))
{$ifdef x86_64}
                   or
                   (((taicpu(hp1).opsize)=S_LQ) and
                    ((taicpu(p).oper[0]^.val and $7fffffff)=taicpu(p).oper[0]^.val)
                   )
{$endif x86_64}
                   then
                  begin
                    DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                    asml.remove(hp1);
                    hp1.free;
                    Exit;
                  end;
              end
            else if (taicpu(p).oper[1]^.typ = top_reg) and
               (hp1.typ = ait_instruction) and
               (taicpu(hp1).is_jmp) and
               (taicpu(hp1).opcode<>A_JMP) and
               not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
              begin
                { change
                    and x, reg
                    jxx
                  to
                    test x, reg
                    jxx
                  if reg is deallocated before the
                  jump, but only if it's a conditional jump (PFV)
                }
                taicpu(p).opcode := A_TEST;
                Exit;
              end;
          end;

        { Lone AND tests }
        if MatchOpType(taicpu(p),top_const,top_reg) then
          begin
            {
              - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
              - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
              - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
            }
            if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
               ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
               ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
              begin
                taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
                if taicpu(p).opsize = S_L then
                  { note for a later check that this "and reg,reg" may be
                    combinable with a preceding mov }
                  Include(OptsToCheck,aoc_MovAnd2Mov_3);
              end;
          end;
      end;
  5286. function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  5287. begin
  5288. Result:=false;
  5289. if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
  5290. MatchReference(taicpu(p).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
  5291. (taicpu(p).oper[0]^.ref^.index<>NR_NO) then
  5292. begin
  5293. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.base);
  5294. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.index);
  5295. taicpu(p).opcode:=A_ADD;
  5296. DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
  5297. result:=true;
  5298. end
  5299. else if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
  5300. MatchReference(taicpu(p).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) and
  5301. (taicpu(p).oper[0]^.ref^.base<>NR_NO) then
  5302. begin
  5303. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
  5304. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.base);
  5305. taicpu(p).opcode:=A_ADD;
  5306. DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
  5307. result:=true;
  5308. end;
  5309. end;
function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
  var
    hp1: tai; NewRef: TReference;
  begin
    { Change:
        subl/q $x,%reg1
        movl/q %reg1,%reg2
      To:
        leal/q $-x(%reg1),%reg2
        subl/q $x,%reg1
      Breaks the dependency chain and potentially permits the removal of
      a CMP instruction if one follows.
    }
    Result := False;
    { not applied when optimising for size }
    if not (cs_opt_size in current_settings.optimizerswitches) and
      (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
      MatchOpType(taicpu(p),top_const,top_reg) and
      GetNextInstruction(p, hp1) and
      { the following MOV must have the same operand size as the SUB }
      MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      { and must copy exactly the register the SUB wrote }
      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
      begin
        { Change the MOV instruction to a LEA instruction, and update the
          first operand }
        reference_reset(NewRef, 1, []);
        NewRef.base := taicpu(p).oper[1]^.reg;
        NewRef.scalefactor := 1;
        { LEA computes reg1 - x (without affecting the flags) }
        NewRef.offset := -taicpu(p).oper[0]^.val;
        taicpu(hp1).opcode := A_LEA;
        taicpu(hp1).loadref(0, NewRef);
        { Move what is now the LEA instruction to before the SUB instruction }
        Asml.Remove(hp1);
        Asml.InsertBefore(hp1, p);
        { the source register is now read by both instructions; keep it
          allocated across the pair }
        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
        DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
        Result := True;
      end;
  end;
  5348. function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
  5349. begin
  5350. { we can skip all instructions not messing with the stack pointer }
  5351. while assigned(hp1) and {MatchInstruction(taicpu(hp1),[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
  5352. A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
  5353. A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
  5354. A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
  5355. ({(taicpu(hp1).ops=0) or }
  5356. ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
  5357. (MatchOpType(taicpu(hp1),top_ref,top_reg))
  5358. ) and }
  5359. not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
  5360. )
  5361. ) do
  5362. GetNextInstruction(hp1,hp1);
  5363. Result:=assigned(hp1);
  5364. end;
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    { replace
        leal(q) x(<stackpointer>),<stackpointer>
        call procname
        leal(q) -x(<stackpointer>),<stackpointer>
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_ref,top_reg) and
      (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and
      { the -8 or -24 are not required, but bail out early if possible,
        higher values are unlikely }
      ((taicpu(p).oper[0]^.ref^.offset=-8) or
       (taicpu(p).oper[0]^.ref^.offset=-24)) and
      { the reference must be a plain sp+offset, nothing else }
      (taicpu(p).oper[0]^.ref^.symbol=nil) and
      (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      { ignore instructions between the LEA and the CALL that do not
        touch the stack pointer }
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_ref,top_reg) and
      { the second LEA must undo exactly the adjustment made by the first }
      (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
      (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
      (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        { turn the CALL into a tail jump and drop the now-redundant
          stack adjustments and the RET }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
        { remove the first LEA; hp4 (the instruction after it) becomes
          the new current position }
        RemoveCurrentP(p, hp4);
        AsmL.Remove(hp2);
        hp2.free;
        AsmL.Remove(hp3);
        hp3.free;
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    { replace
        push %rax
        call procname
        pop %rcx
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
      It depends on the fact, that the sequence push rax/pop rcx is used
      for stack alignment as rcx is volatile for all supported calling
      conventions
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_reg) and
      (taicpu(p).oper[0]^.reg=NR_RAX) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      { ignore instructions between the PUSH and the CALL that do not
        touch the stack pointer }
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      { the POP must have the same operand size as the PUSH }
      MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_reg) and
      (taicpu(hp2).oper[0]^.reg=NR_RCX) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        { turn the CALL into a tail jump; the push/pop pair and the RET
          are no longer needed }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        { NOTE(review): the message says 'PushCall*Push*Ret2Jmp' although
          the matched tail instruction is a POP - possibly a historical
          name; confirm before renaming the string }
        DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
        { remove the PUSH; hp4 (the instruction after it) becomes the
          new current position }
        RemoveCurrentP(p, hp4);
        AsmL.Remove(hp2);
        hp2.free;
        AsmL.Remove(hp3);
        hp3.free;
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
  var
    Value, RegName: string;
  begin
    Result:=false;
    { rewrite "mov $const,%reg" into smaller encodings where the
      constant permits it }
    if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
      begin
        case taicpu(p).oper[0]^.val of
          0:
            { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
            if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $0,%reg" into "xor %reg,%reg" }
                taicpu(p).opcode := A_XOR;
                taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
          $1..$FFFFFFFF:
            begin
              { Code size reduction by J. Gareth "Kit" Moreton }
              { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
              case taicpu(p).opsize of
                S_Q:
                  begin
                    { capture the 64-bit spellings before mutating p,
                      for the debug message below }
                    RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                    Value := debug_tostr(taicpu(p).oper[0]^.val);
                    { The actual optimization }
                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                    taicpu(p).changeopsize(S_L);
                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                    Result := True;
                  end;
                else
                  { Do nothing };
              end;
            end;
          -1:
            { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
            if (cs_opt_size in current_settings.optimizerswitches) and
               (taicpu(p).opsize <> S_B) and
               not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $-1,%reg" into "or $-1,%reg" }
                { NOTES:
                  - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                  - This operation creates a false dependency on the register, so only do it when optimising for size
                  - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                }
                taicpu(p).opcode := A_OR;
                Result := True;
              end;
        end;
      end;
  end;
  5526. function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  5527. begin
  5528. Result := False;
  5529. if not MatchOpType(taicpu(p), top_reg, top_reg) then
  5530. Exit;
  5531. { Convert:
  5532. movswl %ax,%eax -> cwtl
  5533. movslq %eax,%rax -> cdqe
  5534. NOTE: Don't convert movswl %al,%ax to cbw, because cbw and cwde
  5535. refer to the same opcode and depends only on the assembler's
  5536. current operand-size attribute. [Kit]
  5537. }
  5538. with taicpu(p) do
  5539. case opsize of
  5540. S_WL:
  5541. if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
  5542. begin
  5543. DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
  5544. opcode := A_CWDE;
  5545. clearop(0);
  5546. clearop(1);
  5547. ops := 0;
  5548. Result := True;
  5549. end;
  5550. {$ifdef x86_64}
  5551. S_LQ:
  5552. if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
  5553. begin
  5554. DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
  5555. opcode := A_CDQE;
  5556. clearop(0);
  5557. clearop(1);
  5558. ops := 0;
  5559. Result := True;
  5560. end;
  5561. {$endif x86_64}
  5562. else
  5563. ;
  5564. end;
  5565. end;
  5566. function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  5567. begin
  5568. Result:=false;
  5569. { change "cmp $0, %reg" to "test %reg, %reg" }
  5570. if MatchOpType(taicpu(p),top_const,top_reg) and
  5571. (taicpu(p).oper[0]^.val = 0) then
  5572. begin
  5573. taicpu(p).opcode := A_TEST;
  5574. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  5575. Result:=true;
  5576. end;
  5577. end;
function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
  var
    IsTestConstX : Boolean;
    hp1,hp2 : tai;
  begin
    Result:=false;
    { removes the line marked with (x) from the sequence
      and/or/xor/add/sub/... $x, %y
      test/or %y, %y | test $-1, %y    (x)
      j(n)z _Label
      as the first instruction already adjusts the ZF
      %y operand may also be a reference }
    { "test $-1,%y" is an alternative spelling of testing %y against itself }
    IsTestConstX:=(taicpu(p).opcode=A_TEST) and
      MatchOperand(taicpu(p).oper[0]^,-1);
    if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
       GetLastInstruction(p, hp1) and
       (tai(hp1).typ = ait_instruction) and
       GetNextInstruction(p,hp2) and
       { the flags must actually be consumed by a conditional instruction }
       MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
      case taicpu(hp1).opcode Of
        A_ADD, A_SUB, A_OR, A_XOR, A_AND:
          begin
            { the preceding instruction must have written the operand the
              TEST/OR reads }
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
               ((taicpu(hp1).opcode <> A_ADD) and
                (taicpu(hp1).opcode <> A_SUB))) then
              begin
                { the TEST/OR is redundant: unlink and free p, then
                  continue at its successor }
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end;
        A_SHL, A_SAL, A_SHR, A_SAR:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
              { therefore, it's only safe to do this optimization for }
              { shifts by a (nonzero) constant }
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (taicpu(hp1).oper[0]^.val <> 0) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end;
        A_DEC, A_INC, A_NEG:
          begin
            { these single-operand instructions modify oper[0] }
            if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                case taicpu(hp1).opcode of
                  A_DEC, A_INC:
                    { replace inc/dec with add/sub 1, because inc/dec doesn't set the carry flag }
                    begin
                      case taicpu(hp1).opcode Of
                        A_DEC: taicpu(hp1).opcode := A_SUB;
                        A_INC: taicpu(hp1).opcode := A_ADD;
                        else
                          ;
                      end;
                      { oper[0] becomes the destination, constant 1 the source }
                      taicpu(hp1).loadoper(1,taicpu(hp1).oper[0]^);
                      taicpu(hp1).loadConst(0,1);
                      taicpu(hp1).ops:=2;
                    end;
                  else
                    ;
                end;
                { drop the redundant TEST/OR }
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end
        else
          { change "test $-1,%reg" into "test %reg,%reg" }
          if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
            taicpu(p).loadoper(0,taicpu(p).oper[1]^);
      end { case }
    { change "test $-1,%reg" into "test %reg,%reg" }
    else if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
      taicpu(p).loadoper(0,taicpu(p).oper[1]^);
  end;
function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
  var
    hp1 : tai;
{$ifndef x86_64}
    hp2 : taicpu;
{$endif x86_64}
  begin
    Result:=false;
{$ifndef x86_64}
    { don't do this on modern CPUs, this really hurts them due to
      broken call/ret pairing }
    if (current_settings.optimizecputype < cpu_Pentium2) and
       not(cs_create_pic in current_settings.moduleswitches) and
       GetNextInstruction(p, hp1) and
       MatchInstruction(hp1,A_JMP,[S_NO]) and
       MatchOpType(taicpu(hp1),top_ref) and
       (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
      begin
        { turn "call f; jmp g" into "push g; jmp f": the RET inside f
          then transfers control directly to g }
        hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
        InsertLLItem(p.previous, p, hp2);
        taicpu(p).opcode := A_JMP;
        taicpu(p).is_jmp := true;
        asml.remove(hp1);
        hp1.free;
        Result:=true;
      end
    else
{$endif x86_64}
    { replace
        call procname
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
      else if the subroutine is marked as no return, remove the ret
    }
    if ((cs_opt_level4 in current_settings.optimizerswitches) or
        (po_noreturn in current_procinfo.procdef.procoptions)) and
       GetNextInstruction(p, hp1) and
       MatchInstruction(hp1,A_RET,[S_NO]) and
       (taicpu(hp1).ops=0) then
      begin
        if (cs_opt_level4 in current_settings.optimizerswitches) and
           { we might destroy stack alignment here if we do not do a call }
           (target_info.stackalign<=sizeof(SizeUInt)) then
          begin
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
          end
        else
          { noreturn case: keep the CALL, only the unreachable RET is
            removed below }
          DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
        asml.remove(hp1);
        hp1.free;
        Result:=true;
      end;
  end;
  5730. {$ifdef x86_64}
  5731. function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
  5732. var
  5733. PreMessage: string;
  5734. begin
  5735. Result := False;
  5736. { Code size reduction by J. Gareth "Kit" Moreton }
  5737. { Convert MOVZBQ and MOVZWQ to MOVZBL and MOVZWL respectively if it removes the REX prefix }
  5738. if (taicpu(p).opsize in [S_BQ, S_WQ]) and
  5739. (getsupreg(taicpu(p).oper[1]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP])
  5740. then
  5741. begin
  5742. { Has 64-bit register name and opcode suffix }
  5743. PreMessage := 'movz' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' -> movz';
  5744. { The actual optimization }
  5745. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5746. if taicpu(p).opsize = S_BQ then
  5747. taicpu(p).changeopsize(S_BL)
  5748. else
  5749. taicpu(p).changeopsize(S_WL);
  5750. DebugMsg(SPeepholeOptimization + PreMessage +
  5751. debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (removes REX prefix)', p);
  5752. end;
  5753. end;
  5754. function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  5755. var
  5756. PreMessage, RegName: string;
  5757. begin
  5758. { Code size reduction by J. Gareth "Kit" Moreton }
  5759. { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
  5760. as this removes the REX prefix }
  5761. Result := False;
  5762. if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
  5763. Exit;
  5764. if taicpu(p).oper[0]^.typ <> top_reg then
  5765. { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
  5766. InternalError(2018011500);
  5767. case taicpu(p).opsize of
  5768. S_Q:
  5769. begin
  5770. if (getsupreg(taicpu(p).oper[0]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP]) then
  5771. begin
  5772. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
  5773. PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
  5774. { The actual optimization }
  5775. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  5776. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5777. taicpu(p).changeopsize(S_L);
  5778. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
  5779. DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (removes REX prefix)', p);
  5780. end;
  5781. end;
  5782. else
  5783. ;
  5784. end;
  5785. end;
  5786. {$endif}
  5787. class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  5788. var
  5789. OperIdx: Integer;
  5790. begin
  5791. for OperIdx := 0 to p.ops - 1 do
  5792. if p.oper[OperIdx]^.typ = top_ref then
  5793. optimize_ref(p.oper[OperIdx]^.ref^, False);
  5794. end;
  5795. end.