aoptx86.pas 256 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006
  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe
  3. This unit contains the peephole optimizer.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit aoptx86;
  18. {$i fpcdefs.inc}
  19. {$define DEBUG_AOPTCPU}
  20. interface
  21. uses
  22. globtype,
  23. cpubase,
  24. aasmtai,aasmcpu,
  25. cgbase,cgutils,
  26. aopt,aoptobj;
type
  { Optimisations that are expensive to check; the pre-optimisation pass
    sets a flag here when a scan shows a given check is worth performing. }
  TOptsToCheck = (
    aoc_MovAnd2Mov_3
  );

  { x86/x86-64 specific peephole optimizer. }
  TX86AsmOptimizer = class(TAsmOptimizer)
    { some optimizations are very expensive to check, so the
      pre opt pass can be used to set some flags, depending on the found
      instructions if it is worth to check a certain optimization }
    OptsToCheck : set of TOptsToCheck;
    function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
    function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
    function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
    function RegInInstruction(Reg: TRegister; p1: tai): Boolean; override;
    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
    {
      In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
      the use of a register by allocs/dealloc, so it can ignore calls.
      In the following example, GetNextInstructionUsingReg will return the second movq,
      GetNextInstructionUsingRegTrackingUse won't.
        movq %rdi,%rax
        # Register rdi released
        # Register rdi allocated
        movq %rax,%rdi
      While in this example:
        movq %rdi,%rax
        call proc
        movq %rdi,%rax
      GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
      won't.
    }
    function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
  protected
    class function IsMOVZXAcceptable: Boolean; static; inline;
    { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
    function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
    { checks whether reading the value in reg1 depends on the value of reg2. This
      is very similar to SuperRegisterEquals, except it takes into account that
      R_SUBH and R_SUBL are independent (e.g. reading from AL does not
      depend on the value in AH). }
    function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
    { Replaces all references to AOldReg in a memory reference to ANewReg }
    class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an operand to ANewReg }
    class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an instruction to ANewReg,
      except where the register is being written }
    function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
    { Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
      or writes to a global symbol }
    class function IsRefSafe(const ref: PReference): Boolean; static; inline;
    { Returns true if the given MOV instruction can be safely converted to CMOV }
    class function CanBeCMOV(p : tai) : boolean; static;
    function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    procedure DebugMsg(const s : string; p : tai); inline;
    class function IsExitCode(p : tai) : boolean; static;
    class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
    procedure RemoveLastDeallocForFuncRes(p : tai);
    function DoSubAddOpt(var p : tai) : Boolean;
    { Pre-peephole pass entries, dispatched by opcode }
    function PrePeepholeOptSxx(var p : tai) : boolean;
    function PrePeepholeOptIMUL(var p : tai) : boolean;
    { Pass 1 entries, dispatched by opcode }
    function OptPass1AND(var p : tai) : boolean;
    function OptPass1_V_MOVAP(var p : tai) : boolean;
    function OptPass1VOP(var p : tai) : boolean;
    function OptPass1MOV(var p : tai) : boolean;
    function OptPass1Movx(var p : tai) : boolean;
    function OptPass1MOVXX(var p : tai) : boolean;
    function OptPass1OP(var p : tai) : boolean;
    function OptPass1LEA(var p : tai) : boolean;
    function OptPass1Sub(var p : tai) : boolean;
    function OptPass1SHLSAL(var p : tai) : boolean;
    function OptPass1SETcc(var p : tai) : boolean;
    function OptPass1FSTP(var p : tai) : boolean;
    function OptPass1FLD(var p : tai) : boolean;
    function OptPass1Cmp(var p : tai) : boolean;
    { Pass 2 entries, dispatched by opcode }
    function OptPass2MOV(var p : tai) : boolean;
    function OptPass2Imul(var p : tai) : boolean;
    function OptPass2Jmp(var p : tai) : boolean;
    function OptPass2Jcc(var p : tai) : boolean;
    function OptPass2Lea(var p: tai): Boolean;
    function OptPass2SUB(var p: tai): Boolean;
    { Post-peephole pass entries, dispatched by opcode }
    function PostPeepholeOptMov(var p : tai) : Boolean;
{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
    function PostPeepholeOptMovzx(var p : tai) : Boolean;
    function PostPeepholeOptXor(var p : tai) : Boolean;
{$endif}
    function PostPeepholeOptMOVSX(var p : tai) : boolean;
    function PostPeepholeOptCmp(var p : tai) : Boolean;
    function PostPeepholeOptTestOr(var p : tai) : Boolean;
    function PostPeepholeOptCall(var p : tai) : Boolean;
    function PostPeepholeOptLea(var p : tai) : Boolean;
    procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
    { Processor-dependent reference optimisation }
    class procedure OptimizeRefs(var p: taicpu); static;
  end;
{ Returns True if instr is an assembler instruction with the given opcode
  (or one of the given opcodes) and an operand size contained in opsize;
  an empty opsize set matches any operand size. }
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;
{ Returns True if oper matches the given register / constant / other operand. }
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{ Returns True if the two references are field-for-field identical and
  neither is marked volatile. }
function RefsEqual(const r1, r2: treference): boolean;
function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
{ returns true, if ref is a reference using only the registers passed as base and index
  and having an offset }
function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  134. implementation
  135. uses
  136. cutils,verbose,
  137. systems,
  138. globals,
  139. cpuinfo,
  140. procinfo,
  141. aasmbase,
  142. aoptbase,aoptutils,
  143. symconst,symsym,
  144. cgx86,
  145. itcpugas;
  146. {$ifdef DEBUG_AOPTCPU}
  147. const
  148. SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  149. {$else DEBUG_AOPTCPU}
  150. { Empty strings help the optimizer to remove string concatenations that won't
  151. ever appear to the user on release builds. [Kit] }
  152. const
  153. SPeepholeOptimization = '';
  154. {$endif DEBUG_AOPTCPU}
  155. function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  156. begin
  157. result :=
  158. (instr.typ = ait_instruction) and
  159. (taicpu(instr).opcode = op) and
  160. ((opsize = []) or (taicpu(instr).opsize in opsize));
  161. end;
  162. function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  163. begin
  164. result :=
  165. (instr.typ = ait_instruction) and
  166. ((taicpu(instr).opcode = op1) or
  167. (taicpu(instr).opcode = op2)
  168. ) and
  169. ((opsize = []) or (taicpu(instr).opsize in opsize));
  170. end;
  { Returns true if instr is an instruction whose opcode is one of op1..op3
    and whose size is in opsize (an empty opsize set matches any size). }
  function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
    begin
      if instr.typ<>ait_instruction then
        exit(false);
      if not((opsize=[]) or (taicpu(instr).opsize in opsize)) then
        exit(false);
      result:=(taicpu(instr).opcode=op1) or
        (taicpu(instr).opcode=op2) or
        (taicpu(instr).opcode=op3);
    end;
  { Returns true if instr is an instruction whose opcode appears in ops and
    whose size is in opsize (an empty opsize set matches any size). }
  function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
    const opsize : topsizes) : boolean;
    var
      op : TAsmOp;
    begin
      result:=false;
      { the typ and opsize tests do not depend on the loop variable, so
        evaluate them once up front instead of on every iteration }
      if (instr.typ<>ait_instruction) or
        not((opsize=[]) or (taicpu(instr).opsize in opsize)) then
        exit;
      for op in ops do
        if taicpu(instr).opcode=op then
          begin
            result:=true;
            exit;
          end;
    end;
  { Returns true if oper is a register operand holding exactly reg. }
  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
    begin
      case oper.typ of
        top_reg:
          result:=(oper.reg=reg);
        else
          result:=false;
      end;
    end;
  { Returns true if oper is a constant operand with value a. }
  function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
    begin
      case oper.typ of
        top_const:
          result:=(oper.val=a);
        else
          result:=false;
      end;
    end;
  { Returns true if the two operands are of the same kind and denote the
    same constant, register or memory reference. }
  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
    begin
      if oper1.typ<>oper2.typ then
        exit(false);
      case oper1.typ of
        top_const:
          Result:=(oper1.val=oper2.val);
        top_reg:
          Result:=(oper1.reg=oper2.reg);
        top_ref:
          Result:=RefsEqual(oper1.ref^, oper2.ref^);
        else
          internalerror(2013102801);
      end;
    end;
  { Returns true if r1 and r2 address the same memory location; references
    marked volatile are never treated as equal. }
  function RefsEqual(const r1, r2: treference): boolean;
    begin
      RefsEqual :=
        (r1.volatility=[]) and
        (r2.volatility=[]) and
        (r1.offset = r2.offset) and
        (r1.base = r2.base) and
        (r1.index = r2.index) and
        (r1.scalefactor = r2.scalefactor) and
        (r1.segment = r2.segment) and
        (r1.symbol = r2.symbol) and
        (r1.relsymbol = r2.relsymbol) and
        (r1.refaddr = r2.refaddr);
    end;
  function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
    begin
      { only plain, non-volatile "(base,index)" references with no offset,
        symbol, segment override or scaling qualify; NR_INVALID in base or
        index acts as a wildcard for that slot }
      Result:=false;
      if (ref.offset<>0) or (ref.volatility<>[]) then
        exit;
      if not(ref.scalefactor in [0,1]) then
        exit;
      if (ref.segment<>NR_NO) or assigned(ref.symbol) or assigned(ref.relsymbol) then
        exit;
      if (base<>NR_INVALID) and (ref.base<>base) then
        exit;
      if (index<>NR_INVALID) and (ref.index<>index) then
        exit;
      Result:=true;
    end;
  function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
    begin
      { as MatchReference, but an arbitrary offset is permitted;
        NR_INVALID in base or index acts as a wildcard for that slot }
      Result:=false;
      if ref.volatility<>[] then
        exit;
      if not(ref.scalefactor in [0,1]) then
        exit;
      if (ref.segment<>NR_NO) or assigned(ref.symbol) or assigned(ref.relsymbol) then
        exit;
      if (base<>NR_INVALID) and (ref.base<>base) then
        exit;
      if (index<>NR_INVALID) and (ref.index<>index) then
        exit;
      Result:=true;
    end;
  { Conservative check whether p may observe the CPU flags: instructions
    are tested against their change information, and labels count as
    readers because control may arrive there from anywhere. }
  function InstrReadsFlags(p: tai): boolean;
    begin
      case p.typ of
        ait_instruction:
          InstrReadsFlags :=
            InsProp[taicpu(p).opcode].Ch*
            [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
             Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
             Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[];
        ait_label:
          InstrReadsFlags := true;
        else
          InstrReadsFlags := false;
      end;
    end;
  { Advances Next from Current until an instruction mentioning reg (or a
    call/jump) is found; below -O3 it stops at the very next tai. Returns
    false when the end of the list is reached. }
  function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
    begin
      Next:=Current;
      repeat
        Result:=GetNextInstruction(Next,Next);
        if not Result then
          break;
        { deeper searches are only performed at -O3 }
        if not(cs_opt_level3 in current_settings.optimizerswitches) then
          break;
        if Next.typ<>ait_instruction then
          break;
        if RegInInstruction(reg,Next) or
          is_calljmp(taicpu(Next).opcode) then
          break;
      until false;
    end;
  { Searches forward from Current for an instruction that uses reg,
    returning it in Next with Result=true. The search aborts (Result=false)
    at non-call control flow, at register-allocator tracking entries for
    reg's super-register, at labels that cannot be skipped, or at the end
    of the list. Below -O3 it simply returns the next instruction. }
  function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    begin
      { without -O3, do not look beyond the immediately following instruction }
      if not(cs_opt_level3 in current_settings.optimizerswitches) then
        begin
          Result:=GetNextInstruction(Current,Next);
          exit;
        end;
      Next:=tai(Current.Next);
      Result:=false;
      while assigned(Next) do
        begin
          { stop conditions: jumps (calls are tolerated), regalloc entries
            for the same super-register, and labels that may be jump targets }
          if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
            ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
            ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
            exit
          else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
            begin
              Result:=true;
              exit;
            end;
          Next:=tai(Next.Next);
        end;
    end;
  { Returns true if instruction hp reads register reg; thin wrapper around
    RegReadByInstruction. }
  function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
    begin
      Result:=RegReadByInstruction(reg,hp);
    end;
  { Returns true if instruction hp reads register reg. Handles the implicit
    register operands of CALL/IMUL/MUL/DIV/IDIV explicitly, treats any
    register inside a memory reference as read, special-cases the SSE MOVSD
    ambiguity, and otherwise consults the instruction change information
    (insprop), including per-flag-bit analysis for NR_DEFAULTFLAGS. }
  function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
    var
      p: taicpu;
      opcount: longint;
    begin
      RegReadByInstruction := false;
      if hp.typ <> ait_instruction then
        exit;
      p := taicpu(hp);
      case p.opcode of
        A_CALL:
          { a call is conservatively assumed to read every register }
          regreadbyinstruction := true;
        A_IMUL:
          case p.ops of
            1:
              { one-operand imul implicitly reads (E)AX, except AH for byte size }
              regReadByInstruction := RegInOp(reg,p.oper[0]^) or
                (
                  ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                  ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
                );
            2,3:
              regReadByInstruction :=
                reginop(reg,p.oper[0]^) or
                reginop(reg,p.oper[1]^);
            else
              InternalError(2019112801);
          end;
        A_MUL:
          begin
            { mul implicitly reads (E)AX, except AH for byte size }
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
              (
                ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
              );
          end;
        A_IDIV,A_DIV:
          begin
            { division reads the (E)AX accumulator and, for non-byte sizes,
              (E)DX as the upper half of the dividend }
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
              (
                (getregtype(reg)=R_INTREGISTER) and
                (
                  (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
                )
              );
          end;
        else
          begin
            { lea only computes an address: a segment register operand is
              not actually read }
            if (p.opcode=A_LEA) and is_segment_reg(reg) then
              begin
                RegReadByInstruction := false;
                exit;
              end;
            { any register appearing inside a memory reference is read }
            for opcount := 0 to p.ops-1 do
              if (p.oper[opCount]^.typ = top_ref) and
                RegInRef(reg,p.oper[opcount]^.ref^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
            { special handling for SSE MOVSD }
            if (p.opcode=A_MOVSD) and (p.ops>0) then
              begin
                if p.ops<>2 then
                  internalerror(2017042702);
                { reg-to-reg movsd merges into the destination, so the
                  destination register counts as read as well }
                regReadByInstruction := reginop(reg,p.oper[0]^) or
                  (
                    (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
                  );
                exit;
              end;
            with insprop[p.opcode] do
              begin
                if getregtype(reg)=R_INTREGISTER then
                  begin
                    { implicit reads of specific super-registers as recorded
                      in the instruction change information }
                    case getsupreg(reg) of
                      RS_EAX:
                        if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ECX:
                        if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDX:
                        if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBX:
                        if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESP:
                        if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBP:
                        if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESI:
                        if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDI:
                        if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                    end;
                  end;
                if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
                  begin
                    { for a conditional instruction, only the flag bits the
                      condition actually tests are read }
                    if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                      begin
                        case p.condition of
                          C_A,C_NBE,       { CF=0 and ZF=0 }
                          C_BE,C_NA:       { CF=1 or ZF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                          C_AE,C_NB,C_NC,  { CF=0 }
                          C_B,C_NAE,C_C:   { CF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                          C_NE,C_NZ,       { ZF=0 }
                          C_E,C_Z:         { ZF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                          C_G,C_NLE,       { ZF=0 and SF=OF }
                          C_LE,C_NG:       { ZF=1 or SF<>OF }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_GE,C_NL,       { SF=OF }
                          C_L,C_NGE:       { SF<>OF }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_NO,            { OF=0 }
                          C_O:             { OF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                          C_NP,C_PO,       { PF=0 }
                          C_P,C_PE:        { PF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                          C_NS,            { SF=0 }
                          C_S:             { SF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                          else
                            internalerror(2017042701);
                        end;
                        if RegReadByInstruction then
                          exit;
                      end;
                    { otherwise decide per flag sub-register from the
                      instruction change information }
                    case getsubreg(reg) of
                      R_SUBW,R_SUBD,R_SUBQ:
                        RegReadByInstruction :=
                          [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                           Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                           Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                      R_SUBFLAGCARRY:
                        RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGPARITY:
                        RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGAUXILIARY:
                        RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGZERO:
                        RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGSIGN:
                        RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGOVERFLOW:
                        RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGINTERRUPT:
                        RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGDIRECTION:
                        RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      else
                        internalerror(2017042601);
                    end;
                    exit;
                  end;
                { some instructions do not really read an operand when both
                  operands are the same register (e.g. xor reg,reg) }
                if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                  (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                  (p.oper[0]^.reg=p.oper[1]^.reg) then
                  exit;
                if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
              end;
          end;
      end;
    end;
  { Returns true if register reg is referenced (read or written, explicitly
    or implicitly) by instruction p1, based on the instruction change
    information and falling back to the inherited operand scan. }
  function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
    begin
      result:=false;
      if p1.typ<>ait_instruction then
        exit;
      { Ch_All marks instructions that may touch anything }
      if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
        exit(true);
      if (getregtype(reg)=R_INTREGISTER) and
        { change information for xmm movsd are not correct }
        ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
        begin
          case getsupreg(reg) of
            { RS_EAX = RS_RAX on x86-64 }
            RS_EAX:
              result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ECX:
              result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EDX:
              result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EBX:
              result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ESP:
              result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EBP:
              result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ESI:
              { Ch_RMemEDI covers string instructions reading through ESI }
              result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EDI:
              { Ch_WMemEDI covers string instructions writing through EDI }
              result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
            exit(true);
          { check the individual flag bit named by the sub-register }
          case getsubreg(reg) of
            R_SUBFLAGCARRY:
              Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGPARITY:
              Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGAUXILIARY:
              Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGZERO:
              Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGSIGN:
              Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGOVERFLOW:
              Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGINTERRUPT:
              Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGDIRECTION:
              Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
        exit(true);
      { fall back to scanning the explicit operands }
      Result:=inherited RegInInstruction(Reg, p1);
    end;
  { Returns true if instruction p1 may modify register reg, using the
    instruction change information plus explicit handling for CALL, the
    two flavours of (V)MOVSS/(V)MOVSD and IMUL. Flag sub-registers are
    checked bit by bit. }
  function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
    begin
      Result := False;
      if p1.typ <> ait_instruction then
        exit;
      with insprop[taicpu(p1).opcode] do
        if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
          begin
            case getsubreg(reg) of
              R_SUBW,R_SUBD,R_SUBQ:
                { whole flags register: any write to any flag bit counts }
                Result :=
                  [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                   Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                   Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGCARRY:
                Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              else
                internalerror(2017042602);
            end;
            exit;
          end;
      case taicpu(p1).opcode of
        A_CALL:
          { We could potentially set Result to False if the register in
            question is non-volatile for the subroutine's calling convention,
            but this would require detecting the calling convention in use and
            also assuming that the routine doesn't contain malformed assembly
            language, for example... so it could only be done under -O4 as it
            would be considered a side-effect. [Kit] }
          Result := True;
        A_MOVSD:
          { special handling for SSE MOVSD: with zero operands it is the
            string instruction, whose change info applies instead }
          if (taicpu(p1).ops>0) then
            begin
              if taicpu(p1).ops<>2 then
                internalerror(2017042703);
              Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
            end;
        { VMOVSS and VMOVSD has two and three operand flavours, this cannot modelled by x86ins.dat
          so fix it here (FK)
        }
        A_VMOVSS,
        A_VMOVSD:
          begin
            Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
            exit;
          end;
        A_IMUL:
          { the last operand is always the destination }
          Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
        else
          ;
      end;
      if Result then
        exit;
      with insprop[taicpu(p1).opcode] do
        begin
          if getregtype(reg)=R_INTREGISTER then
            begin
              { implicit writes to specific super-registers }
              case getsupreg(reg) of
                RS_EAX:
                  if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ECX:
                  if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDX:
                  if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBX:
                  if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESP:
                  if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBP:
                  if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESI:
                  if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDI:
                  if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
              end;
            end;
          { writes through the explicit operands }
          if ([CH_RWOP1,CH_WOP1,CH_MOP1]*Ch<>[]) and reginop(reg,taicpu(p1).oper[0]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and reginop(reg,taicpu(p1).oper[1]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and reginop(reg,taicpu(p1).oper[2]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and reginop(reg,taicpu(p1).oper[3]^) then
            begin
              Result := true;
              exit
            end;
        end;
    end;
  738. {$ifdef DEBUG_AOPTCPU}
  { Emits a debug message as an assembler comment placed directly before p. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
    var
      note : tai_comment;
    begin
      note:=tai_comment.Create(strpnew(s));
      asml.insertbefore(note, p);
    end;
  { Debug output helper - renders an integer for peephole debug messages. }
  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := tostr(i);
    end;
  { Debug output helper - renders a register in AT&T style ("%reg"). }
  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '%' + std_regname(r);
    end;
  { Debug output function - creates a string representation of an operator }
  function debug_operstr(oper: TOper): string;
    var
      hasbase, hasindex: boolean;
    begin
      case oper.typ of
        top_const:
          Result := '$' + debug_tostr(oper.val);
        top_reg:
          Result := debug_regname(oper.reg);
        top_ref:
          begin
            hasbase := (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO);
            hasindex := (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO);
            { AT&T-style "offset(base,index,scale)" }
            if oper.ref^.offset <> 0 then
              Result := debug_tostr(oper.ref^.offset) + '('
            else
              Result := '(';
            if hasbase then
              Result := Result + debug_regname(oper.ref^.base);
            if hasindex then
              begin
                if hasbase then
                  Result := Result + ',';
                Result := Result + debug_regname(oper.ref^.index);
              end;
            if (oper.ref^.scalefactor > 1) then
              Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
            else
              Result := Result + ')';
          end;
        else
          Result := '[UNKNOWN]';
      end;
    end;
  { Debug output helper - renders an opcode mnemonic. }
  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := std_op2str[opcode];
    end;
  { Debug output helper - renders an operand-size suffix in GAS notation. }
  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := gas_opsize2str[opsize];
    end;
  791. {$else DEBUG_AOPTCPU}
  { Release-build stub: debug messages are compiled out entirely. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
    begin
    end;
  { Release-build stub: returns an empty string so message concatenations
    can be optimized away. }
  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := '';
    end;
  { Release-build stub: returns an empty string so message concatenations
    can be optimized away. }
  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '';
    end;
  { Release-build stub: returns an empty string so message concatenations
    can be optimized away. }
  function debug_operstr(oper: TOper): string; inline;
    begin
      Result := '';
    end;
  { Release-build stub: returns an empty string so message concatenations
    can be optimized away. }
  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := '';
    end;
  { Release-build stub: returns an empty string so message concatenations
    can be optimized away. }
  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := '';
    end;
  815. {$endif DEBUG_AOPTCPU}
  { Returns true when using MOVZX is acceptable on the current target,
    i.e. the CPU supports it and it is not considered a pessimization. }
  class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
    begin
{$ifdef x86_64}
      { Always fine on x86-64 }
      Result := True;
{$else x86_64}
      Result :=
{$ifdef i8086}
        { MOVZX only exists from the 386 onwards }
        (current_settings.cputype >= cpu_386) and
{$endif i8086}
        (
          { Always accept if optimising for size }
          (cs_opt_size in current_settings.optimizerswitches) or
          { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
          (current_settings.optimizecputype >= cpu_Pentium2)
        );
{$endif x86_64}
    end;
  { Returns true if a write to reg1 completely replaces the value of reg2,
    i.e. no bits of reg2's previous value survive the write. }
  function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
    begin
      if not SuperRegistersEqual(reg1,reg2) then
        exit(false);
      if getregtype(reg1)<>R_INTREGISTER then
        exit(true); {because SuperRegisterEqual is true}
      case getsubreg(reg1) of
        { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
          higher, it preserves the high bits, so the new value depends on
          reg2's previous value. In other words, it is equivalent to doing:
          reg2 := (reg2 and $ffffff00) or byte(reg1); }
        R_SUBL:
          exit(getsubreg(reg2)=R_SUBL);
        { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
          higher, it actually does a:
          reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
        R_SUBH:
          exit(getsubreg(reg2)=R_SUBH);
        { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
          bits of reg2:
          reg2 := (reg2 and $ffff0000) or word(reg1); }
        R_SUBW:
          exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
        { a write to R_SUBD always overwrites every other subregister,
          because it clears the high 32 bits of R_SUBQ on x86_64 }
        R_SUBD,
        R_SUBQ:
          exit(true);
        else
          internalerror(2017042801);
      end;
    end;
  { Returns true if reading reg1 can observe bits that belong to reg2,
    i.e. the two registers overlap. AL/AH are the only disjoint pair
    within a shared super-register. }
  function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
    begin
      if not SuperRegistersEqual(reg1,reg2) then
        exit(false);
      if getregtype(reg1)<>R_INTREGISTER then
        { non-integer registers have no subregister structure, so equality
          of the super-registers implies overlap }
        exit(true);
      case getsubreg(reg1) of
        R_SUBL:
          Result:=getsubreg(reg2)<>R_SUBH;
        R_SUBH:
          Result:=getsubreg(reg2)<>R_SUBL;
        R_SUBW,
        R_SUBD,
        R_SUBQ:
          Result:=true;
        else
          internalerror(2017042802);
      end;
    end;
  { Pre-peephole pass for SHR/SAR: folds a following SHL on the same
    operand into an AND (plus the remaining shift) where profitable. }
  function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
    var
      hp1 : tai;
      l : TCGInt;
    begin
      result:=false;
      { changes the code sequence
          shr/sar const1, x
          shl     const2, x
        to
          either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
      if GetNextInstruction(p, hp1) and
        MatchInstruction(hp1,A_SHL,[]) and
        (taicpu(p).oper[0]^.typ = top_const) and
        (taicpu(hp1).oper[0]^.typ = top_const) and
        (taicpu(hp1).opsize = taicpu(p).opsize) and
        (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
        OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
        begin
          { skipped when optimising for size: the replacement is one
            instruction longer than shr+shl }
          if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
            not(cs_opt_size in current_settings.optimizerswitches) then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 > const2 }
              { reduce the shift to the difference and turn the shl into an
                and that clears the low const2 bits }
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
              taicpu(hp1).opcode := A_AND;
              l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
              { xor against the all-ones mask of the operand size inverts l }
              case taicpu(p).opsize Of
                S_B: taicpu(hp1).loadConst(0,l Xor $ff);
                S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
                S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
                S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050703)
              end;
            end
          else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
            not(cs_opt_size in current_settings.optimizerswitches) then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 < const2 }
              { keep a shl by the difference and turn the first shift into
                an and that clears the low const1 bits }
              taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
              taicpu(p).opcode := A_AND;
              l := (1 shl (taicpu(p).oper[0]^.val))-1;
              case taicpu(p).opsize Of
                S_B: taicpu(p).loadConst(0,l Xor $ff);
                S_W: taicpu(p).loadConst(0,l Xor $ffff);
                S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
                S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050702)
              end;
            end
          else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 = const2 }
              { the shifts cancel except for clearing the low const1 bits:
                a single and suffices, the shl is removed }
              taicpu(p).opcode := A_AND;
              l := (1 shl (taicpu(p).oper[0]^.val))-1;
              case taicpu(p).opsize Of
                S_B: taicpu(p).loadConst(0,l Xor $ff);
                S_W: taicpu(p).loadConst(0,l Xor $ffff);
                S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
                S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050701)
              end;
              asml.remove(hp1);
              hp1.free;
            end;
        end;
    end;
  { Pre-peephole pass for IMUL with a constant operand: removes "imul $1",
    rewrites "imul $1,reg1,reg2" as a mov, and turns suitable constants
    into an LEA(+SHL) pair. }
  function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
    var
      opsize : topsize;
      hp1 : tai;
      tmpref : treference;
      ShiftValue : Cardinal;
      BaseValue : TCGInt;
    begin
      result:=false;
      opsize:=taicpu(p).opsize;
      { changes certain "imul const, %reg"'s to lea sequences }
      if (MatchOpType(taicpu(p),top_const,top_reg) or
        MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
        (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
        if (taicpu(p).oper[0]^.val = 1) then
          if (taicpu(p).ops = 2) then
            { remove "imul $1, reg" }
            begin
              hp1 := tai(p.Next);
              DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
              RemoveCurrentP(p);
              result:=true;
            end
          else
            { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
            begin
              hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
              InsertLLItem(p.previous, p.next, hp1);
              DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
              p.free;
              p := hp1;
            end
        else if ((taicpu(p).ops <= 2) or
          (taicpu(p).oper[2]^.typ = Top_Reg)) and
          not(cs_opt_size in current_settings.optimizerswitches) and
          { do not apply when a following conditional jump tests the
            overflow flag imul would have set }
          (not(GetNextInstruction(p, hp1)) or
            not((tai(hp1).typ = ait_instruction) and
                ((taicpu(hp1).opcode=A_Jcc) and
                 (taicpu(hp1).condition in [C_O,C_NO])))) then
          begin
            {
              imul X, reg1, reg2 to
                lea (reg1,reg1,Y), reg2
                shl ZZ,reg2
              imul XX, reg1 to
                lea (reg1,reg1,YY), reg1
                shl ZZ,reg2

              This optimziation makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
              it does not exist as a separate optimization target in FPC though.
              This optimziation can be applied as long as only two bits are set in the constant and those two bits are separated by
              at most two zeros
            }
            reference_reset(tmpref,1,[]);
            if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
              begin
                { split the constant into (2^ShiftValue)*BaseValue with
                  BaseValue in [3,5,9], i.e. lea-encodable as reg+reg*(2/4/8) }
                ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
                BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
                TmpRef.base := taicpu(p).oper[1]^.reg;
                TmpRef.index := taicpu(p).oper[1]^.reg;
                if not(BaseValue in [3,5,9]) then
                  Internalerror(2018110101);
                TmpRef.ScaleFactor := BaseValue-1;
                if (taicpu(p).ops = 2) then
                  hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
                else
                  hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
                AsmL.InsertAfter(hp1,p);
                DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
                taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
                RemoveCurrentP(p);
                if ShiftValue>0 then
                  AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
              end;
          end;
    end;
  { Returns True if instruction hp writes a completely new value into reg,
    i.e. the result does not depend on the previous contents of reg.
    For the flags register (NR_DEFAULTFLAGS sub-registers) the answer is
    derived from the instruction property table (insprop); for ordinary
    registers a list of known "pure write" instruction forms is checked.

    Fix: removed a leftover debug "writeln(getsubreg(reg))" that printed a
    raw enum ordinal to stdout right before the internalerror. }
  function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      if not assigned(hp) or
        (hp.typ <> ait_instruction) then
        begin
          Result := false;
          exit;
        end;
      p := taicpu(hp);
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        with insprop[p.opcode] do
          begin
            case getsubreg(reg) of
              { whole flags register: every individual flag must be freshly written }
              R_SUBW,R_SUBD,R_SUBQ:
                Result:=
                  RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
                  RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
                  RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
                  RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
                  RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
                  RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
              { individual flags: any write-class change flag in the instruction's
                change set counts as loading a new value }
              R_SUBFLAGCARRY:
                Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
              else
                internalerror(2017050501);
            end;
            exit;
          end;
      Result :=
        { plain moves/loads: destination must fully overwrite reg and the
          source must not depend on reg }
        (((p.opcode = A_MOV) or
          (p.opcode = A_MOVZX) or
          (p.opcode = A_MOVSX) or
          (p.opcode = A_LEA) or
          (p.opcode = A_VMOVSS) or
          (p.opcode = A_VMOVSD) or
          (p.opcode = A_VMOVAPD) or
          (p.opcode = A_VMOVAPS) or
          (p.opcode = A_VMOVQ) or
          (p.opcode = A_MOVSS) or
          (p.opcode = A_MOVSD) or
          (p.opcode = A_MOVQ) or
          (p.opcode = A_MOVAPD) or
          (p.opcode = A_MOVAPS) or
{$ifndef x86_64}
          (p.opcode = A_LDS) or
          (p.opcode = A_LES) or
{$endif not x86_64}
          (p.opcode = A_LFS) or
          (p.opcode = A_LGS) or
          (p.opcode = A_LSS)) and
         (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
         (p.oper[1]^.typ = top_reg) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
         ((p.oper[0]^.typ = top_const) or
          ((p.oper[0]^.typ = top_reg) and
           not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
          ((p.oper[0]^.typ = top_ref) and
           not RegInRef(reg,p.oper[0]^.ref^)))) or
        ((p.opcode = A_POP) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg))) or
        { 3-operand IMUL writes its third operand without reading it }
        ((p.opcode = A_IMUL) and
         (p.ops=3) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
         (((p.oper[1]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg))) or
          ((p.oper[1]^.typ=top_ref) and not(RegInRef(reg,p.oper[1]^.ref^))))) or
        { 1-operand MUL/IMUL write implicit registers (AX, DX:AX, EDX:EAX, RDX:RAX) }
        ((((p.opcode = A_IMUL) or
           (p.opcode = A_MUL)) and
          (p.ops=1)) and
         (((p.oper[0]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
          ((p.oper[0]^.typ=top_ref) and not(RegInRef(reg,p.oper[0]^.ref^)))) and
         (((p.opsize=S_B) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
          ((p.opsize=S_W) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
          ((p.opsize=S_L) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg))
{$ifdef x86_64}
          or ((p.opsize=S_Q) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg))
{$endif x86_64}
         )) or
        { sign-extension instructions overwrite the high half of the pair }
        ((p.opcode = A_CWD) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
        ((p.opcode = A_CDQ) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)) or
{$ifdef x86_64}
        ((p.opcode = A_CQO) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)) or
{$endif x86_64}
        ((p.opcode = A_CBW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
        { segment register loads }
{$ifndef x86_64}
        ((p.opcode = A_LDS) and (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LES) and (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$endif not x86_64}
        ((p.opcode = A_LFS) and (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LGS) and (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LSS) and (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$ifndef x86_64}
        ((p.opcode = A_AAM) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
{$endif not x86_64}
        ((p.opcode = A_LAHF) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
        { string loads write the accumulator }
        ((p.opcode = A_LODSB) and Reg1WriteOverwritesReg2Entirely(NR_AL,reg)) or
        ((p.opcode = A_LODSW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg)) or
        ((p.opcode = A_LODSD) and Reg1WriteOverwritesReg2Entirely(NR_EAX,reg)) or
{$ifdef x86_64}
        ((p.opcode = A_LODSQ) and Reg1WriteOverwritesReg2Entirely(NR_RAX,reg)) or
{$endif x86_64}
        ((p.opcode = A_SETcc) and (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
        (((p.opcode = A_FSTSW) or
          (p.opcode = A_FNSTSW)) and
         (p.oper[0]^.typ=top_reg) and
         Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
        { xor/sub/sbb of a register with itself zeroes it (sbb: 0 or -1,
          either way independent of the old value) }
        (((p.opcode = A_XOR) or (p.opcode = A_SUB) or (p.opcode = A_SBB)) and
         (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
         (p.oper[0]^.reg=p.oper[1]^.reg) and
         Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg));
    end;
  { Returns True if p is (the start of) a recognised function epilogue:
      - a plain RET;
      - LEAVE followed by RET;
      - "lea x(%esp),%esp" followed by RET (stack pointer restore);
      - "mov %framepointer,%esp" or "lea x(%framepointer),%esp", followed by
        "pop %framepointer" and RET (classic frame teardown).
    Note: p is a value parameter but may be advanced locally past a leading
    NOP; the caller's tai pointer is not changed. }
  class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
    var
      hp2,hp3 : tai;
    begin
      { some x86-64 issue a NOP before the real exit code }
      if MatchInstruction(p,A_NOP,[]) then
        GetNextInstruction(p,p);
      result:=assigned(p) and (p.typ=ait_instruction) and
        ((taicpu(p).opcode = A_RET) or
         { leave; ret }
         ((taicpu(p).opcode=A_LEAVE) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_RET,[S_NO])
         ) or
         { lea x(%esp),%esp; ret }
         (((taicpu(p).opcode=A_LEA) and
           MatchOpType(taicpu(p),top_ref,top_reg) and
           (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
           (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
          ) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_RET,[S_NO])
         ) or
         { mov/lea framepointer -> %esp; pop framepointer; ret }
         ((((taicpu(p).opcode=A_MOV) and
            MatchOpType(taicpu(p),top_reg,top_reg) and
            (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
            (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
           ((taicpu(p).opcode=A_LEA) and
            MatchOpType(taicpu(p),top_ref,top_reg) and
            (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
            (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
           )
          ) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
          MatchOpType(taicpu(hp2),top_reg) and
          (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
          GetNextInstruction(hp2,hp3) and
          MatchInstruction(hp3,A_RET,[S_NO])
         )
        );
    end;
  { Returns True if hp1 is an arithmetic/logic instruction whose destination
    is reg and whose source (if any) does not read reg, so that a preceding
    move into reg can potentially be folded into it. }
  class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
    begin
      Result := False;
      case hp1.opcode of
        A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
          { two-operand forms: destination operand must be reg, source must
            be a constant or a register other than reg }
          Result :=
            (hp1.oper[1]^.typ = top_reg) and
            (hp1.oper[1]^.reg = reg) and
            ((hp1.oper[0]^.typ = top_const) or
             ((hp1.oper[0]^.typ = top_reg) and
              (hp1.oper[0]^.reg <> reg)));
        A_INC,A_DEC,A_NEG,A_NOT:
          { single-operand forms: the sole operand must be reg itself }
          Result :=
            (hp1.oper[0]^.typ = top_reg) and
            (hp1.oper[0]^.reg = reg);
        else
          ;
      end;
    end;
  { Removes the last register-deallocation marker for the function-result
    register(s) preceding p, so the optimizer keeps treating the result
    register as live.  Which super-registers are affected depends on the
    current procedure's return type (EAX, plus EDX for 8-byte ordinals). }
  procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);
    { Walks backwards from p and deletes the first ra_dealloc regalloc
      entry found for the given integer super-register.  The search stops
      once an instruction that uses the register is encountered. }
    procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
      var
        hp2: tai;
      begin
        hp2 := p;
        repeat
          hp2 := tai(hp2.previous);
          if assigned(hp2) and
            (hp2.typ = ait_regalloc) and
            (tai_regalloc(hp2).ratype=ra_dealloc) and
            (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
            (getsupreg(tai_regalloc(hp2).reg) = supreg) then
            begin
              asml.remove(hp2);
              hp2.free;
              break;
            end;
        until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
      end;
    begin
      case current_procinfo.procdef.returndef.typ of
        { all of these return their result (or a pointer to it) in EAX }
        arraydef,recorddef,pointerdef,
        stringdef,enumdef,procdef,objectdef,errordef,
        filedef,setdef,procvardef,
        classrefdef,forwarddef:
          DoRemoveLastDeallocForFuncRes(RS_EAX);
        orddef:
          if current_procinfo.procdef.returndef.size <> 0 then
            begin
              DoRemoveLastDeallocForFuncRes(RS_EAX);
              { for int64/qword }
              if current_procinfo.procdef.returndef.size = 8 then
                DoRemoveLastDeallocForFuncRes(RS_EDX);
            end;
        else
          ;
      end;
    end;
  { Pass-1 peephole optimizations for (V)MOVAPS/(V)MOVAPD:
      - removes no-op "mova* reg,reg";
      - merges mova*/mova* and mova*/movs* chains through a dead
        intermediate register;
      - folds mova* around FMA-family instructions and around scalar
        arithmetic (adds*/subs*/muls*/divs*).
    Returns True if anything was changed; p may be advanced. }
  function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
    var
      hp1,hp2 : tai;
    begin
      result:=false;
      if MatchOpType(taicpu(p),top_reg,top_reg) then
        begin
          { vmova* reg1,reg1
            =>
            <nop> }
          if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
            begin
              GetNextInstruction(p,hp1);
              RemoveCurrentP(p);
              p:=hp1;
              result:=true;
              exit;
            end
          else if GetNextInstruction(p,hp1) then
            begin
              if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
                MatchOpType(taicpu(hp1),top_reg,top_reg) and
                MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                begin
                  { vmova* reg1,reg2
                    vmova* reg2,reg3
                    dealloc reg2
                    =>
                    vmova* reg1,reg3 }
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                  { only valid if the intermediate reg2 dies after hp1 }
                  if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                    begin
                      DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                      taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                      asml.Remove(hp1);
                      hp1.Free;
                      result:=true;
                      exit;
                    end
                  { special case:
                    vmova* reg1,reg2
                    vmova* reg2,reg1
                    =>
                    vmova* reg1,reg2 }
                  else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
                    begin
                      DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                      asml.Remove(hp1);
                      hp1.Free;
                      result:=true;
                      exit;
                    end
                end
              { packed move followed by a matching-precision scalar move }
              else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
                    MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
                   ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
                    MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
                  ) and
                MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                begin
                  { vmova* reg1,reg2
                    vmovs* reg2,<op>
                    dealloc reg2
                    =>
                    vmovs* reg1,<op> }
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                  if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                    begin
                      DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                      { p takes over hp1's opcode and destination }
                      taicpu(p).opcode:=taicpu(hp1).opcode;
                      taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                      asml.Remove(hp1);
                      hp1.Free;
                      result:=true;
                      exit;
                    end
                end;
            end;
          if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
            begin
              { mova* reg1,reg2 / vfma* ...,reg2 / mova* reg2,reg1
                => fold the moves into the FMA's accumulator operand }
              if MatchInstruction(hp1,[A_VFMADDPD,
                                       A_VFMADD132PD,
                                       A_VFMADD132PS,
                                       A_VFMADD132SD,
                                       A_VFMADD132SS,
                                       A_VFMADD213PD,
                                       A_VFMADD213PS,
                                       A_VFMADD213SD,
                                       A_VFMADD213SS,
                                       A_VFMADD231PD,
                                       A_VFMADD231PS,
                                       A_VFMADD231SD,
                                       A_VFMADD231SS,
                                       A_VFMADDSUB132PD,
                                       A_VFMADDSUB132PS,
                                       A_VFMADDSUB213PD,
                                       A_VFMADDSUB213PS,
                                       A_VFMADDSUB231PD,
                                       A_VFMADDSUB231PS,
                                       A_VFMSUB132PD,
                                       A_VFMSUB132PS,
                                       A_VFMSUB132SD,
                                       A_VFMSUB132SS,
                                       A_VFMSUB213PD,
                                       A_VFMSUB213PS,
                                       A_VFMSUB213SD,
                                       A_VFMSUB213SS,
                                       A_VFMSUB231PD,
                                       A_VFMSUB231PS,
                                       A_VFMSUB231SD,
                                       A_VFMSUB231SS,
                                       A_VFMSUBADD132PD,
                                       A_VFMSUBADD132PS,
                                       A_VFMSUBADD213PD,
                                       A_VFMSUBADD213PS,
                                       A_VFMSUBADD231PD,
                                       A_VFMSUBADD231PS,
                                       A_VFNMADD132PD,
                                       A_VFNMADD132PS,
                                       A_VFNMADD132SD,
                                       A_VFNMADD132SS,
                                       A_VFNMADD213PD,
                                       A_VFNMADD213PS,
                                       A_VFNMADD213SD,
                                       A_VFNMADD213SS,
                                       A_VFNMADD231PD,
                                       A_VFNMADD231PS,
                                       A_VFNMADD231SD,
                                       A_VFNMADD231SS,
                                       A_VFNMSUB132PD,
                                       A_VFNMSUB132PS,
                                       A_VFNMSUB132SD,
                                       A_VFNMSUB132SS,
                                       A_VFNMSUB213PD,
                                       A_VFNMSUB213PS,
                                       A_VFNMSUB213SD,
                                       A_VFNMSUB213SS,
                                       A_VFNMSUB231PD,
                                       A_VFNMSUB231PS,
                                       A_VFNMSUB231SD,
                                       A_VFNMSUB231SS],[S_NO]) and
                { we mix single and double operations here because we assume that the compiler
                  generates vmovapd only after double operations and vmovaps only after single operations }
                MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
                GetNextInstruction(hp1,hp2) and
                MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
                MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                  if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                    begin
                      taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                      RemoveCurrentP(p);
                      asml.Remove(hp2);
                      hp2.Free;
                      p:=hp1;
                    end;
                end
              else if (hp1.typ = ait_instruction) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2,taicpu(p).opcode,[]) and
                OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
                MatchOpType(taicpu(hp2),top_reg,top_reg) and
                MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
                (((taicpu(p).opcode=A_MOVAPS) and
                  ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                   (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
                 ((taicpu(p).opcode=A_MOVAPD) and
                  ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                   (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
                ) then
                { change
                  movapX reg,reg2
                  addsX/subsX/... reg3, reg2
                  movapX reg2,reg
                  to
                  addsX/subsX/... reg3,reg
                }
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                  If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                        debug_op2str(taicpu(p).opcode)+' '+
                        debug_op2str(taicpu(hp1).opcode)+' '+
                        debug_op2str(taicpu(hp2).opcode)+') done',p);
                      { we cannot eliminate the first move if
                        the operations uses the same register for source and dest }
                      if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                        RemoveCurrentP(p);
                      taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                      asml.remove(hp2);
                      hp2.Free;
                      p:=hp1;
                      result:=true;
                    end;
                end;
            end;
        end;
    end;
  { Pass-1 peephole for three-operand vector operations:
    folds a following register-to-register VMOVAPD/VMOVAPS into the
    operation's destination when the intermediate register dies.
    Returns True if the instruction list was changed. }
  function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
    var
      hp1 : tai;
    begin
      result:=false;
      { replace
        V<Op>X %mreg1,%mreg2,%mreg3
        VMovX %mreg3,%mreg4
        dealloc %mreg3
        by
        V<Op>X %mreg1,%mreg2,%mreg4
        ?
      }
      if GetNextInstruction(p,hp1) and
        { we mix single and double operations here because we assume that the compiler
          generates vmovapd only after double operations and vmovaps only after single operations }
        MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
        MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
        (taicpu(hp1).oper[1]^.typ=top_reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
          { %mreg3 must not be used after the move for the fold to be safe }
          if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
            begin
              taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
              DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
              asml.Remove(hp1);
              hp1.Free;
              result:=true;
            end;
        end;
    end;
  { Replaces all references to AOldReg in a memory reference to ANewReg.
    Only exact (whole-register) matches on the base and index registers are
    replaced; returns True if anything was changed.
    Fix: removed unused local variables (OldSupReg, OldSubReg, MemSubReg),
    which only produced compiler hints. }
  class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
    begin
      Result := False;
      { For safety reasons, only check for exact register matches }
      { Check base register }
      if (ref.base = AOldReg) then
        begin
          ref.base := ANewReg;
          Result := True;
        end;
      { Check index register }
      if (ref.index = AOldReg) then
        begin
          ref.index := ANewReg;
          Result := True;
        end;
    end;
  { Replaces all references to AOldReg in an operand to ANewReg.
    For register operands, also matches smaller sub-registers of AOldReg
    (integer registers only); for memory operands, delegates to
    ReplaceRegisterInRef.  Returns True if anything was changed.
    Fixes: removed a stray double semicolon after the register assignment
    and the unused local variable MemSubReg. }
  class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
    var
      OldSupReg, NewSupReg: TSuperRegister;
      OldSubReg, NewSubReg: TSubRegister;
      OldRegType: TRegisterType;
      ThisOper: POper;
    begin
      ThisOper := p.oper[OperIdx]; { Faster to access overall }
      Result := False;
      if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
        InternalError(2020011801);
      OldSupReg := getsupreg(AOldReg);
      OldSubReg := getsubreg(AOldReg);
      OldRegType := getregtype(AOldReg);
      NewSupReg := getsupreg(ANewReg);
      NewSubReg := getsubreg(ANewReg);
      { old and new register must agree in type and sub-register size }
      if OldRegType <> getregtype(ANewReg) then
        InternalError(2020011802);
      if OldSubReg <> NewSubReg then
        InternalError(2020011803);
      case ThisOper^.typ of
        top_reg:
          if (
            (ThisOper^.reg = AOldReg) or
            (
              { also match any sub-register of the old register }
              (OldRegType = R_INTREGISTER) and
              (getsupreg(ThisOper^.reg) = OldSupReg) and
              (getregtype(ThisOper^.reg) = R_INTREGISTER) and
              (
                (getsubreg(ThisOper^.reg) <= OldSubReg)
{$ifndef x86_64}
                and (
                  { Under i386 and i8086, ESI, EDI, EBP and ESP
                    don't have an 8-bit representation }
                  (getsubreg(ThisOper^.reg) >= R_SUBW) or
                  not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
                )
{$endif x86_64}
              )
            )
          ) then
            begin
              { keep the operand's own sub-register size, only swap the super-register }
              ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
              Result := True;
            end;
        top_ref:
          if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
            Result := True;
        else
          ;
      end;
    end;
  { Replaces all references to AOldReg in an instruction to ANewReg.
    Only operands that the instruction actually reads (per the Ch_Rop*
    change flags) are rewritten, and the count operand of shift/rotate
    instructions is left alone because it can only be CL.
    Returns True if any operand was changed. }
  function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
    const
      { maps operand index -> "operand is read" change flag }
      ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
    var
      OperIdx: Integer;
    begin
      Result := False;
      for OperIdx := 0 to p.ops - 1 do
        if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) and
          { The shift and rotate instructions can only use CL }
          not (
            (OperIdx = 0) and
            { This second condition just helps to avoid unnecessarily
              calling MatchInstruction for 10 different opcodes }
            (p.oper[0]^.reg = NR_CL) and
            MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
          ) then
          { note: "or Result" keeps earlier successes while still processing
            every operand (no short-circuit on the call) }
          Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
    end;
  { Returns True if dereferencing ref can be considered safe to speculate:
    no index register, and the base is either the stack pointer, the current
    frame pointer, or (x86-64 only) a RIP-relative PIC address. }
  class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean; inline;
    begin
      Result :=
        (ref^.index = NR_NO) and
        (
{$ifdef x86_64}
          (
            (ref^.base = NR_RIP) and
            (ref^.refaddr in [addr_pic, addr_pic_no_got])
          ) or
{$endif x86_64}
          (ref^.base = NR_STACK_POINTER_REG) or
          (ref^.base = current_procinfo.framepointer)
        );
    end;
  { Given "mov %ReplaceReg,%CurrentReg" at p_mov and a later instruction hp
    that reads CurrentReg, tries to rewrite hp to read ReplaceReg instead
    (to shorten the dependency chain / avoid a pipeline stall).  Returns
    True if hp was changed; extends ReplaceReg's allocation up to hp. }
  function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    var
      CurrentReg, ReplaceReg: TRegister;
      SubReg: TSubRegister;
    begin
      Result := False;
      ReplaceReg := taicpu(p_mov).oper[0]^.reg;
      CurrentReg := taicpu(p_mov).oper[1]^.reg;
      case hp.opcode of
        A_FSTSW, A_FNSTSW,
        A_IN, A_INS, A_OUT, A_OUTS,
        A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
          { These routines have explicit operands, but they are restricted in
            what they can be (e.g. IN and OUT can only read from AL, AX or
            EAX. }
          Exit;
        A_IMUL:
          begin
            { The 1-operand version writes to implicit registers
              The 2-operand version reads from the first operator, and reads
              from and writes to the second (equivalent to Ch_ROp1, ChRWOp2).
              the 3-operand version reads from a register that it doesn't write to
            }
            case hp.ops of
              1:
                { don't touch the implicit accumulator/result registers }
                if (
                  (
                    (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                  ) or
                  not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
                ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                  begin
                    Result := True;
                    DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                    AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                  end;
              2:
                { Only modify the first parameter }
                if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                  begin
                    Result := True;
                    DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                    AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                  end;
              3:
                { Only modify the second parameter }
                if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                  begin
                    Result := True;
                    DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                    AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                  end;
              else
                InternalError(2020012901);
            end;
          end;
        else
          { general case: rewrite every read-operand occurrence of CurrentReg }
          if (hp.ops > 0) and
            ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
            begin
              Result := True;
              DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
              AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
            end;
      end;
    end;
  1675. function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  1676. var
  1677. hp1, hp2, hp4: tai;
  1678. GetNextInstruction_p, TempRegUsed: Boolean;
  1679. PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
  1680. NewSize: topsize;
  1681. CurrentReg: TRegister;
  1682. begin
  1683. Result:=false;
  1684. GetNextInstruction_p:=GetNextInstruction(p, hp1);
  1685. { remove mov reg1,reg1? }
  1686. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^)
  1687. then
  1688. begin
  1689. DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
  1690. { take care of the register (de)allocs following p }
  1691. RemoveCurrentP(p);
  1692. p:=hp1;
  1693. Result:=true;
  1694. exit;
  1695. end;
  1696. { All the next optimisations require a next instruction }
  1697. if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
  1698. Exit;
  1699. { Look for:
  1700. mov %reg1,%reg2
  1701. ??? %reg2,r/m
  1702. Change to:
  1703. mov %reg1,%reg2
  1704. ??? %reg1,r/m
  1705. }
  1706. if MatchOpType(taicpu(p), top_reg, top_reg) then
  1707. begin
  1708. CurrentReg := taicpu(p).oper[1]^.reg;
  1709. if RegReadByInstruction(CurrentReg, hp1) and
  1710. DeepMOVOpt(taicpu(p), taicpu(hp1)) then
  1711. begin
  1712. TransferUsedRegs(TmpUsedRegs);
  1713. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  1714. if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
  1715. { Just in case something didn't get modified (e.g. an
  1716. implicit register) }
  1717. not RegReadByInstruction(CurrentReg, hp1) then
  1718. begin
  1719. { We can remove the original MOV }
  1720. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
  1721. Asml.Remove(p);
  1722. p.Free;
  1723. p := hp1;
  1724. { TmpUsedRegs contains the results of "UpdateUsedRegs(tai(p.Next))" already,
  1725. so just restore it to UsedRegs instead of calculating it again }
  1726. RestoreUsedRegs(TmpUsedRegs);
  1727. Result := True;
  1728. Exit;
  1729. end;
  1730. { If we know a MOV instruction has become a null operation, we might as well
  1731. get rid of it now to save time. }
  1732. if (taicpu(hp1).opcode = A_MOV) and
  1733. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1734. SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
  1735. { Just being a register is enough to confirm it's a null operation }
  1736. (taicpu(hp1).oper[0]^.typ = top_reg) then
  1737. begin
  1738. Result := True;
  1739. { Speed-up to reduce a pipeline stall... if we had something like...
  1740. movl %eax,%edx
  1741. movw %dx,%ax
  1742. ... the second instruction would change to movw %ax,%ax, but
  1743. given that it is now %ax that's active rather than %eax,
  1744. penalties might occur due to a partial register write, so instead,
  1745. change it to a MOVZX instruction when optimising for speed.
  1746. }
  1747. if not (cs_opt_size in current_settings.optimizerswitches) and
  1748. IsMOVZXAcceptable and
  1749. (taicpu(hp1).opsize < taicpu(p).opsize)
  1750. {$ifdef x86_64}
  1751. { operations already implicitly set the upper 64 bits to zero }
  1752. and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
  1753. {$endif x86_64}
  1754. then
  1755. begin
  1756. CurrentReg := taicpu(hp1).oper[1]^.reg;
  1757. DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
  1758. case taicpu(p).opsize of
  1759. S_W:
  1760. if taicpu(hp1).opsize = S_B then
  1761. taicpu(hp1).opsize := S_BL
  1762. else
  1763. InternalError(2020012911);
  1764. S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
  1765. case taicpu(hp1).opsize of
  1766. S_B:
  1767. taicpu(hp1).opsize := S_BL;
  1768. S_W:
  1769. taicpu(hp1).opsize := S_WL;
  1770. else
  1771. InternalError(2020012912);
  1772. end;
  1773. else
  1774. InternalError(2020012910);
  1775. end;
  1776. taicpu(hp1).opcode := A_MOVZX;
  1777. taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
  1778. end
  1779. else
  1780. begin
  1781. GetNextInstruction_p := GetNextInstruction(hp1, hp2);
  1782. DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
  1783. asml.remove(hp1);
  1784. hp1.free;
  1785. { The instruction after what was hp1 is now the immediate next instruction,
  1786. so we can continue to make optimisations if it's present }
  1787. if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
  1788. Exit;
  1789. hp1 := hp2;
  1790. end;
  1791. end;
  1792. end;
  1793. end;
  1794. { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
  1795. overwrites the original destination register. e.g.
  1796. movl ###,%reg2d
  1797. movslq ###,%reg2q (### doesn't have to be the same as the first one)
  1798. In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
  1799. }
  1800. if (taicpu(p).oper[1]^.typ = top_reg) and
  1801. MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
  1802. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1803. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
  1804. begin
  1805. if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
  1806. begin
  1807. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  1808. case taicpu(p).oper[0]^.typ of
  1809. top_const:
  1810. { We have something like:
  1811. movb $x, %regb
  1812. movzbl %regb,%regd
  1813. Change to:
  1814. movl $x, %regd
  1815. }
  1816. begin
  1817. case taicpu(hp1).opsize of
  1818. S_BW:
  1819. begin
  1820. if (taicpu(hp1).opcode = A_MOVSX) and
  1821. (taicpu(p).oper[0]^.val > $7F) then
  1822. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $100; { Convert to signed }
  1823. setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
  1824. taicpu(p).opsize := S_W;
  1825. end;
  1826. S_BL:
  1827. begin
  1828. if (taicpu(hp1).opcode = A_MOVSX) and
  1829. (taicpu(p).oper[0]^.val > $7F) then
  1830. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $100; { Convert to signed }
  1831. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1832. taicpu(p).opsize := S_L;
  1833. end;
  1834. S_WL:
  1835. begin
  1836. if (taicpu(hp1).opcode = A_MOVSX) and
  1837. (taicpu(p).oper[0]^.val > $7FFF) then
  1838. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $10000; { Convert to signed }
  1839. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1840. taicpu(p).opsize := S_L;
  1841. end;
  1842. {$ifdef x86_64}
  1843. S_BQ:
  1844. begin
  1845. if (taicpu(hp1).opcode = A_MOVSX) and
  1846. (taicpu(p).oper[0]^.val > $7F) then
  1847. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $100; { Convert to signed }
  1848. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1849. taicpu(p).opsize := S_Q;
  1850. end;
  1851. S_WQ:
  1852. begin
  1853. if (taicpu(hp1).opcode = A_MOVSX) and
  1854. (taicpu(p).oper[0]^.val > $7FFF) then
  1855. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $10000; { Convert to signed }
  1856. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1857. taicpu(p).opsize := S_Q;
  1858. end;
  1859. S_LQ:
  1860. begin
  1861. if (taicpu(hp1).opcode = A_MOVSXD) and { Note it's MOVSXD, not MOVSX }
  1862. (taicpu(p).oper[0]^.val > $7FFFFFFF) then
  1863. taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - $100000000; { Convert to signed }
  1864. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1865. taicpu(p).opsize := S_Q;
  1866. end;
  1867. {$endif x86_64}
  1868. else
  1869. { If hp1 was a MOV instruction, it should have been
  1870. optimised already }
  1871. InternalError(2020021001);
  1872. end;
  1873. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
  1874. asml.Remove(hp1);
  1875. hp1.Free;
  1876. Result := True;
  1877. Exit;
  1878. end;
  1879. top_ref:
  1880. { We have something like:
  1881. movb mem, %regb
  1882. movzbl %regb,%regd
  1883. Change to:
  1884. movzbl mem, %regd
  1885. }
  1886. if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
  1887. begin
  1888. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
  1889. taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
  1890. RemoveCurrentP(p);
  1891. Result:=True;
  1892. Exit;
  1893. end;
  1894. else
  1895. if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
  1896. { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
  1897. Exit;
  1898. end;
  1899. end
  1900. { The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
  1901. and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
  1902. optimised }
  1903. else
  1904. begin
  1905. DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
  1906. RemoveCurrentP(p);
  1907. p:=hp1;
  1908. Result := True;
  1909. Exit;
  1910. end;
  1911. end;
  1912. if (taicpu(hp1).opcode = A_AND) and
  1913. (taicpu(p).oper[1]^.typ = top_reg) and
  1914. MatchOpType(taicpu(hp1),top_const,top_reg) then
  1915. begin
  1916. if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
  1917. begin
  1918. case taicpu(p).opsize of
  1919. S_L:
  1920. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  1921. begin
  1922. { Optimize out:
  1923. mov x, %reg
  1924. and ffffffffh, %reg
  1925. }
  1926. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
  1927. asml.remove(hp1);
  1928. hp1.free;
  1929. Result:=true;
  1930. exit;
  1931. end;
  1932. S_Q: { TODO: Confirm if this is even possible }
  1933. if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
  1934. begin
  1935. { Optimize out:
  1936. mov x, %reg
  1937. and ffffffffffffffffh, %reg
  1938. }
  1939. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
  1940. asml.remove(hp1);
  1941. hp1.free;
  1942. Result:=true;
  1943. exit;
  1944. end;
  1945. else
  1946. ;
  1947. end;
  1948. end
  1949. else if IsMOVZXAcceptable and
  1950. (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
  1951. (taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
  1952. (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  1953. then
  1954. begin
  1955. InputVal := debug_operstr(taicpu(p).oper[0]^);
  1956. MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
  1957. case taicpu(p).opsize of
  1958. S_B:
  1959. if (taicpu(hp1).oper[0]^.val = $ff) then
  1960. begin
  1961. { Convert:
  1962. movb x, %regl movb x, %regl
  1963. andw ffh, %regw andl ffh, %regd
  1964. To:
  1965. movzbw x, %regd movzbl x, %regd
  1966. (Identical registers, just different sizes)
  1967. }
  1968. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
  1969. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
  1970. case taicpu(hp1).opsize of
  1971. S_W: NewSize := S_BW;
  1972. S_L: NewSize := S_BL;
  1973. {$ifdef x86_64}
  1974. S_Q: NewSize := S_BQ;
  1975. {$endif x86_64}
  1976. else
  1977. InternalError(2018011510);
  1978. end;
  1979. end
  1980. else
  1981. NewSize := S_NO;
  1982. S_W:
  1983. if (taicpu(hp1).oper[0]^.val = $ffff) then
  1984. begin
  1985. { Convert:
  1986. movw x, %regw
  1987. andl ffffh, %regd
  1988. To:
  1989. movzwl x, %regd
  1990. (Identical registers, just different sizes)
  1991. }
  1992. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
  1993. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
  1994. case taicpu(hp1).opsize of
  1995. S_L: NewSize := S_WL;
  1996. {$ifdef x86_64}
  1997. S_Q: NewSize := S_WQ;
  1998. {$endif x86_64}
  1999. else
  2000. InternalError(2018011511);
  2001. end;
  2002. end
  2003. else
  2004. NewSize := S_NO;
  2005. else
  2006. NewSize := S_NO;
  2007. end;
  2008. if NewSize <> S_NO then
  2009. begin
  2010. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
  2011. { The actual optimization }
  2012. taicpu(p).opcode := A_MOVZX;
  2013. taicpu(p).changeopsize(NewSize);
  2014. taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
  2015. { Safeguard if "and" is followed by a conditional command }
  2016. TransferUsedRegs(TmpUsedRegs);
  2017. UpdateUsedRegs(TmpUsedRegs,tai(p.next));
  2018. if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  2019. begin
  2020. { At this point, the "and" command is effectively equivalent to
  2021. "test %reg,%reg". This will be handled separately by the
  2022. Peephole Optimizer. [Kit] }
  2023. DebugMsg(SPeepholeOptimization + PreMessage +
  2024. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2025. end
  2026. else
  2027. begin
  2028. DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
  2029. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2030. asml.Remove(hp1);
  2031. hp1.Free;
  2032. end;
  2033. Result := True;
  2034. Exit;
  2035. end;
  2036. end;
  2037. end;
  2038. { Next instruction is also a MOV ? }
  2039. if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
  2040. begin
  2041. if (taicpu(p).oper[1]^.typ = top_reg) and
  2042. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2043. begin
  2044. CurrentReg := taicpu(p).oper[1]^.reg;
  2045. TransferUsedRegs(TmpUsedRegs);
  2046. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2047. { we have
  2048. mov x, %treg
  2049. mov %treg, y
  2050. }
  2051. if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
  2052. if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
  2053. { we've got
  2054. mov x, %treg
  2055. mov %treg, y
  2056. with %treg is not used after }
  2057. case taicpu(p).oper[0]^.typ Of
  2058. { top_reg is covered by DeepMOVOpt }
  2059. top_const:
  2060. begin
  2061. { change
  2062. mov const, %treg
  2063. mov %treg, y
  2064. to
  2065. mov const, y
  2066. }
  2067. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  2068. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2069. begin
  2070. if taicpu(hp1).oper[1]^.typ=top_reg then
  2071. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2072. taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
  2073. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
  2074. asml.remove(hp1);
  2075. hp1.free;
  2076. Result:=true;
  2077. Exit;
  2078. end;
  2079. end;
  2080. top_ref:
  2081. if (taicpu(hp1).oper[1]^.typ = top_reg) then
  2082. begin
  2083. { change
  2084. mov mem, %treg
  2085. mov %treg, %reg
  2086. to
  2087. mov mem, %reg"
  2088. }
  2089. taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
  2090. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
  2091. asml.remove(hp1);
  2092. hp1.free;
  2093. Result:=true;
  2094. Exit;
  2095. end;
  2096. else
  2097. ;
  2098. end
  2099. else
  2100. { %treg is used afterwards, but all eventualities
  2101. other than the first MOV instruction being a constant
  2102. are covered by DeepMOVOpt, so only check for that }
  2103. if (taicpu(p).oper[0]^.typ = top_const) and
  2104. (
  2105. { For MOV operations, a size saving is only made if the register/const is byte-sized }
  2106. not (cs_opt_size in current_settings.optimizerswitches) or
  2107. (taicpu(hp1).opsize = S_B)
  2108. ) and
  2109. (
  2110. (taicpu(hp1).oper[1]^.typ = top_reg) or
  2111. ((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
  2112. ) then
  2113. begin
  2114. DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
  2115. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  2116. end;
  2117. end;
  2118. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2119. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2120. { mov reg1, mem1 or mov mem1, reg1
  2121. mov mem2, reg2 mov reg2, mem2}
  2122. begin
  2123. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2124. { mov reg1, mem1 or mov mem1, reg1
  2125. mov mem2, reg1 mov reg2, mem1}
  2126. begin
  2127. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2128. { Removes the second statement from
  2129. mov reg1, mem1/reg2
  2130. mov mem1/reg2, reg1 }
  2131. begin
  2132. if taicpu(p).oper[0]^.typ=top_reg then
  2133. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2134. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
  2135. asml.remove(hp1);
  2136. hp1.free;
  2137. Result:=true;
  2138. exit;
  2139. end
  2140. else
  2141. begin
  2142. TransferUsedRegs(TmpUsedRegs);
  2143. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2144. if (taicpu(p).oper[1]^.typ = top_ref) and
  2145. { mov reg1, mem1
  2146. mov mem2, reg1 }
  2147. (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
  2148. GetNextInstruction(hp1, hp2) and
  2149. MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
  2150. OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2151. OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
  2152. not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
  2153. { change to
  2154. mov reg1, mem1 mov reg1, mem1
  2155. mov mem2, reg1 cmp reg1, mem2
  2156. cmp mem1, reg1
  2157. }
  2158. begin
  2159. asml.remove(hp2);
  2160. hp2.free;
  2161. taicpu(hp1).opcode := A_CMP;
  2162. taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
  2163. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2164. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2165. DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
  2166. end;
  2167. end;
  2168. end
  2169. else if (taicpu(p).oper[1]^.typ=top_ref) and
  2170. OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2171. begin
  2172. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2173. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2174. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
  2175. end
  2176. else
  2177. begin
  2178. TransferUsedRegs(TmpUsedRegs);
  2179. if GetNextInstruction(hp1, hp2) and
  2180. MatchOpType(taicpu(p),top_ref,top_reg) and
  2181. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2182. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2183. MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
  2184. MatchOpType(taicpu(hp2),top_ref,top_reg) and
  2185. RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2186. if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
  2187. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
  2188. { mov mem1, %reg1
  2189. mov %reg1, mem2
  2190. mov mem2, reg2
  2191. to:
  2192. mov mem1, reg2
  2193. mov reg2, mem2}
  2194. begin
  2195. AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
  2196. DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
  2197. taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
  2198. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  2199. asml.remove(hp2);
  2200. hp2.free;
  2201. end
  2202. {$ifdef i386}
  2203. { this is enabled for i386 only, as the rules to create the reg sets below
  2204. are too complicated for x86-64, so this makes this code too error prone
  2205. on x86-64
  2206. }
  2207. else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
  2208. not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
  2209. not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
  2210. { mov mem1, reg1 mov mem1, reg1
  2211. mov reg1, mem2 mov reg1, mem2
  2212. mov mem2, reg2 mov mem2, reg1
  2213. to: to:
  2214. mov mem1, reg1 mov mem1, reg1
  2215. mov mem1, reg2 mov reg1, mem2
  2216. mov reg1, mem2
  2217. or (if mem1 depends on reg1
  2218. and/or if mem2 depends on reg2)
  2219. to:
  2220. mov mem1, reg1
  2221. mov reg1, mem2
  2222. mov reg1, reg2
  2223. }
  2224. begin
  2225. taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
  2226. taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
  2227. taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
  2228. taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
  2229. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2230. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
  2231. (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2232. AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
  2233. if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  2234. (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2235. AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
  2236. end
  2237. else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
  2238. begin
  2239. taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
  2240. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2241. end
  2242. else
  2243. begin
  2244. asml.remove(hp2);
  2245. hp2.free;
  2246. end
  2247. {$endif i386}
  2248. ;
  2249. end;
  2250. end;
  2251. (* { movl [mem1],reg1
  2252. movl [mem1],reg2
  2253. to
  2254. movl [mem1],reg1
  2255. movl reg1,reg2
  2256. }
  2257. else if (taicpu(p).oper[0]^.typ = top_ref) and
  2258. (taicpu(p).oper[1]^.typ = top_reg) and
  2259. (taicpu(hp1).oper[0]^.typ = top_ref) and
  2260. (taicpu(hp1).oper[1]^.typ = top_reg) and
  2261. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2262. RefsEqual(TReference(taicpu(p).oper[0]^^),taicpu(hp1).oper[0]^^.ref^) and
  2263. (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^^.ref^.base) and
  2264. (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^^.ref^.index) then
  2265. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg)
  2266. else*)
  2267. { movl const1,[mem1]
  2268. movl [mem1],reg1
  2269. to
  2270. movl const1,reg1
  2271. movl reg1,[mem1]
  2272. }
  2273. if MatchOpType(Taicpu(p),top_const,top_ref) and
  2274. MatchOpType(Taicpu(hp1),top_ref,top_reg) and
  2275. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2276. RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
  2277. not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
  2278. begin
  2279. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2280. taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
  2281. taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
  2282. taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
  2283. taicpu(hp1).fileinfo := taicpu(p).fileinfo;
  2284. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
  2285. Result:=true;
  2286. exit;
  2287. end;
  2288. { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
  2289. end;
  2290. { search further than the next instruction for a mov }
  2291. if
  2292. { check as much as possible before the expensive GetNextInstructionUsingReg call }
  2293. (taicpu(p).oper[1]^.typ = top_reg) and
  2294. (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
  2295. not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) and
  2296. { we work with hp2 here, so hp1 can be still used later on when
  2297. checking for GetNextInstruction_p }
  2298. { GetNextInstructionUsingReg only searches one instruction ahead unless -O3 is specified }
  2299. GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[1]^.reg) and
  2300. MatchInstruction(hp2,A_MOV,[]) and
  2301. MatchOperand(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2302. ((taicpu(p).oper[0]^.typ=top_const) or
  2303. ((taicpu(p).oper[0]^.typ=top_reg) and
  2304. not(RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp2))
  2305. )
  2306. ) then
  2307. begin
  2308. { we have
  2309. mov x, %treg
  2310. mov %treg, y
  2311. }
  2312. TransferUsedRegs(TmpUsedRegs);
  2313. TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next));
  2314. { We don't need to call UpdateUsedRegs for every instruction between
  2315. p and hp2 because the register we're concerned about will not
  2316. become deallocated (otherwise GetNextInstructionUsingReg would
  2317. have stopped at an earlier instruction). [Kit] }
  2318. TempRegUsed :=
  2319. RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or
  2320. RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1);
  2321. case taicpu(p).oper[0]^.typ Of
  2322. top_reg:
  2323. begin
  2324. { change
  2325. mov %reg, %treg
  2326. mov %treg, y
  2327. to
  2328. mov %reg, y
  2329. }
  2330. CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
  2331. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2332. if taicpu(hp2).oper[1]^.reg = CurrentReg then
  2333. begin
  2334. { %reg = y - remove hp2 completely (doing it here instead of relying on
  2335. the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
  2336. if TempRegUsed then
  2337. begin
  2338. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2);
  2339. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2340. asml.remove(hp2);
  2341. hp2.Free;
  2342. end
  2343. else
  2344. begin
  2345. asml.remove(hp2);
  2346. hp2.Free;
  2347. { We can remove the original MOV too }
  2348. DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
  2349. RemoveCurrentP(p);
  2350. p:=hp1;
  2351. Result:=true;
  2352. Exit;
  2353. end;
  2354. end
  2355. else
  2356. begin
  2357. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2358. taicpu(hp2).loadReg(0, CurrentReg);
  2359. if TempRegUsed then
  2360. begin
  2361. { Don't remove the first instruction if the temporary register is in use }
  2362. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2);
  2363. { No need to set Result to True. If there's another instruction later on
  2364. that can be optimised, it will be detected when the main Pass 1 loop
  2365. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2366. end
  2367. else
  2368. begin
  2369. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
  2370. RemoveCurrentP(p);
  2371. p:=hp1;
  2372. Result:=true;
  2373. Exit;
  2374. end;
  2375. end;
  2376. end;
  2377. top_const:
  2378. if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
  2379. begin
  2380. { change
  2381. mov const, %treg
  2382. mov %treg, y
  2383. to
  2384. mov const, y
  2385. }
  2386. if (taicpu(hp2).oper[1]^.typ=top_reg) or
  2387. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2388. begin
  2389. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2390. taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
  2391. if TempRegUsed then
  2392. begin
  2393. { Don't remove the first instruction if the temporary register is in use }
  2394. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
  2395. { No need to set Result to True. If there's another instruction later on
  2396. that can be optimised, it will be detected when the main Pass 1 loop
  2397. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2398. end
  2399. else
  2400. begin
  2401. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
  2402. RemoveCurrentP(p);
  2403. p:=hp1;
  2404. Result:=true;
  2405. Exit;
  2406. end;
  2407. end;
  2408. end;
  2409. else
  2410. Internalerror(2019103001);
  2411. end;
  2412. end;
  2413. if (aoc_MovAnd2Mov_3 in OptsToCheck) and
  2414. (taicpu(p).oper[1]^.typ = top_reg) and
  2415. (taicpu(p).opsize = S_L) and
  2416. GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
  2417. (taicpu(hp2).opcode = A_AND) and
  2418. (MatchOpType(taicpu(hp2),top_const,top_reg) or
  2419. (MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2420. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
  2421. ) then
  2422. begin
  2423. if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
  2424. begin
  2425. if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
  2426. ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
  2427. begin
  2428. { Optimize out:
  2429. mov x, %reg
  2430. and ffffffffh, %reg
  2431. }
  2432. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
  2433. asml.remove(hp2);
  2434. hp2.free;
  2435. Result:=true;
  2436. exit;
  2437. end;
  2438. end;
  2439. end;
  2440. { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
  2441. x >= RetOffset) as it doesn't do anything (it writes either to a
  2442. parameter or to the temporary storage room for the function
  2443. result)
  2444. }
  2445. if IsExitCode(hp1) and
  2446. (taicpu(p).oper[1]^.typ = top_ref) and
  2447. (taicpu(p).oper[1]^.ref^.index = NR_NO) and
  2448. (
  2449. (
  2450. (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
  2451. not (
  2452. assigned(current_procinfo.procdef.funcretsym) and
  2453. (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
  2454. )
  2455. ) or
  2456. { Also discard writes to the stack that are below the base pointer,
  2457. as this is temporary storage rather than a function result on the
  2458. stack, say. }
  2459. (
  2460. (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
  2461. (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
  2462. )
  2463. ) then
  2464. begin
  2465. asml.remove(p);
  2466. p.free;
  2467. p:=hp1;
  2468. DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
  2469. RemoveLastDeallocForFuncRes(p);
  2470. Result:=true;
  2471. exit;
  2472. end;
  2473. if MatchOpType(taicpu(p),top_reg,top_ref) and
  2474. MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) and
  2475. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2476. RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2477. begin
  2478. { change
  2479. mov reg1, mem1
  2480. test/cmp x, mem1
  2481. to
  2482. mov reg1, mem1
  2483. test/cmp x, reg1
  2484. }
  2485. taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
  2486. DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
  2487. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2488. exit;
  2489. end;
  2490. if (taicpu(p).oper[1]^.typ = top_reg) and
  2491. (hp1.typ = ait_instruction) and
  2492. GetNextInstruction(hp1, hp2) and
  2493. MatchInstruction(hp2,A_MOV,[]) and
  2494. (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
  2495. (IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg) or
  2496. ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
  2497. IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ)))
  2498. ) then
  2499. begin
  2500. if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
  2501. (taicpu(hp2).oper[0]^.typ=top_reg) then
  2502. { change movsX/movzX reg/ref, reg2
  2503. add/sub/or/... reg3/$const, reg2
  2504. mov reg2 reg/ref
  2505. dealloc reg2
  2506. to
  2507. add/sub/or/... reg3/$const, reg/ref }
  2508. begin
  2509. TransferUsedRegs(TmpUsedRegs);
  2510. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2511. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2512. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2513. begin
  2514. { by example:
  2515. movswl %si,%eax movswl %si,%eax p
  2516. decl %eax addl %edx,%eax hp1
  2517. movw %ax,%si movw %ax,%si hp2
  2518. ->
  2519. movswl %si,%eax movswl %si,%eax p
  2520. decw %eax addw %edx,%eax hp1
  2521. movw %ax,%si movw %ax,%si hp2
  2522. }
  2523. DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
  2524. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2525. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2526. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2527. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2528. {
  2529. ->
  2530. movswl %si,%eax movswl %si,%eax p
  2531. decw %si addw %dx,%si hp1
  2532. movw %ax,%si movw %ax,%si hp2
  2533. }
  2534. case taicpu(hp1).ops of
  2535. 1:
  2536. begin
  2537. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2538. if taicpu(hp1).oper[0]^.typ=top_reg then
  2539. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2540. end;
  2541. 2:
  2542. begin
  2543. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2544. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2545. (taicpu(hp1).opcode<>A_SHL) and
  2546. (taicpu(hp1).opcode<>A_SHR) and
  2547. (taicpu(hp1).opcode<>A_SAR) then
  2548. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2549. end;
  2550. else
  2551. internalerror(2008042701);
  2552. end;
  2553. {
  2554. ->
  2555. decw %si addw %dx,%si p
  2556. }
  2557. asml.remove(hp2);
  2558. hp2.Free;
  2559. RemoveCurrentP(p);
  2560. Result:=True;
  2561. Exit;
  2562. end;
  2563. end;
  2564. if MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2565. not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
  2566. ((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
  2567. { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
  2568. ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
  2569. )
  2570. {$ifdef i386}
  2571. { byte registers of esi, edi, ebp, esp are not available on i386 }
  2572. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2573. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2574. {$endif i386}
  2575. then
  2576. { change movsX/movzX reg/ref, reg2
  2577. add/sub/or/... regX/$const, reg2
  2578. mov reg2, reg3
  2579. dealloc reg2
  2580. to
  2581. movsX/movzX reg/ref, reg3
  2582. add/sub/or/... reg3/$const, reg3
  2583. }
  2584. begin
  2585. TransferUsedRegs(TmpUsedRegs);
  2586. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2587. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2588. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2589. begin
  2590. { by example:
  2591. movswl %si,%eax movswl %si,%eax p
  2592. decl %eax addl %edx,%eax hp1
  2593. movw %ax,%si movw %ax,%si hp2
  2594. ->
  2595. movswl %si,%eax movswl %si,%eax p
  2596. decw %eax addw %edx,%eax hp1
  2597. movw %ax,%si movw %ax,%si hp2
  2598. }
  2599. DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
  2600. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2601. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2602. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2603. { limit size of constants as well to avoid assembler errors, but
  2604. check opsize to avoid overflow when left shifting the 1 }
  2605. if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
  2606. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
  2607. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2608. taicpu(p).changeopsize(taicpu(hp2).opsize);
  2609. if taicpu(p).oper[0]^.typ=top_reg then
  2610. setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2611. taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
  2612. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
  2613. {
  2614. ->
  2615. movswl %si,%eax movswl %si,%eax p
  2616. decw %si addw %dx,%si hp1
  2617. movw %ax,%si movw %ax,%si hp2
  2618. }
  2619. case taicpu(hp1).ops of
  2620. 1:
  2621. begin
  2622. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2623. if taicpu(hp1).oper[0]^.typ=top_reg then
  2624. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2625. end;
  2626. 2:
  2627. begin
  2628. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2629. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2630. (taicpu(hp1).opcode<>A_SHL) and
  2631. (taicpu(hp1).opcode<>A_SHR) and
  2632. (taicpu(hp1).opcode<>A_SAR) then
  2633. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2634. end;
  2635. else
  2636. internalerror(2018111801);
  2637. end;
  2638. {
  2639. ->
  2640. decw %si addw %dx,%si p
  2641. }
  2642. asml.remove(hp2);
  2643. hp2.Free;
  2644. end;
  2645. end;
  2646. end;
  2647. if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
  2648. GetNextInstruction(hp1, hp2) and
  2649. MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
  2650. MatchOperand(Taicpu(p).oper[0]^,0) and
  2651. (Taicpu(p).oper[1]^.typ = top_reg) and
  2652. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
  2653. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
  2654. { mov reg1,0
  2655. bts reg1,operand1 --> mov reg1,operand2
  2656. or reg1,operand2 bts reg1,operand1}
  2657. begin
  2658. Taicpu(hp2).opcode:=A_MOV;
  2659. asml.remove(hp1);
  2660. insertllitem(hp2,hp2.next,hp1);
  2661. asml.remove(p);
  2662. p.free;
  2663. p:=hp1;
  2664. Result:=true;
  2665. exit;
  2666. end;
  2667. if MatchInstruction(hp1,A_LEA,[S_L]) and
  2668. MatchOpType(Taicpu(p),top_ref,top_reg) and
  2669. ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
  2670. (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
  2671. ) or
  2672. (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
  2673. (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
  2674. )
  2675. ) then
  2676. { mov reg1,ref
  2677. lea reg2,[reg1,reg2]
  2678. to
  2679. add reg2,ref}
  2680. begin
  2681. TransferUsedRegs(TmpUsedRegs);
  2682. { reg1 may not be used afterwards }
  2683. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
  2684. begin
  2685. Taicpu(hp1).opcode:=A_ADD;
  2686. Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
  2687. DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
  2688. asml.remove(p);
  2689. p.free;
  2690. p:=hp1;
  2691. result:=true;
  2692. exit;
  2693. end;
  2694. end;
  2695. end;
  2696. function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
  2697. var
  2698. hp1 : tai;
  2699. begin
  2700. Result:=false;
  2701. if taicpu(p).ops <> 2 then
  2702. exit;
  2703. if GetNextInstruction(p,hp1) and
  2704. MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
  2705. (taicpu(hp1).ops = 2) then
  2706. begin
  2707. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2708. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2709. { movXX reg1, mem1 or movXX mem1, reg1
  2710. movXX mem2, reg2 movXX reg2, mem2}
  2711. begin
  2712. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2713. { movXX reg1, mem1 or movXX mem1, reg1
  2714. movXX mem2, reg1 movXX reg2, mem1}
  2715. begin
  2716. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2717. begin
  2718. { Removes the second statement from
  2719. movXX reg1, mem1/reg2
  2720. movXX mem1/reg2, reg1
  2721. }
  2722. if taicpu(p).oper[0]^.typ=top_reg then
  2723. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2724. { Removes the second statement from
  2725. movXX mem1/reg1, reg2
  2726. movXX reg2, mem1/reg1
  2727. }
  2728. if (taicpu(p).oper[1]^.typ=top_reg) and
  2729. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
  2730. begin
  2731. asml.remove(p);
  2732. p.free;
  2733. GetNextInstruction(hp1,p);
  2734. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
  2735. end
  2736. else
  2737. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
  2738. asml.remove(hp1);
  2739. hp1.free;
  2740. Result:=true;
  2741. exit;
  2742. end
  2743. end;
  2744. end;
  2745. end;
  2746. end;
function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
{ Pass-1 handler for commutative SSE/AVX arithmetic ops (e.g. ADD, MUL).
  Folds a following register-swapping MOVAPS/MOVAPD back into the operation
  when the op's destination register dies afterwards, saving the move.
  Returns True when the instruction list was changed. }
  var
    hp1 : tai;
  begin
    result:=false;
    { replace
        <Op>X    %mreg1,%mreg2  // Op in [ADD,MUL]
        MovX     %mreg2,%mreg1
        dealloc  %mreg2

        by
        <Op>X    %mreg2,%mreg1
      ?
    }
    if GetNextInstruction(p,hp1) and
      { we mix single and double operations here because we assume that the compiler
        generates vmovapd only after double operations and vmovaps only after single operations }
      MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
      { the move must swap the two registers of the operation back }
      MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
      (taicpu(p).oper[0]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        { only legal if the op's destination register is dead after the move }
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            { retarget the operation to the move's operands and drop the move }
            taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
            taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
            asml.Remove(hp1);
            hp1.Free;
            result:=true;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
{ Pass-1 handler for LEA.  Applies, in order:
  * strips meaningless segment prefixes;
  * Lea2Mov / Lea2Nop / Lea2Inc / Lea2Dec / Lea2Sub / Lea2Add for simple
    base-only references;
  * LeaMov2Lea: folds a following "mov %reg1,%reg2" into the lea when %reg1
    dies;
  * LeaLea2Lea: merges two chained leas writing the same register;
  * LeaOp2Op: substitutes the lea's reference directly into a following
    instruction that uses the lea's destination register in a memory operand.
  Returns True when the instruction list was changed. }
  var
    hp1, hp2, hp3: tai;
    l : ASizeInt;
    ref: Integer;
    saveref: treference;
  begin
    Result:=false;
    { removes seg register prefixes from LEA operations, as they
      don't do anything}
    taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
    { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
    if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
       (taicpu(p).oper[0]^.ref^.index = NR_NO) and
       { do not mess with leas accessing the stack pointer }
       (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
       (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
      begin
        if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
           (taicpu(p).oper[0]^.ref^.offset = 0) then
          begin
            { lea (%reg1),%reg2 with no offset is just a register copy }
            hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
              taicpu(p).oper[1]^.reg);
            InsertLLItem(p.previous,p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
            p.free;
            p:=hp1;
            Result:=true;
            exit;
          end
        else if (taicpu(p).oper[0]^.ref^.offset = 0) then
          begin
            { lea (%reg1),%reg1 does nothing at all }
            DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end
        { continue to use lea to adjust the stack pointer,
          it is the recommended way, but only if not optimizing for size }
        else if (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
          (cs_opt_size in current_settings.optimizerswitches) then
          with taicpu(p).oper[0]^.ref^ do
            if (base = taicpu(p).oper[1]^.reg) then
              begin
                { lea offset(%reg),%reg is a plain constant adjustment;
                  turn it into inc/dec/add/sub }
                l:=offset;
                if (l=1) and UseIncDec then
                  begin
                    taicpu(p).opcode:=A_INC;
                    taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                    taicpu(p).ops:=1;
                    DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
                  end
                else if (l=-1) and UseIncDec then
                  begin
                    taicpu(p).opcode:=A_DEC;
                    taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                    taicpu(p).ops:=1;
                    DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
                  end
                else
                  begin
                    { -2147483648 cannot be negated in 32 bit, so it must
                      stay an ADD }
                    if (l<0) and (l<>-2147483648) then
                      begin
                        taicpu(p).opcode:=A_SUB;
                        taicpu(p).loadConst(0,-l);
                        DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
                      end
                    else
                      begin
                        taicpu(p).opcode:=A_ADD;
                        taicpu(p).loadConst(0,l);
                        DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
                      end;
                  end;
                Result:=true;
                exit;
              end;
      end;
    { LeaMov2Lea: lea x,%reg1; mov %reg1,%reg2 -> lea x,%reg2
      when %reg1 dies after the mov }
    if GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
      MatchOpType(Taicpu(hp1),top_reg,top_reg) and
      (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
            asml.Remove(hp1);
            hp1.Free;
            result:=true;
          end;
      end;
    { changes
        lea offset1(regX), reg1
        lea offset2(reg1), reg1
      to
        lea offset1+offset2(regX), reg1 }
    { for now, we do not mess with the stack pointer, though it might be useful to remove
      unneeded lea sequences on the stack pointer, it needs to be tested in detail }
    if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
      MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
      (taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
      (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(p).oper[0]^.ref^.symbol=nil) and
      { either the first lea has no scaled index (so index/scale can be
        taken from the second), or the second has none and the first's
        index register is untouched in between }
      (((taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) and
        (taicpu(p).oper[0]^.ref^.index=NR_NO) and
        (taicpu(p).oper[0]^.ref^.index=taicpu(hp1).oper[0]^.ref^.index) and
        (taicpu(p).oper[0]^.ref^.scalefactor=taicpu(hp1).oper[0]^.ref^.scalefactor)
       ) or
       ((taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) and
        (taicpu(p).oper[0]^.ref^.base=NR_NO) and
        not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1)))
      ) and
      not(RegUsedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1)) and
      (taicpu(p).oper[0]^.ref^.relsymbol=taicpu(hp1).oper[0]^.ref^.relsymbol) and
      (taicpu(p).oper[0]^.ref^.segment=taicpu(hp1).oper[0]^.ref^.segment) and
      (taicpu(p).oper[0]^.ref^.symbol=taicpu(hp1).oper[0]^.ref^.symbol) then
      begin
        DebugMsg(SPeepholeOptimization + 'LeaLea2Lea done',p);
        inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
        taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
        if taicpu(p).oper[0]^.ref^.index<>NR_NO then
          begin
            { the second lea's base becomes its own former index, the
              first lea contributes index and scale }
            taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
            taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
            taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
          end;
        RemoveCurrentP(p);
        result:=true;
        exit;
      end;
    { changes
        lea <ref1>, reg1
        <op> ...,<ref. with reg1>,...
      to
        <op> ...,<ref1>,... }
    if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
       (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
       GetNextInstruction(p,hp1) and
       (hp1.typ=ait_instruction) and
       not(MatchInstruction(hp1,A_LEA,[])) then
      begin
        { find a reference which uses reg1 }
        if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
          ref:=0
        else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
          ref:=1
        else
          ref:=-1;
        if (ref<>-1) and
          { reg1 must be either the base or the index }
          ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
          begin
            { reg1 can be removed from the reference }
            saveref:=taicpu(hp1).oper[ref]^.ref^;
            if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
              taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
            else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
              taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
            else
              Internalerror(2019111201);
            { check if the can insert all data of the lea into the second instruction }
            if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
              ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
              ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
              ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
              ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
              ((taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
              (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
              { the combined displacement must fit in 32 bit, and RIP-relative
                references cannot take extra base/index registers }
              and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
              and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                   ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                  )
{$endif x86_64}
              then
              begin
                { reg1 might not used by the second instruction after it is remove from the reference }
                if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    { reg1 is not updated so it might not be used afterwards }
                    if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                        if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                          taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                        if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                          taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                        if taicpu(p).oper[0]^.ref^.symbol<>nil then
                          taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                        if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                          taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                        if not(taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
                          taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                        inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                        RemoveCurrentP(p);
                        result:=true;
                        exit;
                      end
                  end;
              end;
            { recover }
            taicpu(hp1).oper[ref]^.ref^:=saveref;
          end;
      end;
  end;
function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
{ Helper used by the SUB handler: folds a directly preceding DEC, SUB or
  ADD (same size, same destination register) into the constant of the
  current "sub const,reg" at p.  When folding an ADD yields "sub $0,reg",
  the now useless SUB is removed as well; only in that case True is
  returned and p is moved to the previous instruction so the caller
  re-scans from there. }
  var
    hp1 : tai;
  begin
    DoSubAddOpt := False;
    if GetLastInstruction(p, hp1) and
       (hp1.typ = ait_instruction) and
       (taicpu(hp1).opsize = taicpu(p).opsize) then
      case taicpu(hp1).opcode Of
        A_DEC:
          { dec reg; sub const,reg -> sub const+1,reg }
          if (taicpu(hp1).oper[0]^.typ = top_reg) and
             MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
            begin
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
              asml.remove(hp1);
              hp1.free;
            end;
        A_SUB:
          { sub const1,reg; sub const2,reg -> sub const1+const2,reg }
          if MatchOpType(taicpu(hp1),top_const,top_reg) and
             MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
            begin
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
              asml.remove(hp1);
              hp1.free;
            end;
        A_ADD:
          begin
            { add const1,reg; sub const2,reg -> sub const2-const1,reg }
            if MatchOpType(taicpu(hp1),top_const,top_reg) and
               MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
              begin
                taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
                asml.remove(hp1);
                hp1.free;
                { the constants cancelled out: drop the SUB entirely }
                if (taicpu(p).oper[0]^.val = 0) then
                  begin
                    hp1 := tai(p.next);
                    asml.remove(p);
                    p.free;
                    if not GetLastInstruction(hp1, p) then
                      p := hp1;
                    DoSubAddOpt := True;
                  end
              end;
          end;
        else
          ;
      end;
  end;
  3043. function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  3044. {$ifdef i386}
  3045. var
  3046. hp1 : tai;
  3047. {$endif i386}
  3048. begin
  3049. Result:=false;
  3050. { * change "subl $2, %esp; pushw x" to "pushl x"}
  3051. { * change "sub/add const1, reg" or "dec reg" followed by
  3052. "sub const2, reg" to one "sub ..., reg" }
  3053. if MatchOpType(taicpu(p),top_const,top_reg) then
  3054. begin
  3055. {$ifdef i386}
  3056. if (taicpu(p).oper[0]^.val = 2) and
  3057. (taicpu(p).oper[1]^.reg = NR_ESP) and
  3058. { Don't do the sub/push optimization if the sub }
  3059. { comes from setting up the stack frame (JM) }
  3060. (not(GetLastInstruction(p,hp1)) or
  3061. not(MatchInstruction(hp1,A_MOV,[S_L]) and
  3062. MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
  3063. MatchOperand(taicpu(hp1).oper[0]^,NR_EBP))) then
  3064. begin
  3065. hp1 := tai(p.next);
  3066. while Assigned(hp1) and
  3067. (tai(hp1).typ in [ait_instruction]+SkipInstr) and
  3068. not RegReadByInstruction(NR_ESP,hp1) and
  3069. not RegModifiedByInstruction(NR_ESP,hp1) do
  3070. hp1 := tai(hp1.next);
  3071. if Assigned(hp1) and
  3072. MatchInstruction(hp1,A_PUSH,[S_W]) then
  3073. begin
  3074. taicpu(hp1).changeopsize(S_L);
  3075. if taicpu(hp1).oper[0]^.typ=top_reg then
  3076. setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
  3077. hp1 := tai(p.next);
  3078. asml.remove(p);
  3079. p.free;
  3080. p := hp1;
  3081. Result:=true;
  3082. exit;
  3083. end;
  3084. end;
  3085. {$endif i386}
  3086. if DoSubAddOpt(p) then
  3087. Result:=true;
  3088. end;
  3089. end;
function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
{ Pass-1 handler for SHL/SAL by a small constant (1..3) on a 32/64 bit
  register.  Greedily absorbs following add/sub/inc/dec/lea instructions
  that modify the same register into a single LEA with the corresponding
  scale factor.  On pre-PentiumII 32 bit targets, a lone shl $1..$3 is
  also rewritten to add/lea for pairing reasons. }
  var
    TmpBool1,TmpBool2 : Boolean;
    tmpref : treference;
    hp1,hp2: tai;
  begin
    Result:=false;
    if MatchOpType(taicpu(p),top_const,top_reg) and
       (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
       (taicpu(p).oper[0]^.val <= 3) then
      { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
      begin
        { should we check the next instruction? }
        TmpBool1 := True;
        { have we found an add/sub which could be
          integrated in the lea? }
        TmpBool2 := False;
        { tmpref accumulates the combined address expression;
          the shift becomes the scale factor of the index }
        reference_reset(tmpref,2,[]);
        TmpRef.index := taicpu(p).oper[1]^.reg;
        TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
        { absorb following add/sub/inc/dec/lea on the same register, but
          only while no later instruction reads the flags the absorbed
          instruction would have set }
        while TmpBool1 and
              GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              ((((taicpu(hp1).opcode = A_ADD) or
                 (taicpu(hp1).opcode = A_SUB)) and
                (taicpu(hp1).oper[1]^.typ = Top_Reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
               (((taicpu(hp1).opcode = A_INC) or
                 (taicpu(hp1).opcode = A_DEC)) and
                (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
               ((taicpu(hp1).opcode = A_LEA) and
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
              (not GetNextInstruction(hp1,hp2) or
               not instrReadsFlags(hp2)) Do
          begin
            TmpBool1 := False;
            if taicpu(hp1).opcode=A_LEA then
              begin
                { a lea can be merged when its scale multiplied by the
                  accumulated scale still fits in the maximum of 8 }
                if (TmpRef.base = NR_NO) and
                   (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                   (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                   (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
                   ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                    (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                  begin
                    TmpBool1 := True;
                    TmpBool2 := True;
                    inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                    if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                      tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                    TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                    asml.remove(hp1);
                    hp1.free;
                  end
              end
            else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
              begin
                { add/sub const goes into the displacement }
                TmpBool1 := True;
                TmpBool2 := True;
                case taicpu(hp1).opcode of
                  A_ADD:
                    inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  A_SUB:
                    dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  else
                    internalerror(2019050536);
                end;
                asml.remove(hp1);
                hp1.free;
              end
            else
              { add reg becomes the base register (only if none is set
                yet); inc/dec adjust the displacement by one }
              if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                 (((taicpu(hp1).opcode = A_ADD) and
                   (TmpRef.base = NR_NO)) or
                  (taicpu(hp1).opcode = A_INC) or
                  (taicpu(hp1).opcode = A_DEC)) then
                begin
                  TmpBool1 := True;
                  TmpBool2 := True;
                  case taicpu(hp1).opcode of
                    A_ADD:
                      TmpRef.base := taicpu(hp1).oper[0]^.reg;
                    A_INC:
                      inc(TmpRef.offset);
                    A_DEC:
                      dec(TmpRef.offset);
                    else
                      internalerror(2019050535);
                  end;
                  asml.remove(hp1);
                  hp1.free;
                end;
          end;
        if TmpBool2
{$ifndef x86_64}
           or
           { on old 32 bit CPUs a lone small shift is still worth
             rewriting as lea/add when not optimizing for size }
           ((current_settings.optimizecputype < cpu_Pentium2) and
            (taicpu(p).oper[0]^.val <= 3) and
            not(cs_opt_size in current_settings.optimizerswitches))
{$endif x86_64}
          then
          begin
            if not(TmpBool2) and
               (taicpu(p).oper[0]^.val=1) then
              begin
                { shl $1,%reg with nothing absorbed: "add %reg,%reg" is cheaper }
                hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                  taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
              end
            else
              hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
                taicpu(p).oper[1]^.reg);
            DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end;
      end
{$ifndef x86_64}
    else if (current_settings.optimizecputype < cpu_Pentium2) and
            MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
          but faster on a 486, and pairable in both U and V pipes on the Pentium
          (unlike shl, which is only pairable in the U pipe) }
        if taicpu(p).oper[0]^.val=1 then
          begin
            hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
              taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end
        { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
                  "shl $3, %reg" to "lea (,%reg,8), %reg" }
        else if (taicpu(p).opsize = S_L) and
                (taicpu(p).oper[0]^.val<= 3) then
          begin
            reference_reset(tmpref,2,[]);
            TmpRef.index := taicpu(p).oper[1]^.reg;
            TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
            hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end;
      end
{$endif x86_64}
    ;
  end;
function TX86AsmOptimizer.OptPass1SETcc(var p: tai): boolean;
{ Merges "set(C) %reg; test %reg,%reg / cmp $0,%reg; j(E/NE) label" into a
  single conditional jump on (C) (respectively its inverse), removing the
  TEST/CMP and, when the byte register is dead afterwards, the SETcc too. }
  var
    hp1,hp2,next: tai; SetC, JumpC: TAsmCond; Unconditional: Boolean;
  begin
    Result:=false;
    if MatchOpType(taicpu(p),top_reg) and
       GetNextInstruction(p, hp1) and
       { the next instruction must test the set register for zero, either
         via "test reg,reg" or "cmp $0,reg" }
       ((MatchInstruction(hp1, A_TEST, [S_B]) and
         MatchOpType(taicpu(hp1),top_reg,top_reg) and
         (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg)) or
        (MatchInstruction(hp1, A_CMP, [S_B]) and
         MatchOpType(taicpu(hp1),top_const,top_reg) and
         (taicpu(hp1).oper[0]^.val=0))
       ) and
       (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
       GetNextInstruction(hp1, hp2) and
       MatchInstruction(hp2, A_Jcc, []) then
      { Change from:             To:
          set(C) %reg              j(~C) label
          test %reg,%reg/cmp $0,%reg
          je   label

          set(C) %reg              j(C)  label
          test %reg,%reg/cmp $0,%reg
          jne  label
      }
      begin
        next := tai(p.Next);
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, next);
        UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
        JumpC := taicpu(hp2).condition;
        Unconditional := False;
        if conditions_equal(JumpC, C_E) then
          SetC := inverse_cond(taicpu(p).condition)
        else if conditions_equal(JumpC, C_NE) then
          SetC := taicpu(p).condition
        else
          { We've got something weird here (and inefficient) }
          begin
            DebugMsg('DEBUG: Inefficient jump - check code generation', p);
            SetC := C_NONE;
            { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
            if condition_in(C_AE, JumpC) then
              Unconditional := True
            else
              { Not sure what to do with this jump - drop out }
              Exit;
          end;
        { the TEST/CMP is no longer needed }
        asml.Remove(hp1);
        hp1.Free;
        if Unconditional then
          MakeUnconditional(taicpu(hp2))
        else
          begin
            if SetC = C_NONE then
              InternalError(2018061401);
            taicpu(hp2).SetCondition(SetC);
          end;
        { remove the SETcc as well when its register dies before the jump }
        if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then
          begin
            asml.Remove(p);
            UpdateUsedRegs(next);
            p.Free;
            Result := True;
            p := hp2;
          end;
        DebugMsg(SPeepholeOptimization + 'SETcc/TESTCmp/Jcc -> Jcc',p);
      end;
  end;
function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
{ Pass-1 handler for FSTP/FISTP.  Removes a redundant store/reload pair
  "fstp f; fld f" (or "fistp f; fild f") to the same memory location when
  the pair is immediately followed by exit code, the location is a local
  on the frame pointer that is not the function result, and the size is
  extended (S_FX) so no rounding occurs. }
{ returns true if a "continue" should be done after this optimization }
  var
    hp1, hp2: tai;
  begin
    Result := false;
    if MatchOpType(taicpu(p),top_ref) and
       GetNextInstruction(p, hp1) and
       (hp1.typ = ait_instruction) and
       { only the matching store/load pairs qualify }
       (((taicpu(hp1).opcode = A_FLD) and
         (taicpu(p).opcode = A_FSTP)) or
        ((taicpu(p).opcode = A_FISTP) and
         (taicpu(hp1).opcode = A_FILD))) and
       MatchOpType(taicpu(hp1),top_ref) and
       (taicpu(hp1).opsize = taicpu(p).opsize) and
       RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
      begin
        { replacing fstp f;fld f by fst f is only valid for extended because of rounding }
        if (taicpu(p).opsize=S_FX) and
           GetNextInstruction(hp1, hp2) and
           (hp2.typ = ait_instruction) and
           IsExitCode(hp2) and
           (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
           { the slot must not overlap the function result variable }
           not(assigned(current_procinfo.procdef.funcretsym) and
               (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
           (taicpu(p).oper[0]^.ref^.index = NR_NO) then
          begin
            { drop both instructions; the value stays on the FPU stack }
            asml.remove(p);
            asml.remove(hp1);
            p.free;
            hp1.free;
            p := hp2;
            RemoveLastDeallocForFuncRes(p);
            Result := true;
          end
        (* can't be done because the store operation rounds
        else
          { fst can't store an extended value! }
          if (taicpu(p).opsize <> S_FX) and
             (taicpu(p).opsize <> S_IQ) then
            begin
              if (taicpu(p).opcode = A_FSTP) then
                taicpu(p).opcode := A_FST
              else taicpu(p).opcode := A_FIST;
              asml.remove(hp1);
              hp1.free;
            end
        *)
      end;
  end;
function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
{ Pass-1 handler for FLD.  Folds the loaded operand into a following
  popping FPU arithmetic instruction (fxxxp st,st1), turning the pair
  into a single non-popping operation.  Non-commutative operations are
  replaced by their reversed counterparts because the operand order on
  the FPU stack changes.  Also recognizes the pattern where the same
  memory location was just stored/loaded, reusing the value already on
  the stack instead of reloading it. }
  var
    hp1, hp2: tai;
  begin
    result:=false;
    if MatchOpType(taicpu(p),top_reg) and
       GetNextInstruction(p, hp1) and
       (hp1.typ = Ait_Instruction) and
       MatchOpType(taicpu(hp1),top_reg,top_reg) and
       (taicpu(hp1).oper[0]^.reg = NR_ST) and
       (taicpu(hp1).oper[1]^.reg = NR_ST1) then
      { change to
          fld      reg               fxxx reg,st
          fxxxp    st, st1 (hp1)
        Remark: non commutative operations must be reversed!
      }
      begin
        case taicpu(hp1).opcode Of
          A_FMULP,A_FADDP,
          A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
            begin
              { sub/div swap their operand roles, hence the R <-> non-R
                exchange below }
              case taicpu(hp1).opcode Of
                A_FADDP: taicpu(hp1).opcode := A_FADD;
                A_FMULP: taicpu(hp1).opcode := A_FMUL;
                A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
                A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
                A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
                A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
                else
                  internalerror(2019050534);
              end;
              taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
              taicpu(hp1).oper[1]^.reg := NR_ST;
              asml.remove(p);
              p.free;
              p := hp1;
              Result:=true;
              exit;
            end;
          else
            ;
        end;
      end
    else
      if MatchOpType(taicpu(p),top_ref) and
         GetNextInstruction(p, hp2) and
         (hp2.typ = Ait_Instruction) and
         MatchOpType(taicpu(hp2),top_reg,top_reg) and
         (taicpu(p).opsize in [S_FS, S_FL]) and
         (taicpu(hp2).oper[0]^.reg = NR_ST) and
         (taicpu(hp2).oper[1]^.reg = NR_ST1) then
        { the previous instruction stored or loaded the very same memory
          location, so the value is already in st0 }
        if GetLastInstruction(p, hp1) and
           MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
           MatchOpType(taicpu(hp1),top_ref) and
           RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
          if ((taicpu(hp2).opcode = A_FMULP) or
              (taicpu(hp2).opcode = A_FADDP)) then
            { change to
                fld/fst   mem1  (hp1)      fld/fst   mem1
                fld       mem1  (p)        fadd/
                faddp/                     fmul     st, st
                fmulp  st, st1 (hp2) }
            begin
              asml.remove(p);
              p.free;
              p := hp1;
              if (taicpu(hp2).opcode = A_FADDP) then
                taicpu(hp2).opcode := A_FADD
              else
                taicpu(hp2).opcode := A_FMUL;
              taicpu(hp2).oper[1]^.reg := NR_ST;
            end
          else
            { change to
                fld/fst  mem1 (hp1)   fld/fst  mem1
                fld      mem1 (p)     fld      st}
            begin
              taicpu(p).changeopsize(S_FL);
              taicpu(p).loadreg(0,NR_ST);
            end
        else
          begin
            case taicpu(hp2).opcode Of
              A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
                { change to
                    fld/fst mem1 (hp1)      fld/fst mem1
                    fld     mem2 (p)        fxxx    mem2
                    fxxxp   st, st1 (hp2) }
                begin
                  { again: non commutative operations are reversed }
                  case taicpu(hp2).opcode Of
                    A_FADDP: taicpu(p).opcode := A_FADD;
                    A_FMULP: taicpu(p).opcode := A_FMUL;
                    A_FSUBP: taicpu(p).opcode := A_FSUBR;
                    A_FSUBRP: taicpu(p).opcode := A_FSUB;
                    A_FDIVP: taicpu(p).opcode := A_FDIVR;
                    A_FDIVRP: taicpu(p).opcode := A_FDIV;
                    else
                      internalerror(2019050533);
                  end;
                  asml.remove(hp2);
                  hp2.free;
                end
              else
                ;
            end
          end
  end;
function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
{ Pass-1 handler for CMP with a constant first operand.  Applies:
  * "cmp $0,%reg" -> "test %reg,%reg", simplifying or removing every
    directly following Jcc/SETcc whose condition becomes trivially
    true or false for a zero comparison;
  * "cmp $1,r/m; jl" -> "cmp $0,r/m; jle" (and the TEST conversion when
    the operand is a register);
  * "cmp $MIN_SIGNED,%reg; je/jne" -> "neg %reg; jo/jno" when the
    register is deallocated before the jump. }
  var
    v: TCGInt;
    hp1, hp2: tai;
  begin
    Result:=false;
    if taicpu(p).oper[0]^.typ = top_const then
      begin
        { Though GetNextInstruction can be factored out, it is an expensive
          call, so delay calling it until we have first checked cheaper
          conditions that are independent of it. }
        if (taicpu(p).oper[0]^.val = 0) and
           (taicpu(p).oper[1]^.typ = top_reg) and
           GetNextInstruction(p, hp1) and
           MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
          begin
            hp2 := p;
            { When dealing with "cmp $0,%reg", only ZF and SF contain
              anything meaningful once it's converted to "test %reg,%reg";
              additionally, some jumps will always (or never) branch, so
              evaluate every jump immediately following the
              comparison, optimising the conditions if possible.
              Similarly with SETcc... those that are always set to 0 or 1
              are changed to MOV instructions }
            while GetNextInstruction(hp2, hp1) and
                  MatchInstruction(hp1,A_Jcc,A_SETcc,[]) do
              begin
                case taicpu(hp1).condition of
                  C_B, C_C, C_NAE, C_O:
                    { For B/NAE:
                        Will never branch since an unsigned integer can never be below zero
                      For C/O:
                        Result cannot overflow because 0 is being subtracted
                    }
                    begin
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                          { drop the label reference before deleting the jump }
                          TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                          AsmL.Remove(hp1);
                          hp1.Free;
                          { Since hp1 was deleted, hp2 must not be updated }
                          Continue;
                        end
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                          { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).allocate_oper(2);
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 0);
                        end;
                    end;
                  C_BE, C_NA:
                    begin
                      { Will only branch if equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                      taicpu(hp1).condition := C_E;
                    end;
                  C_A, C_NBE:
                    begin
                      { Will only branch if not equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                      taicpu(hp1).condition := C_NE;
                    end;
                  C_AE, C_NB, C_NC, C_NO:
                    begin
                      { Will always branch }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          MakeUnconditional(taicpu(hp1));
                          { Any jumps/set that follow will now be dead code }
                          RemoveDeadCodeAfterJump(taicpu(hp1));
                          Break;
                        end
                      else
                        begin
                          { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).allocate_oper(2);
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 1);
                        end;
                    end;
                  C_None:
                    InternalError(2020012201);
                  C_P, C_PE, C_NP, C_PO:
                    { We can't handle parity checks and they should never be generated
                      after a general-purpose CMP (it's used in some floating-point
                      comparisons that don't use CMP) }
                    InternalError(2020012202);
                  else
                    { Zero/Equality, Sign, their complements and all of the
                      signed comparisons do not need to be converted };
                end;
                hp2 := hp1;
              end;
            { Convert the instruction to a TEST }
            taicpu(p).opcode := A_TEST;
            taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
            Result := True;
            Exit;
          end
        else if (taicpu(p).oper[0]^.val = 1) and
                GetNextInstruction(p, hp1) and
                MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
                (taicpu(hp1).condition in [C_L, C_NGE]) then
          begin
            { Convert;       To:
                cmp $1,r/m    cmp $0,r/m
                jl  @lbl      jle @lbl
            }
            DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
            taicpu(p).oper[0]^.val := 0;
            taicpu(hp1).condition := C_LE;
            { If the instruction is now "cmp $0,%reg", convert it to a
              TEST (and effectively do the work of the "cmp $0,%reg" in
              the block above)

              If it's a reference, we can get away with not setting
              Result to True because we haven't evaluated the jump
              in this pass yet.
            }
            if (taicpu(p).oper[1]^.typ = top_reg) then
              begin
                taicpu(p).opcode := A_TEST;
                taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
            Exit;
          end
        else if (taicpu(p).oper[1]^.typ = top_reg) then
          begin
            { cmp register,$8000                neg register
              je target                 -->    jo target

              .... only if register is deallocated before jump.}
            case Taicpu(p).opsize of
              S_B: v:=$80;
              S_W: v:=$8000;
              S_L: v:=qword($80000000);
              { S_Q will never happen: cmp with 64 bit constants is not possible }
              S_Q:
                Exit;
              else
                internalerror(2013112905);
            end;
            if (taicpu(p).oper[0]^.val=v) and
               GetNextInstruction(p, hp1) and
               MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
               (Taicpu(hp1).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                  begin
                    { negating MIN_SIGNED is the only case that sets OF,
                      so je/jne on equality with MIN_SIGNED maps to jo/jno }
                    DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                    Taicpu(p).opcode:=A_NEG;
                    Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                    Taicpu(p).clearop(1);
                    Taicpu(p).ops:=1;
                    if Taicpu(hp1).condition=C_E then
                      Taicpu(hp1).condition:=C_O
                    else
                      Taicpu(hp1).condition:=C_NO;
                    Result:=true;
                    exit;
                  end;
              end;
          end;
      end;
  end;
  { Pass-2 peephole optimisations with a MOV at p.  Folds the MOV with the
    following instruction(s):
      - MOV + JMP           -> re-runs jump optimisation / pass-1 MOV merging
      - MOV + ADD/SUB const -> LEA
      - MOV + MOVZX/MOVSX   -> MOVZX/MOVSX reading the original source register
      - MOV/MOV/MOV cycle   -> XCHG (when profitable, see IsXCHGAcceptable)
      - MOV + SAR $31/$63   -> CDQ/CQO sign-extension idioms
      - MOV(ref)/arith/MOV(ref) -> arith directly on the memory operand
      - (x86_64) movl/movl/addq|leaq/shrq $1 -> movl/addl/rcrl $1
    Returns True if p was changed (p may now point at a different instruction). }
  function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

    { Decides whether the 3-MOV -> XCHG transformation is worthwhile on the
      current target CPU (always worthwhile when optimising for size). }
    function IsXCHGAcceptable: Boolean; inline;
      begin
        { Always accept if optimising for size }
        Result := (cs_opt_size in current_settings.optimizerswitches) or
          (
{$ifdef x86_64}
            { XCHG takes 3 cycles on AMD Athlon64 }
            (current_settings.optimizecputype >= cpu_core_i)
{$else x86_64}
            { From the Pentium M onwards, XCHG only has a latency of 2 rather
              than 3, so it becomes a saving compared to three MOVs with two of
              them able to execute simultaneously. [Kit] }
            (current_settings.optimizecputype >= cpu_PentiumM)
{$endif x86_64}
          );
      end;

    var
      NewRef: TReference;
      hp1,hp2,hp3: tai;
{$ifndef x86_64}
      { hp4/OperIdx are only needed by the 32-bit-only abs() intrinsic folding }
      hp4: tai;
      OperIdx: Integer;
{$endif x86_64}
    begin
      Result:=false;
      if not GetNextInstruction(p, hp1) then
        Exit;
      if MatchInstruction(hp1, A_JMP, [S_NO]) then
        begin
          { Sometimes the MOVs that OptPass2JMP produces can be improved
            further, but we can't just put this jump optimisation in pass 1
            because it tends to perform worse when conditional jumps are
            nearby (e.g. when converting CMOV instructions). [Kit] }
          if OptPass2JMP(hp1) then
            { call OptPass1MOV once to potentially merge any MOVs that were created }
            Result := OptPass1MOV(p)
            { OptPass2MOV will now exit but will be called again if OptPass1MOV
              returned True and the instruction is still a MOV, thus checking
              the optimisations below }
          { If OptPass2JMP returned False, no optimisations were done to
            the jump and there are no further optimisations that can be done
            to the MOV instruction on this pass }
        end
      else if MatchOpType(taicpu(p),top_reg,top_reg) and
        (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
        MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp1),top_const,top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
        { be lazy, checking separately for sub would be slightly better }
        (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
        begin
          { Change:
              movl/q %reg1,%reg2      movl/q %reg1,%reg2
              addl/q $x,%reg2         subl/q $x,%reg2
            To:
              leal/q x(%reg1),%reg2   leal/q -x(%reg1),%reg2
          }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          { LEA does not set flags, while ADD/SUB do, so the transformation
            is only valid if no following instruction reads the flags }
          if not GetNextInstruction(hp1, hp2) or
            (
              { The FLAGS register isn't always tracked properly, so do not
                perform this optimisation if a conditional statement follows }
              not RegReadByInstruction(NR_DEFAULTFLAGS, hp2) and
              not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)
            ) then
            begin
              reference_reset(NewRef, 1, []);
              NewRef.base := taicpu(p).oper[0]^.reg;
              NewRef.scalefactor := 1;
              if taicpu(hp1).opcode = A_ADD then
                begin
                  DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
                  NewRef.offset := taicpu(hp1).oper[0]^.val;
                end
              else
                begin
                  DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
                  NewRef.offset := -taicpu(hp1).oper[0]^.val;
                end;
              taicpu(p).opcode := A_LEA;
              taicpu(p).loadref(0, NewRef);
              Asml.Remove(hp1);
              hp1.Free;
              Result := True;
              Exit;
            end;
        end
      else if MatchOpType(taicpu(p),top_reg,top_reg) and
{$ifdef x86_64}
        MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
{$else x86_64}
        MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
{$endif x86_64}
        MatchOpType(taicpu(hp1),top_reg,top_reg) and
        (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
        { mov reg1, reg2                mov reg1, reg2
          movzx/sx reg2, reg3     to    movzx/sx reg1, reg3 }
        begin
          taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
          DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
          { Don't remove the MOV command without first checking that reg2 isn't used afterwards,
            or unless supreg(reg3) = supreg(reg2)). [Kit] }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
          if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
             not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
          then
            begin
              asml.remove(p);
              p.free;
              p := hp1;
              Result:=true;
            end;
          exit;
        end
      else if MatchOpType(taicpu(p),top_reg,top_reg) and
        IsXCHGAcceptable and
        { XCHG doesn't support 8-byte registers }
        (taicpu(p).opsize <> S_B) and
        MatchInstruction(hp1, A_MOV, []) and
        MatchOpType(taicpu(hp1),top_reg,top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_MOV, []) and
        { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
        MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
        MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
        begin
          { mov %reg1,%reg2
            mov %reg3,%reg1     ->      xchg %reg3,%reg1
            mov %reg2,%reg3
            (%reg2 not used afterwards)
            Note that xchg takes 3 cycles to execute, and generally mov's take
            only one cycle apiece, but the first two mov's can be executed in
            parallel, only taking 2 cycles overall.  Older processors should
            therefore only optimise for size. [Kit]
          }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
            begin
              DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
              AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
              taicpu(hp1).opcode := A_XCHG;
              asml.Remove(p);
              asml.Remove(hp2);
              p.Free;
              hp2.Free;
              p := hp1;
              Result := True;
              Exit;
            end;
        end
      else if MatchOpType(taicpu(p),top_reg,top_reg) and
        MatchInstruction(hp1, A_SAR, []) then
        { MOV followed by an arithmetic right shift by 31/63: these are
          sign-extension idioms that can become CDQ/CQO }
        begin
          if MatchOperand(taicpu(hp1).oper[0]^, 31) then
            begin
              { the use of %edx also covers the opsize being S_L }
              if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
                begin
                  { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
                  if (taicpu(p).oper[0]^.reg = NR_EAX) and
                    (taicpu(p).oper[1]^.reg = NR_EDX) then
                    begin
                      { Change:
                          movl %eax,%edx
                          sarl $31,%edx
                        To:
                          cltd
                      }
                      DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
                      Asml.Remove(hp1);
                      hp1.Free;
                      taicpu(p).opcode := A_CDQ;
                      taicpu(p).opsize := S_NO;
                      taicpu(p).clearop(1);
                      taicpu(p).clearop(0);
                      taicpu(p).ops:=0;
                      Result := True;
                    end
                  else if (cs_opt_size in current_settings.optimizerswitches) and
                    (taicpu(p).oper[0]^.reg = NR_EDX) and
                    (taicpu(p).oper[1]^.reg = NR_EAX) then
                    begin
                      { Change:
                          movl %edx,%eax
                          sarl $31,%edx
                        To:
                          movl %edx,%eax
                          cltd
                        Note that this creates a dependency between the two instructions,
                        so only perform if optimising for size.
                      }
                      DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
                      taicpu(hp1).opcode := A_CDQ;
                      taicpu(hp1).opsize := S_NO;
                      taicpu(hp1).clearop(1);
                      taicpu(hp1).clearop(0);
                      taicpu(hp1).ops:=0;
                    end;
{$ifndef x86_64}
                end
              { Don't bother if CMOV is supported, because a more optimal
                sequence would have been generated for the Abs() intrinsic }
              else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
                { the use of %eax also covers the opsize being S_L }
                MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
                (taicpu(p).oper[0]^.reg = NR_EAX) and
                (taicpu(p).oper[1]^.reg = NR_EDX) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_XOR, [S_L]) and
                MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
                MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
                GetNextInstruction(hp2, hp3) and
                MatchInstruction(hp3, A_SUB, [S_L]) and
                MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
                MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
                begin
                  { Change:
                      movl %eax,%edx
                      sarl $31,%eax
                      xorl %eax,%edx
                      subl %eax,%edx
                      (Instruction that uses %edx)
                      (%eax deallocated)
                      (%edx deallocated)
                    To:
                      cltd
                      xorl %edx,%eax  <-- Note the registers have swapped
                      subl %edx,%eax
                      (Instruction that uses %eax)  <-- %eax rather than %edx
                  }
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                  if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
                    begin
                      if GetNextInstruction(hp3, hp4) and
                        not RegModifiedByInstruction(NR_EDX, hp4) and
                        not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
                          taicpu(p).opcode := A_CDQ;
                          taicpu(p).clearop(1);
                          taicpu(p).clearop(0);
                          taicpu(p).ops:=0;
                          AsmL.Remove(hp1);
                          hp1.Free;
                          taicpu(hp2).loadreg(0, NR_EDX);
                          taicpu(hp2).loadreg(1, NR_EAX);
                          taicpu(hp3).loadreg(0, NR_EDX);
                          taicpu(hp3).loadreg(1, NR_EAX);
                          AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
                          { Convert references in the following instruction (hp4) from %edx to %eax }
                          for OperIdx := 0 to taicpu(hp4).ops - 1 do
                            with taicpu(hp4).oper[OperIdx]^ do
                              case typ of
                                top_reg:
                                  if reg = NR_EDX then
                                    reg := NR_EAX;
                                top_ref:
                                  begin
                                    if ref^.base = NR_EDX then
                                      ref^.base := NR_EAX;
                                    if ref^.index = NR_EDX then
                                      ref^.index := NR_EAX;
                                  end;
                                else
                                  ;
                              end;
                        end;
                    end;
{$else x86_64}
                end;
            end
          else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
            { the use of %rdx also covers the opsize being S_Q }
            MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
            begin
              { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
              if (taicpu(p).oper[0]^.reg = NR_RAX) and
                (taicpu(p).oper[1]^.reg = NR_RDX) then
                begin
                  { Change:
                      movq %rax,%rdx
                      sarq $63,%rdx
                    To:
                      cqto
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
                  Asml.Remove(hp1);
                  hp1.Free;
                  taicpu(p).opcode := A_CQO;
                  taicpu(p).opsize := S_NO;
                  taicpu(p).clearop(1);
                  taicpu(p).clearop(0);
                  taicpu(p).ops:=0;
                  Result := True;
                end
              else if (cs_opt_size in current_settings.optimizerswitches) and
                (taicpu(p).oper[0]^.reg = NR_RDX) and
                (taicpu(p).oper[1]^.reg = NR_RAX) then
                begin
                  { Change:
                      movq %rdx,%rax
                      sarq $63,%rdx
                    To:
                      movq %rdx,%rax
                      cqto
                    Note that this creates a dependency between the two instructions,
                    so only perform if optimising for size.
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
                  taicpu(hp1).opcode := A_CQO;
                  taicpu(hp1).opsize := S_NO;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
{$endif x86_64}
                end;
            end;
        end
      else if MatchInstruction(hp1, A_MOV, []) and
        (taicpu(hp1).oper[1]^.typ = top_reg) then
        { Though "GetNextInstruction" could be factored out, along with
          the instructions that depend on hp2, it is an expensive call that
          should be delayed for as long as possible, hence we do cheaper
          checks first that are likely to be False. [Kit] }
        begin
          if MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
            (
              (
                (taicpu(hp1).oper[1]^.reg = NR_EAX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
                )
              ) or
              (
                (taicpu(hp1).oper[1]^.reg = NR_EDX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
                )
              )
            ) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_SAR, []) and
            MatchOperand(taicpu(hp2).oper[0]^, 31) then
            begin
              if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
                begin
                  { Change:
                      movl r/m,%edx         movl r/m,%eax         movl r/m,%edx         movl r/m,%eax
                      movl %edx,%eax   or   movl %eax,%edx   or   movl r/m,%eax   or    movl r/m,%edx
                      sarl $31,%edx         sarl $31,%edx         sarl $31,%edx         sarl $31,%edx
                    To:
                      movl r/m,%eax   <- Note the change in register
                      cltd
                  }
                  DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
                  AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
                  taicpu(p).loadreg(1, NR_EAX);
                  taicpu(hp1).opcode := A_CDQ;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                  AsmL.Remove(hp2);
                  hp2.Free;
(*
{$ifdef x86_64}
                end
              else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
                { This code sequence does not get generated - however it might become useful
                  if and when 128-bit signed integer types make an appearance, so the code
                  is kept here for when it is eventually needed. [Kit] }
                (
                  (
                    (taicpu(hp1).oper[1]^.reg = NR_RAX) and
                    (
                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                      MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
                    )
                  ) or
                  (
                    (taicpu(hp1).oper[1]^.reg = NR_RDX) and
                    (
                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                      MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
                    )
                  )
                ) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_SAR, [S_Q]) and
                MatchOperand(taicpu(hp2).oper[0]^, 63) and
                MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
                begin
                  { Change:
                      movq r/m,%rdx         movq r/m,%rax         movq r/m,%rdx         movq r/m,%rax
                      movq %rdx,%rax   or   movq %rax,%rdx   or   movq r/m,%rax   or    movq r/m,%rdx
                      sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx
                    To:
                      movq r/m,%rax   <- Note the change in register
                      cqto
                  }
                  DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
                  AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
                  taicpu(p).loadreg(1, NR_RAX);
                  taicpu(hp1).opcode := A_CQO;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                  AsmL.Remove(hp2);
                  hp2.Free;
{$endif x86_64}
*)
                end;
            end;
        end
      else if (taicpu(p).oper[0]^.typ = top_ref) and
        (hp1.typ = ait_instruction) and
        { while the GetNextInstruction(hp1,hp2) call could be factored out,
          doing it separately in both branches allows to do the cheap checks
          with low probability earlier }
        ((IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_MOV,[])
         ) or
         ((taicpu(hp1).opcode=A_LEA) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_MOV,[]) and
          ((MatchReference(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
            (taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg)
           ) or
           (MatchReference(taicpu(hp1).oper[0]^.ref^,NR_INVALID,
            taicpu(p).oper[1]^.reg) and
            (taicpu(hp1).oper[0]^.ref^.base<>taicpu(p).oper[1]^.reg)) or
           (MatchReferenceWithOffset(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_NO)) or
           (MatchReferenceWithOffset(taicpu(hp1).oper[0]^.ref^,NR_NO,taicpu(p).oper[1]^.reg))
          ) and
          ((MatchOperand(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^)) or not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)))
         )
        ) and
        MatchOperand(taicpu(hp1).oper[taicpu(hp1).ops-1]^,taicpu(hp2).oper[0]^) and
        (taicpu(hp2).oper[1]^.typ = top_ref) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs,tai(p.next));
          UpdateUsedRegs(TmpUsedRegs,tai(hp1.next));
          if (RefsEqual(taicpu(hp2).oper[1]^.ref^,taicpu(p).oper[0]^.ref^) and
            not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,TmpUsedRegs))) then
            { change   mov            (ref), reg
                       add/sub/or/... reg2/$const, reg
                       mov            reg, (ref)
                       # release reg
              to       add/sub/or/... reg2/$const, (ref)    }
            begin
              case taicpu(hp1).opcode of
                A_INC,A_DEC,A_NOT,A_NEG :
                  taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
                A_LEA :
                  begin
                    { Rewrite the LEA as an ADD of whichever component (index,
                      base or offset) is not the register being folded away }
                    taicpu(hp1).opcode:=A_ADD;
                    if (taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg) and (taicpu(hp1).oper[0]^.ref^.index<>NR_NO) then
                      taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.index)
                    else if (taicpu(hp1).oper[0]^.ref^.base<>taicpu(p).oper[1]^.reg) and (taicpu(hp1).oper[0]^.ref^.base<>NR_NO) then
                      taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.base)
                    else
                      taicpu(hp1).loadconst(0,taicpu(hp1).oper[0]^.ref^.offset);
                    taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
                    DebugMsg(SPeepholeOptimization + 'FoldLea done',hp1);
                  end
                else
                  taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
              end;
              asml.remove(p);
              asml.remove(hp2);
              p.free;
              hp2.free;
              p := hp1
            end;
          Exit;
{$ifdef x86_64}
        end
      else if (taicpu(p).opsize = S_L) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        (
          MatchInstruction(hp1, A_MOV,[]) and
          (taicpu(hp1).opsize = S_L) and
          (taicpu(hp1).oper[1]^.typ = top_reg)
        ) and (
          GetNextInstruction(hp1, hp2) and
          (tai(hp2).typ=ait_instruction) and
          (taicpu(hp2).opsize = S_Q) and
          (
            (
              MatchInstruction(hp2, A_ADD,[]) and
              (taicpu(hp2).opsize = S_Q) and
              (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
              (
                (
                  (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              )
            ) or (
              MatchInstruction(hp2, A_LEA,[]) and
              (taicpu(hp2).oper[0]^.ref^.offset = 0) and
              (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
              (
                (
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              ) and (
                (
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              )
            )
          )
        ) and (
          GetNextInstruction(hp2, hp3) and
          MatchInstruction(hp3, A_SHR,[]) and
          (taicpu(hp3).opsize = S_Q) and
          (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
          (taicpu(hp3).oper[0]^.val = 1) and
          (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
        ) then
        begin
          { Change   movl x,    reg1d         movl x,    reg1d
                     movl y,    reg2d         movl y,    reg2d
                     addq reg2q,reg1q   or    leaq (reg1q,reg2q),reg1q
                     shrq $1,   reg1q         shrq $1,   reg1q
            ( reg1d and reg2d can be switched around in the first two instructions )
            To       movl x,    reg1d
                     addl y,    reg1d
                     rcrl $1,   reg1d
            This corresponds to the common expression (x + y) shr 1, where
            x and y are Cardinals (replacing "shr 1" with "div 2" produces
            smaller code, but won't account for x + y causing an overflow). [Kit]
          }
          if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
            { Change first MOV command to have the same register as the final output }
            taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
          else
            taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
          { Change second MOV command to an ADD command. This is easier than
            converting the existing command because it means we don't have to
            touch 'y', which might be a complicated reference, and also the
            fact that the third command might either be ADD or LEA. [Kit] }
          taicpu(hp1).opcode := A_ADD;
          { Delete old ADD/LEA instruction }
          asml.remove(hp2);
          hp2.free;
          { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
          taicpu(hp3).opcode := A_RCR;
          taicpu(hp3).changeopsize(S_L);
          setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{$endif x86_64}
        end;
    end;
  { Pass-2 optimisation for IMUL: folds a directly preceding "mov reg1,reg2"
    into a three-operand "imul y,reg1,reg2", removing the MOV, when reg2
    either is not live after the IMUL or is already the explicit destination.
    Returns True if the MOV was folded away. }
  function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
    var
      hp1 : tai;
    begin
      Result:=false;
      { Applies to "imul $const,reg" or "imul sym,reg" (2 operands), or the
        3-operand form where source and destination registers coincide }
      if (taicpu(p).ops >= 2) and
         ((taicpu(p).oper[0]^.typ = top_const) or
          ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
         (taicpu(p).oper[1]^.typ = top_reg) and
         ((taicpu(p).ops = 2) or
          ((taicpu(p).oper[2]^.typ = top_reg) and
           (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
         GetLastInstruction(p,hp1) and
         MatchInstruction(hp1,A_MOV,[]) and
         MatchOpType(taicpu(hp1),top_reg,top_reg) and
         (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
            ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
            { change
                mov reg1,reg2
                imul y,reg2 to imul y,reg1,reg2 }
            begin
              { Order matters: oper[1] must be copied to oper[2] before
                oper[1] is overwritten with the MOV's source register }
              taicpu(p).ops := 3;
              taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
              taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
              DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
              asml.remove(hp1);
              hp1.free;
              result:=true;
            end;
        end;
    end;
  { Rewrites the direct jump p in place as a copy of the RET instruction
    ret_p (the instruction found at the jump's destination label), and
    strips any dead code that this leaves behind. }
  procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
    var
      TargetLabel: TAsmLabel;
    begin
      TargetLabel:=tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
      { The rewritten instruction no longer references its destination label }
      TargetLabel.decrefs;
      taicpu(p).opcode:=A_RET;
      taicpu(p).is_jmp:=false;
      { Mirror the operand layout of the original RET: either no operand or
        a single stack-adjustment constant }
      taicpu(p).ops:=taicpu(ret_p).ops;
      if taicpu(ret_p).ops=0 then
        taicpu(p).clearop(0)
      else if taicpu(ret_p).ops=1 then
        taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val)
      else
        internalerror(2016041301);
      { If the original label is now dead, it might turn out that the label
        immediately follows p.  As a result, everything beyond it, which will
        be just some final register configuration and a RET instruction, is
        now dead code. [Kit] }
      { NOTE: This is much faster than introducing a OptPass2RET routine and
        running RemoveDeadCodeAfterJump for each RET instruction, because
        this optimisation rarely happens and most RETs appear at the end of
        routines where there is nothing that can be stripped. [Kit] }
      if not TargetLabel.is_used then
        RemoveDeadCodeAfterJump(p);
    end;
  { Pass-2 optimisation for unconditional JMP to a local label: if the label
    is (after skipping intervening labels) a RET, or a MOV immediately
    followed by a RET, replace the jump by (a copy of the MOV and) the RET.
    Returns True if p was changed. }
  function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
    var
      hp1, hp2, hp3: tai;
      OperIdx: Integer;
    begin
      result:=false;
      { Only plain direct jumps to a local assembler label (no base/index
        register in the reference) qualify }
      if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
        (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
        begin
          hp1:=getlabelwithsym(tasmlabel(taicpu(p).oper[0]^.ref^.symbol));
          if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
            begin
              case taicpu(hp1).opcode of
                A_RET:
                  {
                    change
                           jmp .L1
                           ...
                       .L1:
                           ret
                    into
                           ret
                  }
                  begin
                    ConvertJumpToRET(p, hp1);
                    result:=true;
                  end;
                A_MOV:
                  {
                    change
                           jmp .L1
                           ...
                       .L1:
                           mov ##, ##
                           ret
                    into
                           mov ##, ##
                           ret
                  }
                  { This optimisation tends to increase code size if the pass 1 MOV optimisations aren't
                    re-run, so only do this particular optimisation if optimising for speed or when
                    optimisations are very in-depth. [Kit] }
                  if (current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size] then
                    begin
                      GetNextInstruction(hp1, hp2);
                      if not Assigned(hp2) then
                        Exit;
                      if (hp2.typ in [ait_label, ait_align]) then
                        SkipLabels(hp2,hp2);
                      if Assigned(hp2) and MatchInstruction(hp2, A_RET, [S_NO]) then
                        begin
                          { Duplicate the MOV instruction }
                          hp3:=tai(hp1.getcopy);
                          asml.InsertBefore(hp3, p);
                          { Make sure the compiler knows about any final registers written here }
                          for OperIdx := 0 to 1 do
                            with taicpu(hp3).oper[OperIdx]^ do
                              begin
                                case typ of
                                  top_ref:
                                    begin
                                      if (ref^.base <> NR_NO) {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} then
                                        AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);
                                      if (ref^.index <> NR_NO) {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} then
                                        AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                                    end;
                                  top_reg:
                                    AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                                  else
                                    ;
                                end;
                              end;
                          { Now change the jump into a RET instruction }
                          ConvertJumpToRET(p, hp2);
                          result:=true;
                        end;
                    end;
                else
                  ;
              end;
            end;
        end;
    end;
  { Returns True if the instruction p may be converted into a CMOVcc:
    it must be a 16/32/64-bit MOV whose destination is a register and whose
    source is either a register or a reference that is safe to dereference
    unconditionally. }
  class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
    begin
      Result:=False;
      if not assigned(p) or
         not MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) or
         (taicpu(p).oper[1]^.typ <> top_reg) then
        exit;
      case taicpu(p).oper[0]^.typ of
        top_reg:
          { register-to-register moves are always acceptable }
          Result:=True;
        top_ref:
          { we can't use cmov ref,reg for an arbitrary reference because
            ref could be nil and cmov still throws an exception
            if ref=nil but the mov isn't done (FK).
            Only allow references known to be safe to dereference, e.g.
            pure symbols or got-relative addressing with RIP as base -
            it is not expected that these can cause a seg. violation }
          Result:=IsRefSafe(taicpu(p).oper[0]^.ref);
        else
          Result:=False;
      end;
    end;
  4385. function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  4386. var
  4387. hp1,hp2,hp3,hp4,hpmov2: tai;
  4388. carryadd_opcode : TAsmOp;
  4389. l : Longint;
  4390. condition : TAsmCond;
  4391. symbol: TAsmSymbol;
  4392. begin
  4393. result:=false;
  4394. symbol:=nil;
  4395. if GetNextInstruction(p,hp1) then
  4396. begin
  4397. symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
  4398. if (hp1.typ=ait_instruction) and
  4399. GetNextInstruction(hp1,hp2) and
  4400. ((hp2.typ=ait_label) or
  4401. { trick to skip align }
  4402. ((hp2.typ=ait_align) and GetNextInstruction(hp2,hp2) and (hp2.typ=ait_label))
  4403. ) and
  4404. (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
  4405. { jb @@1 cmc
  4406. inc/dec operand --> adc/sbb operand,0
  4407. @@1:
  4408. ... and ...
  4409. jnb @@1
  4410. inc/dec operand --> adc/sbb operand,0
  4411. @@1: }
  4412. begin
  4413. carryadd_opcode:=A_NONE;
  4414. if Taicpu(p).condition in [C_NAE,C_B] then
  4415. begin
  4416. if (Taicpu(hp1).opcode=A_INC) or
  4417. ((Taicpu(hp1).opcode=A_ADD) and
  4418. MatchOptype(Taicpu(hp1),top_const,top_reg) and
  4419. (Taicpu(hp1).oper[0]^.val=1)
  4420. ) then
  4421. carryadd_opcode:=A_ADC;
  4422. if (Taicpu(hp1).opcode=A_DEC) or
  4423. ((Taicpu(hp1).opcode=A_SUB) and
  4424. MatchOptype(Taicpu(hp1),top_const,top_reg) and
  4425. (Taicpu(hp1).oper[0]^.val=1)
  4426. ) then
  4427. carryadd_opcode:=A_SBB;
  4428. if carryadd_opcode<>A_NONE then
  4429. begin
  4430. Taicpu(p).clearop(0);
  4431. Taicpu(p).ops:=0;
  4432. Taicpu(p).is_jmp:=false;
  4433. Taicpu(p).opcode:=A_CMC;
  4434. Taicpu(p).condition:=C_NONE;
  4435. DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
  4436. Taicpu(hp1).ops:=2;
  4437. if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
  4438. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
  4439. else
  4440. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
  4441. Taicpu(hp1).loadconst(0,0);
  4442. Taicpu(hp1).opcode:=carryadd_opcode;
  4443. result:=true;
  4444. exit;
  4445. end;
  4446. end
  4447. else if Taicpu(p).condition in [C_AE,C_NB] then
  4448. begin
  4449. if (Taicpu(hp1).opcode=A_INC) or
  4450. ((Taicpu(hp1).opcode=A_ADD) and
  4451. MatchOptype(Taicpu(hp1),top_const,top_reg) and
  4452. (Taicpu(hp1).oper[0]^.val=1)
  4453. ) then
  4454. carryadd_opcode:=A_ADC;
  4455. if (Taicpu(hp1).opcode=A_DEC) or
  4456. ((Taicpu(hp1).opcode=A_SUB) and
  4457. MatchOptype(Taicpu(hp1),top_const,top_reg) and
  4458. (Taicpu(hp1).oper[0]^.val=1)
  4459. ) then
  4460. carryadd_opcode:=A_SBB;
  4461. if carryadd_opcode<>A_NONE then
  4462. begin
  4463. Taicpu(hp1).ops:=2;
  4464. DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
  4465. if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
  4466. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
  4467. else
  4468. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
  4469. Taicpu(hp1).loadconst(0,0);
  4470. Taicpu(hp1).opcode:=carryadd_opcode;
  4471. RemoveCurrentP(p);
  4472. p:=hp1;
  4473. result:=true;
  4474. exit;
  4475. end;
  4476. end;
  4477. end;
  4478. { Detect the following:
  4479. jmp<cond> @Lbl1
  4480. jmp @Lbl2
  4481. ...
  4482. @Lbl1:
  4483. ret
  4484. Change to:
  4485. jmp<inv_cond> @Lbl2
  4486. ret
  4487. }
  4488. if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
  4489. begin
  4490. hp2:=getlabelwithsym(TAsmLabel(symbol));
  4491. if Assigned(hp2) and SkipLabels(hp2,hp2) and
  4492. MatchInstruction(hp2,A_RET,[S_NO]) then
  4493. begin
  4494. taicpu(p).condition := inverse_cond(taicpu(p).condition);
  4495. { Change label address to that of the unconditional jump }
  4496. taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
  4497. TAsmLabel(symbol).DecRefs;
  4498. taicpu(hp1).opcode := A_RET;
  4499. taicpu(hp1).is_jmp := false;
  4500. taicpu(hp1).ops := taicpu(hp2).ops;
  4501. DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
  4502. case taicpu(hp2).ops of
  4503. 0:
  4504. taicpu(hp1).clearop(0);
  4505. 1:
  4506. taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
  4507. else
  4508. internalerror(2016041302);
  4509. end;
  4510. end;
  4511. end;
  4512. end;
  4513. {$ifndef i8086}
  4514. if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
  4515. begin
  4516. { check for
  4517. jCC xxx
  4518. <several movs>
  4519. xxx:
  4520. }
  4521. l:=0;
  4522. GetNextInstruction(p, hp1);
  4523. while assigned(hp1) and
  4524. CanBeCMOV(hp1) and
  4525. { stop on labels }
  4526. not(hp1.typ=ait_label) do
  4527. begin
  4528. inc(l);
  4529. GetNextInstruction(hp1,hp1);
  4530. end;
  4531. if assigned(hp1) then
  4532. begin
  4533. if FindLabel(tasmlabel(symbol),hp1) then
  4534. begin
  4535. if (l<=4) and (l>0) then
  4536. begin
  4537. condition:=inverse_cond(taicpu(p).condition);
  4538. GetNextInstruction(p,hp1);
  4539. repeat
  4540. if not Assigned(hp1) then
  4541. InternalError(2018062900);
  4542. taicpu(hp1).opcode:=A_CMOVcc;
  4543. taicpu(hp1).condition:=condition;
  4544. UpdateUsedRegs(hp1);
  4545. GetNextInstruction(hp1,hp1);
  4546. until not(CanBeCMOV(hp1));
  4547. { Remember what hp1 is in case there's multiple aligns to get rid of }
  4548. hp2 := hp1;
  4549. repeat
  4550. if not Assigned(hp2) then
  4551. InternalError(2018062910);
  4552. case hp2.typ of
  4553. ait_label:
  4554. { What we expected - break out of the loop (it won't be a dead label at the top of
  4555. a cluster because that was optimised at an earlier stage) }
  4556. Break;
  4557. ait_align:
  4558. { Go to the next entry until a label is found (may be multiple aligns before it) }
  4559. begin
  4560. hp2 := tai(hp2.Next);
  4561. Continue;
  4562. end;
  4563. else
  4564. begin
  4565. { Might be a comment or temporary allocation entry }
  4566. if not (hp2.typ in SkipInstr) then
  4567. InternalError(2018062911);
  4568. hp2 := tai(hp2.Next);
  4569. Continue;
  4570. end;
  4571. end;
  4572. until False;
  4573. { Now we can safely decrement the reference count }
  4574. tasmlabel(symbol).decrefs;
  4575. DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
  4576. { Remove the original jump }
  4577. asml.Remove(p);
  4578. p.Free;
  4579. GetNextInstruction(hp2, p); { Instruction after the label }
  4580. { Remove the label if this is its final reference }
  4581. if (tasmlabel(symbol).getrefs=0) then
  4582. StripLabelFast(hp1);
  4583. if Assigned(p) then
  4584. begin
  4585. UpdateUsedRegs(p);
  4586. result:=true;
  4587. end;
  4588. exit;
  4589. end;
  4590. end
  4591. else
  4592. begin
  4593. { check further for
  4594. jCC xxx
  4595. <several movs 1>
  4596. jmp yyy
  4597. xxx:
  4598. <several movs 2>
  4599. yyy:
  4600. }
  4601. { hp2 points to jmp yyy }
  4602. hp2:=hp1;
  4603. { skip hp1 to xxx (or an align right before it) }
  4604. GetNextInstruction(hp1, hp1);
  4605. if assigned(hp2) and
  4606. assigned(hp1) and
  4607. (l<=3) and
  4608. (hp2.typ=ait_instruction) and
  4609. (taicpu(hp2).is_jmp) and
  4610. (taicpu(hp2).condition=C_None) and
  4611. { real label and jump, no further references to the
  4612. label are allowed }
  4613. (tasmlabel(symbol).getrefs=1) and
  4614. FindLabel(tasmlabel(symbol),hp1) then
  4615. begin
  4616. l:=0;
  4617. { skip hp1 to <several moves 2> }
  4618. if (hp1.typ = ait_align) then
  4619. GetNextInstruction(hp1, hp1);
  4620. GetNextInstruction(hp1, hpmov2);
  4621. hp1 := hpmov2;
  4622. while assigned(hp1) and
  4623. CanBeCMOV(hp1) do
  4624. begin
  4625. inc(l);
  4626. GetNextInstruction(hp1, hp1);
  4627. end;
  4628. { hp1 points to yyy (or an align right before it) }
  4629. hp3 := hp1;
  4630. if assigned(hp1) and
  4631. FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
  4632. begin
  4633. condition:=inverse_cond(taicpu(p).condition);
  4634. GetNextInstruction(p,hp1);
  4635. repeat
  4636. taicpu(hp1).opcode:=A_CMOVcc;
  4637. taicpu(hp1).condition:=condition;
  4638. UpdateUsedRegs(hp1);
  4639. GetNextInstruction(hp1,hp1);
  4640. until not(assigned(hp1)) or
  4641. not(CanBeCMOV(hp1));
  4642. condition:=inverse_cond(condition);
  4643. hp1 := hpmov2;
  4644. { hp1 is now at <several movs 2> }
  4645. while Assigned(hp1) and CanBeCMOV(hp1) do
  4646. begin
  4647. taicpu(hp1).opcode:=A_CMOVcc;
  4648. taicpu(hp1).condition:=condition;
  4649. UpdateUsedRegs(hp1);
  4650. GetNextInstruction(hp1,hp1);
  4651. end;
  4652. hp1 := p;
  4653. { Get first instruction after label }
  4654. GetNextInstruction(hp3, p);
  4655. if assigned(p) and (hp3.typ = ait_align) then
  4656. GetNextInstruction(p, p);
  4657. { Don't dereference yet, as doing so will cause
  4658. GetNextInstruction to skip the label and
  4659. optional align marker. [Kit] }
  4660. GetNextInstruction(hp2, hp4);
  4661. DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
  4662. { remove jCC }
  4663. asml.remove(hp1);
  4664. hp1.free;
  4665. { Now we can safely decrement it }
  4666. tasmlabel(symbol).decrefs;
  4667. { Remove label xxx (it will have a ref of zero due to the initial check }
  4668. StripLabelFast(hp4);
  4669. { remove jmp }
  4670. symbol := taicpu(hp2).oper[0]^.ref^.symbol;
  4671. asml.remove(hp2);
  4672. hp2.free;
  4673. { As before, now we can safely decrement it }
  4674. tasmlabel(symbol).decrefs;
  4675. { Remove label yyy (and the optional alignment) if its reference falls to zero }
  4676. if tasmlabel(symbol).getrefs = 0 then
  4677. StripLabelFast(hp3);
  4678. if Assigned(p) then
  4679. begin
  4680. UpdateUsedRegs(p);
  4681. result:=true;
  4682. end;
  4683. exit;
  4684. end;
  4685. end;
  4686. end;
  4687. end;
  4688. end;
  4689. {$endif i8086}
  4690. end;
{ Pass-1 optimisations for MOVSX/MOVZX ("Movx") instructions.
  Three independent transformations are attempted:
    1. Fold  movsx/movzx src,reg2 ; arith reg2 ; mov reg2,src
       into a single arith operating directly on src (when reg2 dies).
    2. For MOVZX: drop a following AND whose mask is already implied by
       the zero extension.
    3. For MOVZX reg,reg (or ref,reg): rewrite as AND/MOV synonyms that
       are faster or smaller on targets where MOVZX is not preferable.
  p is the MOVSX/MOVZX instruction; on success p may be advanced.
  Returns True only where explicitly set below (note: some branches
  modify the instruction stream without setting Result - they rely on
  p being re-examined or on the change being final). }
function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  var
    hp1,hp2: tai;
    reg_and_hp1_is_instr: Boolean;
  begin
    result:=false;
    { True when the extension writes a register and is directly followed
      by another instruction - precondition shared by several branches }
    reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
      GetNextInstruction(p,hp1) and
      (hp1.typ = ait_instruction);
    if reg_and_hp1_is_instr and
       IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
       GetNextInstruction(hp1,hp2) and
       MatchInstruction(hp2,A_MOV,[]) and
       (taicpu(hp2).oper[0]^.typ = top_reg) and
       OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
{$ifdef i386}
       { not all registers have byte size sub registers on i386 }
       ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
       (((taicpu(hp1).ops=2) and
         (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
        ((taicpu(hp1).ops=1) and
         (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
       { the extended register must be dead after the store-back }
       not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
      begin
        { change movsX/movzX reg/ref, reg2
                 add/sub/or/... reg3/$const, reg2
                 mov reg2 reg/ref
          to     add/sub/or/... reg3/$const, reg/ref }
        { by example:
            movswl  %si,%eax        movswl  %si,%eax      p
            decl    %eax            addl    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %eax            addw    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        taicpu(hp1).changeopsize(taicpu(hp2).opsize);
        {
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %si             addw    %dx,%si       hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        case taicpu(hp1).ops of
          1:
            taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
          2:
            begin
              taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
              if (taicpu(hp1).oper[0]^.typ = top_reg) then
                { shrink the source register to the new operand size }
                setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
            end;
          else
            internalerror(2008042701);
        end;
        {
          ->
            decw    %si             addw    %dx,%si       p
        }
        DebugMsg(SPeepholeOptimization + 'var3',p);
        { both the extension and the store-back are now redundant }
        asml.remove(p);
        asml.remove(hp2);
        p.free;
        hp2.free;
        p:=hp1;
      end
    else if taicpu(p).opcode=A_MOVZX then
      begin
        { removes superfluous And's after movzx's }
        if reg_and_hp1_is_instr and
           (taicpu(hp1).opcode = A_AND) and
           MatchOpType(taicpu(hp1),top_const,top_reg) and
           (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
          begin
            { the AND is only removed when its mask exactly matches the
              bits the zero extension already guarantees to be set/clear }
            case taicpu(p).opsize Of
              S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'var4',p);
                    asml.remove(hp1);
                    hp1.free;
                  end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'var5',p);
                    asml.remove(hp1);
                    hp1.free;
                  end;
{$ifdef x86_64}
              S_LQ:
                if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                  begin
                    if (cs_asm_source in current_settings.globalswitches) then
                      asml.insertbefore(tai_comment.create(strpnew(SPeepholeOptimization + 'var6')),p);
                    asml.remove(hp1);
                    hp1.Free;
                  end;
{$endif x86_64}
              else
                ;
            end;
          end;
        { changes some movzx constructs to faster synonyms (all examples
          are given with eax/ax, but are also valid for other registers)}
        if MatchOpType(taicpu(p),top_reg,top_reg) then
          begin
            case taicpu(p).opsize of
              { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
                (the machine code is equivalent to movzbl %al,%eax), but the
                code generator still generates that assembler instruction and
                it is silently converted. This should probably be checked.
                [Kit] }
              S_BW:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                     (
                       not IsMOVZXAcceptable
                       { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
                       or (
                         (cs_opt_size in current_settings.optimizerswitches) and
                         (taicpu(p).oper[1]^.reg = NR_AX)
                       )
                     ) then
                    {Change "movzbw %al, %ax" to "andw $0x0ffh, %ax"}
                    begin
                      DebugMsg(SPeepholeOptimization + 'var7',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_W);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                     GetNextInstruction(p, hp1) and
                     (tai(hp1).typ = ait_instruction) and
                     (taicpu(hp1).opcode = A_AND) and
                     MatchOpType(taicpu(hp1),top_const,top_reg) and
                     (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbw %reg1, %reg2; andw $const, %reg2"
                      to "movw %reg1, reg2; andw $(const1 and $ff), %reg2"}
                    begin
                      DebugMsg(SPeepholeOptimization + 'var8',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_W);
                      setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax }
              S_BL:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                     (
                       not IsMOVZXAcceptable
                       { and $0xff,%eax has a smaller encoding but risks a partial write penalty }
                       or (
                         (cs_opt_size in current_settings.optimizerswitches) and
                         (taicpu(p).oper[1]^.reg = NR_EAX)
                       )
                     ) then
                    { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var9',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_L);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                     GetNextInstruction(p, hp1) and
                     (tai(hp1).typ = ait_instruction) and
                     (taicpu(hp1).opcode = A_AND) and
                     MatchOpType(taicpu(hp1),top_const,top_reg) and
                     (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbl %reg1, %reg2; andl $const, %reg2"
                      to "movl %reg1, reg2; andl $(const1 and $ff), %reg2"}
                    begin
                      DebugMsg(SPeepholeOptimization + 'var10',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_L);
                      { do not use R_SUBWHOLE
                        as movl %rdx,%eax
                        is invalid in assembler PM }
                      setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$endif i8086}
              S_WL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var11',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ffff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                       (tai(hp1).typ = ait_instruction) and
                       (taicpu(hp1).opcode = A_AND) and
                       (taicpu(hp1).oper[0]^.typ = top_const) and
                       (taicpu(hp1).oper[1]^.typ = top_reg) and
                       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzwl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2"}
                      begin
                        DebugMsg(SPeepholeOptimization + 'var12',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Result := True;
                      end;
                  end;
              else
                InternalError(2017050705);
            end;
          end
        else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
          begin
            { movzx from memory followed by AND: fold the extension's
              implicit mask into the AND constant and shrink the AND to
              the extension's destination size (the MOVZX itself is kept) }
            if GetNextInstruction(p, hp1) and
               (tai(hp1).typ = ait_instruction) and
               (taicpu(hp1).opcode = A_AND) and
               MatchOpType(taicpu(hp1),top_const,top_reg) and
               (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
              begin
                //taicpu(p).opcode := A_MOV;
                case taicpu(p).opsize Of
                  S_BL:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var13',p);
                      taicpu(hp1).changeopsize(S_L);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                    end;
                  S_WL:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var14',p);
                      taicpu(hp1).changeopsize(S_L);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                    end;
                  S_BW:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var15',p);
                      taicpu(hp1).changeopsize(S_W);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                    end;
                  else
                    Internalerror(2017050704)
                end;
                Result := True;
              end;
          end;
      end;
  end;
{ Pass-1 optimisations for AND instructions:
    - AND const1,reg ; AND const2,reg        -> AND (const1 and const2),reg
    - AND const,reg  ; MOVZX reg,reg         -> drop the MOVZX when the
      constant already fits in the extension's source size
    - AND mask,reg   ; SHL count,reg         -> drop the AND when the shift
      pushes every unmasked bit out of the register
    - AND const,reg  ; MOVSX(D) reg,reg      -> drop the MOVSX when the
      constant clears the source sign bit (sign extension is a no-op)
    - AND x,reg followed by a conditional jump, reg dead -> TEST x,reg
    - AND with a full-width all-ones mask    -> AND reg,reg
  p is the AND instruction; Result is True when p was replaced/removed. }
function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
  var
    hp1 : tai;
    MaskLength : Cardinal;
  begin
    Result:=false;
    if GetNextInstruction(p, hp1) then
      begin
        if MatchOpType(taicpu(p),top_const,top_reg) and
           MatchInstruction(hp1,A_AND,[]) and
           MatchOpType(taicpu(hp1),top_const,top_reg) and
           (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
           { the second register must contain the first one, so compare their subreg types }
           (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
           (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
          { change
              and const1, reg
              and const2, reg
            to
              and (const1 and const2), reg
          }
          begin
            taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
            DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
            asml.remove(p);
            p.Free;
            p:=hp1;
            Result:=true;
            exit;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
           MatchInstruction(hp1,A_MOVZX,[]) and
           (taicpu(hp1).oper[0]^.typ = top_reg) and
           MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
           (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
           { AND size and MOVZX destination size must agree }
           (((taicpu(p).opsize=S_W) and
             (taicpu(hp1).opsize=S_BW)) or
            ((taicpu(p).opsize=S_L) and
             (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
            or
            ((taicpu(p).opsize=S_Q) and
             (taicpu(hp1).opsize in [S_BQ,S_WQ]))
{$endif x86_64}
           ) then
          begin
            { the MOVZX is redundant when the AND constant fits entirely
              within the bits the extension would keep anyway }
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
            then
              begin
                { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                  32-bit register to a 64-bit register, or even a version called MOVZXD, so
                  code that tests for the presence of AND 0xffffffff followed by MOVZX is
                  wasted, and is indictive of a compiler bug if it were triggered. [Kit]
                  NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                }
                DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                asml.remove(hp1);
                hp1.free;
                Exit;
              end;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
           MatchInstruction(hp1,A_SHL,[]) and
           MatchOpType(taicpu(hp1),top_const,top_reg) and
           (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
          begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
            { get length of potential and mask }
            { range checks must be off here: BsrQWord on 0 yields 255 and the
              subtraction may wrap - the mask test below rejects such values }
            MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
            { really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
            if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
               { unmasked part shifted out? }
               ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
              begin
                DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
                RemoveCurrentP(p);
                p:=hp1;
                Result:=true;
                exit;
              end;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
           MatchInstruction(hp1,A_MOVSX{$ifdef x86_64},A_MOVSXD{$endif x86_64},[]) and
           (taicpu(hp1).oper[0]^.typ = top_reg) and
           MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
           (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
           (((taicpu(p).opsize=S_W) and
             (taicpu(hp1).opsize=S_BW)) or
            ((taicpu(p).opsize=S_L) and
             (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
            or
            ((taicpu(p).opsize=S_Q) and
             (taicpu(hp1).opsize in [S_BQ,S_WQ,S_LQ]))
{$endif x86_64}
           ) then
          begin
            { the MOVSX is redundant when the AND constant leaves the source
              sign bit clear ($7f/$7fff/$7fffffff masks), making the sign
              extension equivalent to a zero extension of an already-zeroed
              upper part }
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7f)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7fff)=taicpu(p).oper[0]^.val))
{$ifdef x86_64}
               or
               (((taicpu(hp1).opsize)=S_LQ) and
                ((taicpu(p).oper[0]^.val and $7fffffff)=taicpu(p).oper[0]^.val)
               )
{$endif x86_64}
            then
              begin
                DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                asml.remove(hp1);
                hp1.free;
                Exit;
              end;
          end
        else if (taicpu(p).oper[1]^.typ = top_reg) and
           (hp1.typ = ait_instruction) and
           (taicpu(hp1).is_jmp) and
           (taicpu(hp1).opcode<>A_JMP) and
           not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
          begin
            { change
                and x, reg
                jxx
              to
                test x, reg
                jxx
              if reg is deallocated before the
              jump, but only if it's a conditional jump (PFV)
            }
            taicpu(p).opcode := A_TEST;
            Exit;
          end;
      end;
    { Lone AND tests }
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        {
          - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
          - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
          - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
        }
        if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
           ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
           ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
          begin
            taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
            if taicpu(p).opsize = S_L then
              { flag a later pass to look for the MovAnd2Mov_3 pattern }
              Include(OptsToCheck,aoc_MovAnd2Mov_3);
          end;
      end;
  end;
  5118. function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  5119. begin
  5120. Result:=false;
  5121. if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
  5122. MatchReference(taicpu(p).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
  5123. (taicpu(p).oper[0]^.ref^.index<>NR_NO) then
  5124. begin
  5125. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.base);
  5126. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.index);
  5127. taicpu(p).opcode:=A_ADD;
  5128. DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
  5129. result:=true;
  5130. end
  5131. else if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
  5132. MatchReference(taicpu(p).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) and
  5133. (taicpu(p).oper[0]^.ref^.base<>NR_NO) then
  5134. begin
  5135. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
  5136. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.base);
  5137. taicpu(p).opcode:=A_ADD;
  5138. DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
  5139. result:=true;
  5140. end;
  5141. end;
  5142. function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
  5143. var
  5144. hp1: tai; NewRef: TReference;
  5145. begin
  5146. { Change:
  5147. subl/q $x,%reg1
  5148. movl/q %reg1,%reg2
  5149. To:
  5150. leal/q $-x(%reg1),%reg2
  5151. subl/q $x,%reg1
  5152. Breaks the dependency chain and potentially permits the removal of
  5153. a CMP instruction if one follows.
  5154. }
  5155. Result := False;
  5156. if not (cs_opt_size in current_settings.optimizerswitches) and
  5157. (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
  5158. MatchOpType(taicpu(p),top_const,top_reg) and
  5159. GetNextInstruction(p, hp1) and
  5160. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  5161. (taicpu(hp1).oper[1]^.typ = top_reg) and
  5162. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
  5163. begin
  5164. { Change the MOV instruction to a LEA instruction, and update the
  5165. first operand }
  5166. reference_reset(NewRef, 1, []);
  5167. NewRef.base := taicpu(p).oper[1]^.reg;
  5168. NewRef.scalefactor := 1;
  5169. NewRef.offset := -taicpu(p).oper[0]^.val;
  5170. taicpu(hp1).opcode := A_LEA;
  5171. taicpu(hp1).loadref(0, NewRef);
  5172. { Move what is now the LEA instruction to before the SUB instruction }
  5173. Asml.Remove(hp1);
  5174. Asml.InsertBefore(hp1, p);
  5175. AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
  5176. DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
  5177. Result := True;
  5178. end;
  5179. end;
{ Post-peephole optimisation for LEA: collapses a stack-frame
  reserve/call/release/ret sequence into a tail jump (see the inline
  comment below for the exact pattern). p is the first LEA; Result is
  True when the rewrite was performed. }
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;

  { Advances hp1 past instructions that do not touch the stack pointer;
    returns True if a (non-nil) instruction remains. The commented-out
    conditions document stricter filters that were considered. }
  function SkipSimpleInstructions(var hp1 : tai) : Boolean;
    begin
      { we can skip all instructions not messing with the stack pointer }
      while assigned(hp1) and {MatchInstruction(taicpu(hp1),[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
        A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
        A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
        A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
        ({(taicpu(hp1).ops=0) or }
         ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
           (MatchOpType(taicpu(hp1),top_ref,top_reg))
          ) and }
          not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
         )
        ) do
        GetNextInstruction(hp1,hp1);
      Result:=assigned(hp1);
    end;

  var
    hp1, hp2, hp3: tai;
  begin
    Result:=false;
    { replace
        leal(q) x(<stackpointer>),<stackpointer>
        call   procname
        leal(q) -x(<stackpointer>),<stackpointer>
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
       MatchOpType(taicpu(p),top_ref,top_reg) and
       (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
       (taicpu(p).oper[0]^.ref^.index=NR_NO) and
       { the -8 or -24 are not required, but bail out early if possible,
         higher values are unlikely }
       ((taicpu(p).oper[0]^.ref^.offset=-8) or
        (taicpu(p).oper[0]^.ref^.offset=-24)) and
       (taicpu(p).oper[0]^.ref^.symbol=nil) and
       (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
       (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
       (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
       GetNextInstruction(p, hp1) and
       { trick to skip label }
       ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
       SkipSimpleInstructions(hp1) and
       MatchInstruction(hp1,A_CALL,[S_NO]) and
       GetNextInstruction(hp1, hp2) and
       MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
       MatchOpType(taicpu(hp2),top_ref,top_reg) and
       { the second LEA must exactly undo the first one's adjustment }
       (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
       (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
       (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
       (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
       (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
       (taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
       (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
       GetNextInstruction(hp2, hp3) and
       { trick to skip label }
       ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
       MatchInstruction(hp3,A_RET,[S_NO]) and
       (taicpu(hp3).ops=0) then
      begin
        { turn the CALL into a tail JMP and drop the frame teardown }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
        RemoveCurrentP(p);
        AsmL.Remove(hp2);
        hp2.free;
        AsmL.Remove(hp3);
        hp3.free;
        Result:=true;
      end;
  end;
{ Post-peephole optimisation for "mov $const,%reg":
    - const = 0:               mov -> xor reg,reg (when flags are dead)
    - const in $1..$FFFFFFFF:  movq -> movl (shorter encoding; the upper
                               32 bits are implicitly zeroed on x86-64)
    - const = -1:              mov -> or $-1,%reg (smaller, only with -Os
                               and when flags are dead)
  Returns True when the instruction was rewritten. }
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
  var
    Value, RegName: string;
  begin
    Result:=false;
    if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
      begin
        case taicpu(p).oper[0]^.val of
          0:
            { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
            if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $0,%reg" into "xor %reg,%reg" }
                taicpu(p).opcode := A_XOR;
                taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
          $1..$FFFFFFFF:
            begin
              { Code size reduction by J. Gareth "Kit" Moreton }
              { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
              case taicpu(p).opsize of
                S_Q:
                  begin
                    { capture the names before the rewrite for the debug message }
                    RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                    Value := debug_tostr(taicpu(p).oper[0]^.val);
                    { The actual optimization }
                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                    taicpu(p).changeopsize(S_L);
                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                    Result := True;
                  end;
                else
                  { Do nothing };
              end;
            end;
          -1:
            { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
            if (cs_opt_size in current_settings.optimizerswitches) and
               (taicpu(p).opsize <> S_B) and
               not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $-1,%reg" into "or $-1,%reg" }
                { NOTES:
                  - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                  - This operation creates a false dependency on the register, so only do it when optimising for size
                  - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                }
                taicpu(p).opcode := A_OR;
                Result := True;
              end;
        end;
      end;
  end;
  5309. function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  5310. begin
  5311. Result := False;
  5312. if not MatchOpType(taicpu(p), top_reg, top_reg) then
  5313. Exit;
  5314. { Convert:
  5315. movswl %ax,%eax -> cwtl
  5316. movslq %eax,%rax -> cdqe
  5317. NOTE: Don't convert movswl %al,%ax to cbw, because cbw and cwde
  5318. refer to the same opcode and depends only on the assembler's
  5319. current operand-size attribute. [Kit]
  5320. }
  5321. with taicpu(p) do
  5322. case opsize of
  5323. S_WL:
  5324. if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
  5325. begin
  5326. DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
  5327. opcode := A_CWDE;
  5328. clearop(0);
  5329. clearop(1);
  5330. ops := 0;
  5331. Result := True;
  5332. end;
  5333. {$ifdef x86_64}
  5334. S_LQ:
  5335. if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
  5336. begin
  5337. DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
  5338. opcode := A_CDQE;
  5339. clearop(0);
  5340. clearop(1);
  5341. ops := 0;
  5342. Result := True;
  5343. end;
  5344. {$endif x86_64}
  5345. else
  5346. ;
  5347. end;
  5348. end;
  5349. function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  5350. begin
  5351. Result:=false;
  5352. { change "cmp $0, %reg" to "test %reg, %reg" }
  5353. if MatchOpType(taicpu(p),top_const,top_reg) and
  5354. (taicpu(p).oper[0]^.val = 0) then
  5355. begin
  5356. taicpu(p).opcode := A_TEST;
  5357. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  5358. Result:=true;
  5359. end;
  5360. end;
    { Removes a redundant TEST/OR instruction when the preceding instruction
      already set the flags the following conditional instruction reads, or
      simplifies "test $-1,%reg" into "test %reg,%reg".
      Returns True when p was removed (p then points at its old successor). }
    function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
      var
        IsTestConstX : Boolean;  { True when p is "test $-1, <op>" }
        hp1,hp2 : tai;           { hp1: instruction before p; hp2: instruction after p }
      begin
        Result:=false;
        { removes the line marked with (x) from the sequence
          and/or/xor/add/sub/... $x, %y
          test/or %y, %y  |  test $-1, %y    (x)
          j(n)z _Label
          as the first instruction already adjusts the ZF
          %y operand may also be a reference }
        IsTestConstX:=(taicpu(p).opcode=A_TEST) and
          MatchOperand(taicpu(p).oper[0]^,-1);
        { p must be "test/or %y,%y" (or "test $-1,%y"), preceded by a real
          instruction and followed by a flag consumer (SETcc/Jcc/CMOVcc) }
        if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
           GetLastInstruction(p, hp1) and
           (tai(hp1).typ = ait_instruction) and
           GetNextInstruction(p,hp2) and
           MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
          case taicpu(hp1).opcode Of
            A_ADD, A_SUB, A_OR, A_XOR, A_AND:
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
                  { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
                  { and in case of carry for A(E)/B(E)/C/NC }
                  ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
                   ((taicpu(hp1).opcode <> A_ADD) and
                    (taicpu(hp1).opcode <> A_SUB))) then
                  begin
                    { delete p; hp1 is reused here to hold p's successor }
                    hp1 := tai(p.next);
                    asml.remove(p);
                    p.free;
                    p := tai(hp1);
                    Result:=true;
                  end;
              end;
            A_SHL, A_SAL, A_SHR, A_SAR:
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
                  { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
                  { therefore, it's only safe to do this optimization for     }
                  { shifts by a (nonzero) constant                            }
                  (taicpu(hp1).oper[0]^.typ = top_const) and
                  (taicpu(hp1).oper[0]^.val <> 0) and
                  { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
                  { and in case of carry for A(E)/B(E)/C/NC }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
                  begin
                    { delete p; hp1 is reused here to hold p's successor }
                    hp1 := tai(p.next);
                    asml.remove(p);
                    p.free;
                    p := tai(hp1);
                    Result:=true;
                  end;
              end;
            A_DEC, A_INC, A_NEG:
              begin
                { these are single-operand instructions, so compare oper[0] }
                if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
                  { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
                  { and in case of carry for A(E)/B(E)/C/NC }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
                  begin
                    case taicpu(hp1).opcode of
                      A_DEC, A_INC:
                        { replace inc/dec with add/sub 1, because inc/dec doesn't set the carry flag }
                        begin
                          case taicpu(hp1).opcode Of
                            A_DEC: taicpu(hp1).opcode := A_SUB;
                            A_INC: taicpu(hp1).opcode := A_ADD;
                            else
                              ;
                          end;
                          { move the register to operand 1 and load the constant 1 into operand 0 }
                          taicpu(hp1).loadoper(1,taicpu(hp1).oper[0]^);
                          taicpu(hp1).loadConst(0,1);
                          taicpu(hp1).ops:=2;
                        end;
                      else
                        ;
                    end;
                    { delete p; hp1 is reused here to hold p's successor }
                    hp1 := tai(p.next);
                    asml.remove(p);
                    p.free;
                    p := tai(hp1);
                    Result:=true;
                  end;
              end
            else
              { change "test $-1,%reg" into "test %reg,%reg" }
              if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
                taicpu(p).loadoper(0,taicpu(p).oper[1]^);
          end { case }
        { change "test $-1,%reg" into "test %reg,%reg" }
        else if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
          taicpu(p).loadoper(0,taicpu(p).oper[1]^);
      end;
    { Post-peephole optimisation of CALL instructions:
      - i386 only, pre-Pentium2, non-PIC: "call x; jmp y" -> "push y; jmp x"
      - "call x; ret" -> "jmp x" (level 4 only), or just drop the RET when the
        current procedure is marked noreturn.
      Returns True when anything was changed. }
    function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
      var
        hp1 : tai;      { instruction following the call }
{$ifndef x86_64}
        hp2 : taicpu;   { newly created PUSH for the call/jmp rewrite }
{$endif x86_64}
      begin
        Result:=false;
{$ifndef x86_64}
        { don't do this on modern CPUs, this really hurts them due to
          broken call/ret pairing }
        if (current_settings.optimizecputype < cpu_Pentium2) and
           not(cs_create_pic in current_settings.moduleswitches) and
           GetNextInstruction(p, hp1) and
           MatchInstruction(hp1,A_JMP,[S_NO]) and
           MatchOpType(taicpu(hp1),top_ref) and
           (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            { push the jmp target as the fake return address, then jump to the
              callee: "call x; jmp y" becomes "push y; jmp x" }
            hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
            InsertLLItem(p.previous, p, hp2);
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            asml.remove(hp1);
            hp1.free;
            Result:=true;
          end
        else
{$endif x86_64}
        { replace
            call procname
            ret
          by
            jmp procname
          but do it only on level 4 because it destroys stack back traces
          else if the subroutine is marked as no return, remove the ret
        }
        if ((cs_opt_level4 in current_settings.optimizerswitches) or
            (po_noreturn in current_procinfo.procdef.procoptions)) and
           GetNextInstruction(p, hp1) and
           MatchInstruction(hp1,A_RET,[S_NO]) and
           (taicpu(hp1).ops=0) then
          begin
            if (cs_opt_level4 in current_settings.optimizerswitches) and
              { we might destroy stack alignment here if we do not do a call }
              (target_info.stackalign<=sizeof(SizeUInt)) then
              begin
                taicpu(p).opcode := A_JMP;
                taicpu(p).is_jmp := true;
                DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
              end
            else
              { only the RET is removed; the CALL stays (noreturn case) }
              DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
            asml.remove(hp1);
            hp1.free;
            Result:=true;
          end;
      end;
  5513. {$ifdef x86_64}
  5514. function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
  5515. var
  5516. PreMessage: string;
  5517. begin
  5518. Result := False;
  5519. { Code size reduction by J. Gareth "Kit" Moreton }
  5520. { Convert MOVZBQ and MOVZWQ to MOVZBL and MOVZWL respectively if it removes the REX prefix }
  5521. if (taicpu(p).opsize in [S_BQ, S_WQ]) and
  5522. (getsupreg(taicpu(p).oper[1]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP])
  5523. then
  5524. begin
  5525. { Has 64-bit register name and opcode suffix }
  5526. PreMessage := 'movz' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' -> movz';
  5527. { The actual optimization }
  5528. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5529. if taicpu(p).opsize = S_BQ then
  5530. taicpu(p).changeopsize(S_BL)
  5531. else
  5532. taicpu(p).changeopsize(S_WL);
  5533. DebugMsg(SPeepholeOptimization + PreMessage +
  5534. debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (removes REX prefix)', p);
  5535. end;
  5536. end;
  5537. function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  5538. var
  5539. PreMessage, RegName: string;
  5540. begin
  5541. { Code size reduction by J. Gareth "Kit" Moreton }
  5542. { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
  5543. as this removes the REX prefix }
  5544. Result := False;
  5545. if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
  5546. Exit;
  5547. if taicpu(p).oper[0]^.typ <> top_reg then
  5548. { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
  5549. InternalError(2018011500);
  5550. case taicpu(p).opsize of
  5551. S_Q:
  5552. begin
  5553. if (getsupreg(taicpu(p).oper[0]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP]) then
  5554. begin
  5555. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
  5556. PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
  5557. { The actual optimization }
  5558. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  5559. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5560. taicpu(p).changeopsize(S_L);
  5561. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
  5562. DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (removes REX prefix)', p);
  5563. end;
  5564. end;
  5565. else
  5566. ;
  5567. end;
  5568. end;
  5569. {$endif}
  5570. class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  5571. var
  5572. OperIdx: Integer;
  5573. begin
  5574. for OperIdx := 0 to p.ops - 1 do
  5575. if p.oper[OperIdx]^.typ = top_ref then
  5576. optimize_ref(p.oper[OperIdx]^.ref^, False);
  5577. end;
  5578. end.