{
    Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe

    This unit contains the peephole optimizer.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit aoptx86;

{$i fpcdefs.inc}

{$define DEBUG_AOPTCPU}

interface

  uses
    globtype,
    cpubase,
    aasmtai,aasmcpu,
    cgbase,cgutils,
    aopt,aoptobj;

  type
    TOptsToCheck = (
      aoc_MovAnd2Mov_3
    );

    TX86AsmOptimizer = class(TAsmOptimizer)
      { Some optimizations are very expensive to check, so the pre-opt pass
        can be used to set flags, based on the instructions found, that
        indicate whether it is worth checking a certain optimization. }
      OptsToCheck : set of TOptsToCheck;

      function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
      function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
      function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
      function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
      function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
      {
        In contrast to GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
        the use of a register via its allocation/deallocation markers, so it can ignore calls.

        In the following example, GetNextInstructionUsingReg will return the second movq;
        GetNextInstructionUsingRegTrackingUse won't, because it stops at the rdi
        release/allocate markers in between:

          movq %rdi,%rax
          # Register rdi released
          # Register rdi allocated
          movq %rax,%rdi

        In this example, however:

          movq %rdi,%rax
          call proc
          movq %rdi,%rax

        GetNextInstructionUsingRegTrackingUse will return the second movq, while
        GetNextInstructionUsingReg won't, because it stops at the call.
      }
      function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
      function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
    private
      function SkipSimpleInstructions(var hp1: tai): Boolean;
    protected
      class function IsMOVZXAcceptable: Boolean; static; inline;

      { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
      function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
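      { For example, a write to EAX overwrites AX, AL and AH entirely, while a
        write to AX leaves the upper half of EAX intact, so EAX is not
        overwritten entirely (an illustrative note; see the implementation
        further down in this unit). }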
      { checks whether reading the value in reg1 depends on the value of reg2. This
        is very similar to SuperRegistersEqual, except it takes into account that
        R_SUBH and R_SUBL are independent (e.g. reading from AL does not
        depend on the value in AH). }
      function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;

      { Replaces all references to AOldReg in a memory reference to ANewReg }
      class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;

      { Replaces all references to AOldReg in an operand to ANewReg }
      class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;

      { Replaces all references to AOldReg in an instruction to ANewReg,
        except where the register is being written }
      function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;

      { Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
        or writes to a global symbol }
      class function IsRefSafe(const ref: PReference): Boolean; static; inline;

      { Returns true if the given MOV instruction can be safely converted to CMOV }
      class function CanBeCMOV(p : tai) : boolean; static;

      { Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
        conversion was successful }
      function ConvertLEA(const p : taicpu): Boolean;

      function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;

      procedure DebugMsg(const s : string; p : tai);inline;

      class function IsExitCode(p : tai) : boolean; static;
      class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
      procedure RemoveLastDeallocForFuncRes(p : tai);

      function DoSubAddOpt(var p : tai) : Boolean;

      function PrePeepholeOptSxx(var p : tai) : boolean;
      function PrePeepholeOptIMUL(var p : tai) : boolean;

      function OptPass1AND(var p : tai) : boolean;
      function OptPass1_V_MOVAP(var p : tai) : boolean;
      function OptPass1VOP(var p : tai) : boolean;
      function OptPass1MOV(var p : tai) : boolean;
      function OptPass1Movx(var p : tai) : boolean;
      function OptPass1MOVXX(var p : tai) : boolean;
      function OptPass1OP(var p : tai) : boolean;
      function OptPass1LEA(var p : tai) : boolean;
      function OptPass1Sub(var p : tai) : boolean;
      function OptPass1SHLSAL(var p : tai) : boolean;
      function OptPass1SETcc(var p : tai) : boolean;
      function OptPass1FSTP(var p : tai) : boolean;
      function OptPass1FLD(var p : tai) : boolean;
      function OptPass1Cmp(var p : tai) : boolean;
      function OptPass1PXor(var p : tai) : boolean;
      function OptPass1VPXor(var p: tai): boolean;

      function OptPass2MOV(var p : tai) : boolean;
      function OptPass2Imul(var p : tai) : boolean;
      function OptPass2Jmp(var p : tai) : boolean;
      function OptPass2Jcc(var p : tai) : boolean;
      function OptPass2Lea(var p: tai): Boolean;
      function OptPass2SUB(var p: tai): Boolean;

      function PostPeepholeOptMov(var p : tai) : Boolean;
{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
      function PostPeepholeOptMovzx(var p : tai) : Boolean;
      function PostPeepholeOptXor(var p : tai) : Boolean;
{$endif}
      function PostPeepholeOptMOVSX(var p : tai) : boolean;
      function PostPeepholeOptCmp(var p : tai) : Boolean;
      function PostPeepholeOptTestOr(var p : tai) : Boolean;
      function PostPeepholeOptCall(var p : tai) : Boolean;
      function PostPeepholeOptLea(var p : tai) : Boolean;
      function PostPeepholeOptPush(var p: tai): Boolean;

      procedure ConvertJumpToRET(const p: tai; const ret_p: tai);

      { Processor-dependent reference optimisation }
      class procedure OptimizeRefs(var p: taicpu); static;
    end;

  function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;

  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{$if max_operands>2}
  function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
{$endif max_operands>2}

  function RefsEqual(const r1, r2: treference): boolean;

  function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  { Returns true if ref is a reference that uses only the registers passed as
    base and index, and that may also have an offset }
  function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
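  { Illustrative usage (hypothetical operands): MatchReference accepts a plain
    "(%eax,%edx)" style reference, e.g.

      MatchReference(taicpu(p).oper[0]^.ref^, NR_EAX, NR_EDX)

    while MatchReferenceWithOffset also accepts "8(%eax,%edx)". Passing
    NR_INVALID as base or index acts as a wildcard for that position. }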
implementation

  uses
    cutils,verbose,
    systems,
    globals,
    cpuinfo,
    procinfo,
    paramgr,
    aasmbase,
    aoptbase,aoptutils,
    symconst,symsym,
    cgx86,
    itcpugas;

{$ifdef DEBUG_AOPTCPU}
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
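  { The prefix above is typically prepended to a pass-specific message when
    logging, e.g. (hypothetical message text):

      DebugMsg(SPeepholeOptimization + 'MovMov2Mov', p);

    On release builds DebugMsg is an empty inline procedure, so the call and
    its string argument can be optimized away. }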
  function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
    begin
      result :=
        (instr.typ = ait_instruction) and
        (taicpu(instr).opcode = op) and
        ((opsize = []) or (taicpu(instr).opsize in opsize));
    end;


  function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
    begin
      result :=
        (instr.typ = ait_instruction) and
        ((taicpu(instr).opcode = op1) or
         (taicpu(instr).opcode = op2)
        ) and
        ((opsize = []) or (taicpu(instr).opsize in opsize));
    end;


  function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
    begin
      result :=
        (instr.typ = ait_instruction) and
        ((taicpu(instr).opcode = op1) or
         (taicpu(instr).opcode = op2) or
         (taicpu(instr).opcode = op3)
        ) and
        ((opsize = []) or (taicpu(instr).opsize in opsize));
    end;


  function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
    const opsize : topsizes) : boolean;
    var
      op : TAsmOp;
    begin
      result:=false;
      for op in ops do
        begin
          if (instr.typ = ait_instruction) and
             (taicpu(instr).opcode = op) and
             ((opsize = []) or (taicpu(instr).opsize in opsize)) then
            begin
              result:=true;
              exit;
            end;
        end;
    end;


  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
    begin
      result := (oper.typ = top_reg) and (oper.reg = reg);
    end;


  function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
    begin
      result := (oper.typ = top_const) and (oper.val = a);
    end;


  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
    begin
      result := oper1.typ = oper2.typ;

      if result then
        case oper1.typ of
          top_const:
            Result:=oper1.val = oper2.val;
          top_reg:
            Result:=oper1.reg = oper2.reg;
          top_ref:
            Result:=RefsEqual(oper1.ref^, oper2.ref^);
          else
            internalerror(2013102801);
        end
    end;


  function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
    begin
      result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);

      if result then
        case oper1.typ of
          top_const:
            Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
          top_reg:
            Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
          top_ref:
            Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
          else
            internalerror(2020052401);
        end
    end;
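  { Note for the routine below: two references only compare equal if neither
    is volatile, so volatile references are never merged by the optimizer. }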
  function RefsEqual(const r1, r2: treference): boolean;
    begin
      RefsEqual :=
        (r1.offset = r2.offset) and
        (r1.segment = r2.segment) and (r1.base = r2.base) and
        (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
        (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
        (r1.relsymbol = r2.relsymbol) and
        (r1.volatility=[]) and
        (r2.volatility=[]);
    end;


  function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
    begin
      Result:=(ref.offset=0) and
        (ref.scalefactor in [0,1]) and
        (ref.segment=NR_NO) and
        (ref.symbol=nil) and
        (ref.relsymbol=nil) and
        ((base=NR_INVALID) or
         (ref.base=base)) and
        ((index=NR_INVALID) or
         (ref.index=index)) and
        (ref.volatility=[]);
    end;


  function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
    begin
      Result:=(ref.scalefactor in [0,1]) and
        (ref.segment=NR_NO) and
        (ref.symbol=nil) and
        (ref.relsymbol=nil) and
        ((base=NR_INVALID) or
         (ref.base=base)) and
        ((index=NR_INVALID) or
         (ref.index=index)) and
        (ref.volatility=[]);
    end;
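  { Helper: returns True if p reads any CPU flag. Labels also return True,
    since code that reads the flags may jump to them; only instructions whose
    change set provably excludes flag reads return False. }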
  function InstrReadsFlags(p: tai): boolean;
    begin
      InstrReadsFlags := true;
      case p.typ of
        ait_instruction:
          if InsProp[taicpu(p).opcode].Ch*
             [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
              Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
              Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
            exit;
        ait_label:
          exit;
        else
          ;
      end;
      InstrReadsFlags := false;
    end;
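  { The next routine scans forward from Current until it finds an instruction
    that mentions reg, a call/jump, or a non-instruction item; it only looks
    beyond the immediately following instruction at -O3 (cs_opt_level3).
    A sketch of a typical call pattern (hypothetical operands):

      if GetNextInstructionUsingReg(p, hp1, NR_EAX) and
         MatchInstruction(hp1, A_MOV, []) then
        ...
  }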
  function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
    begin
      Next:=Current;
      repeat
        Result:=GetNextInstruction(Next,Next);
      until not (Result) or
            not(cs_opt_level3 in current_settings.optimizerswitches) or
            (Next.typ<>ait_instruction) or
            RegInInstruction(reg,Next) or
            is_calljmp(taicpu(Next).opcode);
    end;


  function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    begin
      if not(cs_opt_level3 in current_settings.optimizerswitches) then
        begin
          Result:=GetNextInstruction(Current,Next);
          exit;
        end;

      Next:=tai(Current.Next);
      Result:=false;
      while assigned(Next) do
        begin
          if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
             ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
             ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
            exit
          else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
            begin
              Result:=true;
              exit;
            end;
          Next:=tai(Next.Next);
        end;
    end;


  function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
    begin
      Result:=RegReadByInstruction(reg,hp);
    end;


  function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
    var
      p: taicpu;
      opcount: longint;
    begin
      RegReadByInstruction := false;
      if hp.typ <> ait_instruction then
        exit;
      p := taicpu(hp);
      case p.opcode of
        A_CALL:
          regreadbyinstruction := true;
        A_IMUL:
          case p.ops of
            1:
              regReadByInstruction := RegInOp(reg,p.oper[0]^) or
                 (
                  ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                  ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
                 );
            2,3:
              regReadByInstruction :=
                reginop(reg,p.oper[0]^) or
                reginop(reg,p.oper[1]^);
            else
              InternalError(2019112801);
          end;
        A_MUL:
          begin
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
               (
                ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
               );
          end;
        A_IDIV,A_DIV:
          begin
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
               (
                 (getregtype(reg)=R_INTREGISTER) and
                 (
                   (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
                 )
               );
          end;
        else
          begin
            if (p.opcode=A_LEA) and is_segment_reg(reg) then
              begin
                RegReadByInstruction := false;
                exit;
              end;
            for opcount := 0 to p.ops-1 do
              if (p.oper[opCount]^.typ = top_ref) and
                 RegInRef(reg,p.oper[opcount]^.ref^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
            { special handling for SSE MOVSD }
            if (p.opcode=A_MOVSD) and (p.ops>0) then
              begin
                if p.ops<>2 then
                  internalerror(2017042702);
                regReadByInstruction := reginop(reg,p.oper[0]^) or
                  (
                   (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
                  );
                exit;
              end;
            with insprop[p.opcode] do
              begin
                if getregtype(reg)=R_INTREGISTER then
                  begin
                    case getsupreg(reg) of
                      RS_EAX:
                        if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ECX:
                        if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDX:
                        if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBX:
                        if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESP:
                        if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBP:
                        if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESI:
                        if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDI:
                        if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                    end;
                  end;
                if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
                  begin
                    if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                      begin
                        case p.condition of
                          C_A,C_NBE,       { CF=0 and ZF=0  }
                          C_BE,C_NA:       { CF=1 or ZF=1   }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                          C_AE,C_NB,C_NC,  { CF=0           }
                          C_B,C_NAE,C_C:   { CF=1           }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                          C_NE,C_NZ,       { ZF=0           }
                          C_E,C_Z:         { ZF=1           }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                          C_G,C_NLE,       { ZF=0 and SF=OF }
                          C_LE,C_NG:       { ZF=1 or SF<>OF }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_GE,C_NL,       { SF=OF          }
                          C_L,C_NGE:       { SF<>OF         }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_NO,            { OF=0           }
                          C_O:             { OF=1           }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                          C_NP,C_PO,       { PF=0           }
                          C_P,C_PE:        { PF=1           }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                          C_NS,            { SF=0           }
                          C_S:             { SF=1           }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                          else
                            internalerror(2017042701);
                        end;
                        if RegReadByInstruction then
                          exit;
                      end;
                    case getsubreg(reg) of
                      R_SUBW,R_SUBD,R_SUBQ:
                        RegReadByInstruction :=
                          [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                           Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                           Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                      R_SUBFLAGCARRY:
                        RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGPARITY:
                        RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGAUXILIARY:
                        RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGZERO:
                        RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGSIGN:
                        RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGOVERFLOW:
                        RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGINTERRUPT:
                        RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGDIRECTION:
                        RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      else
                        internalerror(2017042601);
                    end;
                    exit;
                  end;
                if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                   (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                   (p.oper[0]^.reg=p.oper[1]^.reg) then
                  exit;
                if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
              end;
          end;
      end;
    end;
  function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
    begin
      result:=false;
      if p1.typ<>ait_instruction then
        exit;

      if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
        exit(true);

      if (getregtype(reg)=R_INTREGISTER) and
         { the change information for xmm movsd is not correct }
         ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
        begin
          case getsupreg(reg) of
            { RS_EAX = RS_RAX on x86-64 }
            RS_EAX:
              result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ECX:
              result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EDX:
              result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EBX:
              result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ESP:
              result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EBP:
              result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_ESI:
              result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
            RS_EDI:
              result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
            exit(true);

          case getsubreg(reg) of
            R_SUBFLAGCARRY:
              Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGPARITY:
              Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGAUXILIARY:
              Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGZERO:
              Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGSIGN:
              Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGOVERFLOW:
              Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGINTERRUPT:
              Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGDIRECTION:
              Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
        exit(true);

      Result:=inherited RegInInstruction(Reg, p1);
    end;
  function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
    begin
      Result := False;
      if p1.typ <> ait_instruction then
        exit;

      with insprop[taicpu(p1).opcode] do
        if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
          begin
            case getsubreg(reg) of
              R_SUBW,R_SUBD,R_SUBQ:
                Result :=
                  [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                   Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                   Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGCARRY:
                Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              else
                internalerror(2017042602);
            end;
            exit;
          end;
      case taicpu(p1).opcode of
        A_CALL:
          { We could potentially set Result to False if the register in
            question is non-volatile for the subroutine's calling convention,
            but this would require detecting the calling convention in use and
            also assuming that the routine doesn't contain malformed assembly
            language, for example... so it could only be done under -O4 as it
            would be considered a side-effect. [Kit] }
          Result := True;
        A_MOVSD:
          { special handling for SSE MOVSD }
          if (taicpu(p1).ops>0) then
            begin
              if taicpu(p1).ops<>2 then
                internalerror(2017042703);
              Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
            end;
        { VMOVSS and VMOVSD have two- and three-operand flavours; this cannot
          be modelled by x86ins.dat, so fix it here (FK) }
        A_VMOVSS,
        A_VMOVSD:
          begin
            Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
            exit;
          end;
        A_IMUL:
          Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
        else
          ;
      end;
      if Result then
        exit;

      with insprop[taicpu(p1).opcode] do
        begin
          if getregtype(reg)=R_INTREGISTER then
            begin
              case getsupreg(reg) of
                RS_EAX:
                  if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ECX:
                  if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDX:
                  if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBX:
                  if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESP:
                  if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBP:
                  if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESI:
                  if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDI:
                  if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
              end;
            end;
          if ([CH_RWOP1,CH_WOP1,CH_MOP1]*Ch<>[]) and reginop(reg,taicpu(p1).oper[0]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and reginop(reg,taicpu(p1).oper[1]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and reginop(reg,taicpu(p1).oper[2]^) then
            begin
              Result := true;
              exit
            end;
          if ([Ch_RWOP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and reginop(reg,taicpu(p1).oper[3]^) then
            begin
              Result := true;
              exit
            end;
        end;
    end;
{$ifdef DEBUG_AOPTCPU}
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
    begin
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
    end;


  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := tostr(i);
    end;


  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '%' + std_regname(r);
    end;


  { Debug output function - creates a string representation of an operator }
  function debug_operstr(oper: TOper): string;
    begin
      case oper.typ of
        top_const:
          Result := '$' + debug_tostr(oper.val);
        top_reg:
          Result := debug_regname(oper.reg);
        top_ref:
          begin
            if oper.ref^.offset <> 0 then
              Result := debug_tostr(oper.ref^.offset) + '('
            else
              Result := '(';
            if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
              begin
                Result := Result + debug_regname(oper.ref^.base);
                if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                  Result := Result + ',' + debug_regname(oper.ref^.index);
              end
            else
              if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                Result := Result + debug_regname(oper.ref^.index);
            if (oper.ref^.scalefactor > 1) then
              Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
            else
              Result := Result + ')';
          end;
        else
          Result := '[UNKNOWN]';
      end;
    end;


  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := std_op2str[opcode];
    end;


  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := gas_opsize2str[opsize];
    end;

{$else DEBUG_AOPTCPU}
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
    begin
    end;


  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := '';
    end;


  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '';
    end;


  function debug_operstr(oper: TOper): string; inline;
    begin
      Result := '';
    end;


  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := '';
    end;


  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := '';
    end;
{$endif DEBUG_AOPTCPU}
class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
  begin
{$ifdef x86_64}
    { Always fine on x86-64 }
    Result := True;
{$else x86_64}
    Result :=
{$ifdef i8086}
      (current_settings.cputype >= cpu_386) and
{$endif i8086}
      (
        { Always accept if optimising for size }
        (cs_opt_size in current_settings.optimizerswitches) or
        { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
        (current_settings.optimizecputype >= cpu_Pentium2)
      );
{$endif x86_64}
  end;
function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  begin
    if not SuperRegistersEqual(reg1,reg2) then
      exit(false);
    if getregtype(reg1)<>R_INTREGISTER then
      exit(true); { because SuperRegistersEqual is true }
    case getsubreg(reg1) of
      { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
        higher, it preserves the high bits, so the new value depends on
        reg2's previous value. In other words, it is equivalent to doing:
          reg2 := (reg2 and $ffffff00) or byte(reg1); }
      R_SUBL:
        exit(getsubreg(reg2)=R_SUBL);
      { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
        higher, it actually does a:
          reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
      R_SUBH:
        exit(getsubreg(reg2)=R_SUBH);
      { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
        bits of reg2:
          reg2 := (reg2 and $ffff0000) or word(reg1); }
      R_SUBW:
        exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
      { a write to R_SUBD always overwrites every other subregister,
        because it clears the high 32 bits of R_SUBQ on x86_64 }
      R_SUBD,
      R_SUBQ:
        exit(true);
      else
        internalerror(2017042801);
    end;
  end;
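
{ Returns true if a read of reg1 also reads (part of) the value last written
  to reg2, e.g. reading %al depends on a previous write to %eax, while %ah
  and %al are independent of each other. }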
function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  begin
    if not SuperRegistersEqual(reg1,reg2) then
      exit(false);
    if getregtype(reg1)<>R_INTREGISTER then
      exit(true); { because SuperRegistersEqual is true }
    case getsubreg(reg1) of
      R_SUBL:
        exit(getsubreg(reg2)<>R_SUBH);
      R_SUBH:
        exit(getsubreg(reg2)<>R_SUBL);
      R_SUBW,
      R_SUBD,
      R_SUBQ:
        exit(true);
      else
        internalerror(2017042802);
    end;
  end;
function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  var
    hp1 : tai;
    l : TCGInt;
  begin
    result:=false;
    { changes the code sequence
        shr/sar const1, x
        shl     const2, x
      to either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
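    { For instance, with 32-bit operands:
        shrl $3,%eax; shll $3,%eax  ->  andl $0xfffffff8,%eax                (const1 = const2)
        shrl $5,%eax; shll $2,%eax  ->  shrl $3,%eax; andl $0xfffffffc,%eax  (const1 > const2)
        shrl $2,%eax; shll $5,%eax  ->  andl $0xfffffffc,%eax; shll $3,%eax  (const1 < const2) }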
    if GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_SHL,[]) and
      (taicpu(p).oper[0]^.typ = top_const) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      (taicpu(hp1).opsize = taicpu(p).opsize) and
      (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
      OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
      begin
        if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
          not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 > const2 }
            taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
            taicpu(hp1).opcode := A_AND;
            l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
            case taicpu(p).opsize Of
              S_B: taicpu(hp1).loadConst(0,l Xor $ff);
              S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
              S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050703)
            end;
          end
        else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
          not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 < const2 }
            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050702)
            end;
          end
        else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 = const2 }
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050701)
            end;
            asml.remove(hp1);
            hp1.free;
          end;
      end;
  end;
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  var
    opsize : topsize;
    hp1 : tai;
    tmpref : treference;
    ShiftValue : Cardinal;
    BaseValue : TCGInt;
  begin
    result:=false;
    opsize:=taicpu(p).opsize;
    { changes certain "imul const, %reg"'s to lea sequences }
    if (MatchOpType(taicpu(p),top_const,top_reg) or
      MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
      (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
      if (taicpu(p).oper[0]^.val = 1) then
        if (taicpu(p).ops = 2) then
          { remove "imul $1, reg" }
          begin
            DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
            Result := RemoveCurrentP(p);
          end
        else
          { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
          begin
            hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
            p.free;
            p := hp1;
          end
      else if ((taicpu(p).ops <= 2) or
        (taicpu(p).oper[2]^.typ = Top_Reg)) and
        not(cs_opt_size in current_settings.optimizerswitches) and
        (not(GetNextInstruction(p, hp1)) or
          not((tai(hp1).typ = ait_instruction) and
            ((taicpu(hp1).opcode=A_Jcc) and
              (taicpu(hp1).condition in [C_O,C_NO])))) then
        begin
          {
            imul X, reg1, reg2 to
              lea (reg1,reg1,Y), reg2
              shl ZZ,reg2

            imul XX, reg1 to
              lea (reg1,reg1,YY), reg1
              shl ZZ,reg1

            This optimization makes sense for pretty much every x86, except the VIA
            Nano3000: there IMUL has a latency of 2, but so does the lea/shl pair;
            it does not exist as a separate optimization target in FPC though.

            This optimization can be applied as long as only two bits are set in the
            constant and those two bits are separated by at most two zeros.
          }
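          { For instance, "imul $10,%eax" (10 = %1010 = 5*2) becomes
              leal (%eax,%eax,4),%eax
              shll $1,%eax
            where the factor 5 maps to scale factor 4 and the factor 2 to the shift count. }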
          reference_reset(tmpref,1,[]);
          if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
            begin
              ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
              BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
              TmpRef.base := taicpu(p).oper[1]^.reg;
              TmpRef.index := taicpu(p).oper[1]^.reg;
              if not(BaseValue in [3,5,9]) then
                Internalerror(2018110101);
              TmpRef.ScaleFactor := BaseValue-1;
              if (taicpu(p).ops = 2) then
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
              else
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
              AsmL.InsertAfter(hp1,p);
              DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
              taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
              RemoveCurrentP(p, hp1);
              if ShiftValue>0 then
                AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
            end;
        end;
  end;
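
{ Returns true if hp writes a value to reg that does not depend in any way on
  reg's previous contents, i.e. reg is loaded with a completely new value. }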
function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  var
    p: taicpu;
  begin
    if not assigned(hp) or
      (hp.typ <> ait_instruction) then
      begin
        Result := false;
        exit;
      end;
    p := taicpu(hp);
    if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
      with insprop[p.opcode] do
        begin
          case getsubreg(reg) of
            R_SUBW,R_SUBD,R_SUBQ:
              Result:=
                RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
                RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
                RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
                RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
                RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
                RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
            R_SUBFLAGCARRY:
              Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGPARITY:
              Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGAUXILIARY:
              Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGZERO:
              Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGSIGN:
              Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGOVERFLOW:
              Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGINTERRUPT:
              Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGDIRECTION:
              Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
            else
              begin
                writeln(getsubreg(reg));
                internalerror(2017050501);
              end;
          end;
          exit;
        end;
    Result :=
      (((p.opcode = A_MOV) or
        (p.opcode = A_MOVZX) or
        (p.opcode = A_MOVSX) or
        (p.opcode = A_LEA) or
        (p.opcode = A_VMOVSS) or
        (p.opcode = A_VMOVSD) or
        (p.opcode = A_VMOVAPD) or
        (p.opcode = A_VMOVAPS) or
        (p.opcode = A_VMOVQ) or
        (p.opcode = A_MOVSS) or
        (p.opcode = A_MOVSD) or
        (p.opcode = A_MOVQ) or
        (p.opcode = A_MOVAPD) or
        (p.opcode = A_MOVAPS) or
{$ifndef x86_64}
        (p.opcode = A_LDS) or
        (p.opcode = A_LES) or
{$endif not x86_64}
        (p.opcode = A_LFS) or
        (p.opcode = A_LGS) or
        (p.opcode = A_LSS)) and
       (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
       (p.oper[1]^.typ = top_reg) and
       (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
       ((p.oper[0]^.typ = top_const) or
        ((p.oper[0]^.typ = top_reg) and
         not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
        ((p.oper[0]^.typ = top_ref) and
         not RegInRef(reg,p.oper[0]^.ref^)))) or
      ((p.opcode = A_POP) and
       (Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg))) or
      ((p.opcode = A_IMUL) and
       (p.ops=3) and
       (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
       (((p.oper[1]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg))) or
        ((p.oper[1]^.typ=top_ref) and not(RegInRef(reg,p.oper[1]^.ref^))))) or
      ((((p.opcode = A_IMUL) or
         (p.opcode = A_MUL)) and
        (p.ops=1)) and
       (((p.oper[0]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
        ((p.oper[0]^.typ=top_ref) and not(RegInRef(reg,p.oper[0]^.ref^)))) and
       (((p.opsize=S_B) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
        ((p.opsize=S_W) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
        ((p.opsize=S_L) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg))
{$ifdef x86_64}
        or ((p.opsize=S_Q) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg))
{$endif x86_64}
       )) or
      ((p.opcode = A_CWD) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
      ((p.opcode = A_CDQ) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)) or
{$ifdef x86_64}
      ((p.opcode = A_CQO) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)) or
{$endif x86_64}
      ((p.opcode = A_CBW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
{$ifndef x86_64}
      ((p.opcode = A_LDS) and (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
      ((p.opcode = A_LES) and (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$endif not x86_64}
      ((p.opcode = A_LFS) and (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
      ((p.opcode = A_LGS) and (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
      ((p.opcode = A_LSS) and (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$ifndef x86_64}
      ((p.opcode = A_AAM) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
{$endif not x86_64}
      ((p.opcode = A_LAHF) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
      ((p.opcode = A_LODSB) and Reg1WriteOverwritesReg2Entirely(NR_AL,reg)) or
      ((p.opcode = A_LODSW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg)) or
      ((p.opcode = A_LODSD) and Reg1WriteOverwritesReg2Entirely(NR_EAX,reg)) or
{$ifdef x86_64}
      ((p.opcode = A_LODSQ) and Reg1WriteOverwritesReg2Entirely(NR_RAX,reg)) or
{$endif x86_64}
      ((p.opcode = A_SETcc) and (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
      (((p.opcode = A_FSTSW) or
        (p.opcode = A_FNSTSW)) and
       (p.oper[0]^.typ=top_reg) and
       Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
      (((p.opcode = A_XOR) or (p.opcode = A_SUB) or (p.opcode = A_SBB)) and
       (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
       (p.oper[0]^.reg=p.oper[1]^.reg) and
       Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg));
  end;
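
{ Returns true if p starts a typical function epilogue: a plain RET, LEAVE
  followed by RET, a stack pointer restore through LEA followed by RET, or a
  frame pointer restore (MOV/LEA plus POP of the frame pointer) followed by RET. }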
class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
  var
    hp2,hp3 : tai;
  begin
    { some x86-64 targets issue a NOP before the real exit code }
    if MatchInstruction(p,A_NOP,[]) then
      GetNextInstruction(p,p);
    result:=assigned(p) and (p.typ=ait_instruction) and
      ((taicpu(p).opcode = A_RET) or
       ((taicpu(p).opcode=A_LEAVE) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_RET,[S_NO])
       ) or
       (((taicpu(p).opcode=A_LEA) and
         MatchOpType(taicpu(p),top_ref,top_reg) and
         (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
         (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
        ) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_RET,[S_NO])
       ) or
       ((((taicpu(p).opcode=A_MOV) and
          MatchOpType(taicpu(p),top_reg,top_reg) and
          (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
          (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
         ((taicpu(p).opcode=A_LEA) and
          MatchOpType(taicpu(p),top_ref,top_reg) and
          (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
          (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
         )
        ) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
        MatchOpType(taicpu(hp2),top_reg) and
        (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
        GetNextInstruction(hp2,hp3) and
        MatchInstruction(hp3,A_RET,[S_NO])
       )
      );
  end;
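
{ Returns true if hp1 is an ADD/SUB/OR/XOR/AND/SHL/SHR/SAR whose destination
  is reg (and whose source is a constant or a different register), or an
  INC/DEC/NEG/NOT operating directly on reg. }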
class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  begin
    isFoldableArithOp := False;
    case hp1.opcode of
      A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
        isFoldableArithOp :=
          ((taicpu(hp1).oper[0]^.typ = top_const) or
           ((taicpu(hp1).oper[0]^.typ = top_reg) and
            (taicpu(hp1).oper[0]^.reg <> reg))) and
          (taicpu(hp1).oper[1]^.typ = top_reg) and
          (taicpu(hp1).oper[1]^.reg = reg);
      A_INC,A_DEC,A_NEG,A_NOT:
        isFoldableArithOp :=
          (taicpu(hp1).oper[0]^.typ = top_reg) and
          (taicpu(hp1).oper[0]^.reg = reg);
      else
        ;
    end;
  end;
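
{ Removes the most recent deallocation of the function result register(s)
  preceding p: EAX always and, for 8-byte ordinal results, EDX as well. }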
procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);

  procedure DoRemoveLastDeallocForFuncRes(supreg: tsuperregister);
    var
      hp2: tai;
    begin
      hp2 := p;
      repeat
        hp2 := tai(hp2.previous);
        if assigned(hp2) and
          (hp2.typ = ait_regalloc) and
          (tai_regalloc(hp2).ratype=ra_dealloc) and
          (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
          (getsupreg(tai_regalloc(hp2).reg) = supreg) then
          begin
            asml.remove(hp2);
            hp2.free;
            break;
          end;
      until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
    end;

  begin
    case current_procinfo.procdef.returndef.typ of
      arraydef,recorddef,pointerdef,
      stringdef,enumdef,procdef,objectdef,errordef,
      filedef,setdef,procvardef,
      classrefdef,forwarddef:
        DoRemoveLastDeallocForFuncRes(RS_EAX);
      orddef:
        if current_procinfo.procdef.returndef.size <> 0 then
          begin
            DoRemoveLastDeallocForFuncRes(RS_EAX);
            { for int64/qword }
            if current_procinfo.procdef.returndef.size = 8 then
              DoRemoveLastDeallocForFuncRes(RS_EDX);
          end;
      else
        ;
    end;
  end;
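
{ Peephole optimizations for (v)mova*: removes reg,reg self-moves, merges
  chained register-to-register moves, and folds the mov/op/mov patterns
  described in the inline comments below. }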
function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
  var
    hp1,hp2 : tai;
  begin
    result:=false;
    if MatchOpType(taicpu(p),top_reg,top_reg) then
      begin
        { vmova* reg1,reg1
          =>
          <nop> }
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
          begin
            RemoveCurrentP(p);
            result:=true;
            exit;
          end
        else if GetNextInstruction(p,hp1) then
          begin
            if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
              MatchOpType(taicpu(hp1),top_reg,top_reg) and
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmova* reg2,reg3
                  dealloc reg2
                  =>
                  vmova* reg1,reg3 }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    asml.Remove(hp1);
                    hp1.Free;
                    result:=true;
                    exit;
                  end
                { special case:
                  vmova* reg1,reg2
                  vmova* reg2,reg1
                  =>
                  vmova* reg1,reg2 }
                else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                    asml.Remove(hp1);
                    hp1.Free;
                    result:=true;
                    exit;
                  end
              end
            else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
              MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
              ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
              MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
              ) and
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmovs* reg2,<op>
                  dealloc reg2
                  =>
                  vmovs* reg1,<op> }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                    taicpu(p).opcode:=taicpu(hp1).opcode;
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    asml.Remove(hp1);
                    hp1.Free;
                    result:=true;
                    exit;
                  end
              end;
          end;
        if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
          begin
            if MatchInstruction(hp1,[A_VFMADDPD,
                A_VFMADD132PD,
                A_VFMADD132PS,
                A_VFMADD132SD,
                A_VFMADD132SS,
                A_VFMADD213PD,
                A_VFMADD213PS,
                A_VFMADD213SD,
                A_VFMADD213SS,
                A_VFMADD231PD,
                A_VFMADD231PS,
                A_VFMADD231SD,
                A_VFMADD231SS,
                A_VFMADDSUB132PD,
                A_VFMADDSUB132PS,
                A_VFMADDSUB213PD,
                A_VFMADDSUB213PS,
                A_VFMADDSUB231PD,
                A_VFMADDSUB231PS,
                A_VFMSUB132PD,
                A_VFMSUB132PS,
                A_VFMSUB132SD,
                A_VFMSUB132SS,
                A_VFMSUB213PD,
                A_VFMSUB213PS,
                A_VFMSUB213SD,
                A_VFMSUB213SS,
                A_VFMSUB231PD,
                A_VFMSUB231PS,
                A_VFMSUB231SD,
                A_VFMSUB231SS,
                A_VFMSUBADD132PD,
                A_VFMSUBADD132PS,
                A_VFMSUBADD213PD,
                A_VFMSUBADD213PS,
                A_VFMSUBADD231PD,
                A_VFMSUBADD231PS,
                A_VFNMADD132PD,
                A_VFNMADD132PS,
                A_VFNMADD132SD,
                A_VFNMADD132SS,
                A_VFNMADD213PD,
                A_VFNMADD213PS,
                A_VFNMADD213SD,
                A_VFNMADD213SS,
                A_VFNMADD231PD,
                A_VFNMADD231PS,
                A_VFNMADD231SD,
                A_VFNMADD231SS,
                A_VFNMSUB132PD,
                A_VFNMSUB132PS,
                A_VFNMSUB132SD,
                A_VFNMSUB132SS,
                A_VFNMSUB213PD,
                A_VFNMSUB213PS,
                A_VFNMSUB213SD,
                A_VFNMSUB213SS,
                A_VFNMSUB231PD,
                A_VFNMSUB231PS,
                A_VFNMSUB231SD,
                A_VFNMSUB231SS],[S_NO]) and
              { we mix single and double operations here because we assume that the compiler
                generates vmovapd only after double operations and vmovaps only after single operations }
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
              GetNextInstruction(hp1,hp2) and
              MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                    RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
                    asml.Remove(hp2);
                    hp2.Free;
                  end;
              end
            else if (hp1.typ = ait_instruction) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2,taicpu(p).opcode,[]) and
              OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
              MatchOpType(taicpu(hp2),top_reg,top_reg) and
              MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
              (((taicpu(p).opcode=A_MOVAPS) and
                ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                 (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
               ((taicpu(p).opcode=A_MOVAPD) and
                ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                 (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
              ) then
              { change
                  movapX          reg,reg2
                  addsX/subsX/... reg3,reg2
                  movapX          reg2,reg
                to
                  addsX/subsX/... reg3,reg }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                      debug_op2str(taicpu(p).opcode)+' '+
                      debug_op2str(taicpu(hp1).opcode)+' '+
                      debug_op2str(taicpu(hp2).opcode)+') done',p);
                    { we cannot eliminate the first move if
                      the operation uses the same register for source and dest }
                    if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                      RemoveCurrentP(p, nil);
                    p:=hp1;
                    taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                    asml.remove(hp2);
                    hp2.Free;
                    result:=true;
                  end;
              end;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    result:=false;
    { replace
        V<Op>X  %mreg1,%mreg2,%mreg3
        VMovX   %mreg3,%mreg4
        dealloc %mreg3
      by
        V<Op>X  %mreg1,%mreg2,%mreg4 }
    if GetNextInstruction(p,hp1) and
      { we mix single and double operations here because we assume that the compiler
        generates vmovapd only after double operations and vmovaps only after single operations }
      MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
      MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
          begin
            taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
            asml.Remove(hp1);
            hp1.Free;
            result:=true;
          end;
      end;
  end;
{ Replaces all references to AOldReg in a memory reference with ANewReg }
class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  var
    OldSupReg: TSuperRegister;
    OldSubReg, MemSubReg: TSubRegister;
  begin
    Result := False;
    { For safety reasons, only check for exact register matches }

    { Check base register }
    if (ref.base = AOldReg) then
      begin
        ref.base := ANewReg;
        Result := True;
      end;

    { Check index register }
    if (ref.index = AOldReg) then
      begin
        ref.index := ANewReg;
        Result := True;
      end;
  end;

{ Replaces all references to AOldReg in an operand with ANewReg }
class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  var
    OldSupReg, NewSupReg: TSuperRegister;
    OldSubReg, NewSubReg, MemSubReg: TSubRegister;
    OldRegType: TRegisterType;
    ThisOper: POper;
  begin
    ThisOper := p.oper[OperIdx]; { Faster to access overall }
    Result := False;

    if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
      InternalError(2020011801);

    OldSupReg := getsupreg(AOldReg);
    OldSubReg := getsubreg(AOldReg);
    OldRegType := getregtype(AOldReg);
    NewSupReg := getsupreg(ANewReg);
    NewSubReg := getsubreg(ANewReg);

    if OldRegType <> getregtype(ANewReg) then
      InternalError(2020011802);

    if OldSubReg <> NewSubReg then
      InternalError(2020011803);

    case ThisOper^.typ of
      top_reg:
        if (
          (ThisOper^.reg = AOldReg) or
          (
            (OldRegType = R_INTREGISTER) and
            (getsupreg(ThisOper^.reg) = OldSupReg) and
            (getregtype(ThisOper^.reg) = R_INTREGISTER) and
            (
              (getsubreg(ThisOper^.reg) <= OldSubReg)
{$ifndef x86_64}
              and (
                { Under i386 and i8086, ESI, EDI, EBP and ESP
                  don't have an 8-bit representation }
                (getsubreg(ThisOper^.reg) >= R_SUBW) or
                not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
              )
{$endif x86_64}
            )
          )
        ) then
          begin
            ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
            Result := True;
          end;
      top_ref:
        if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
          Result := True;
      else
        ;
    end;
  end;
{ Replaces all references to AOldReg in an instruction with ANewReg }
function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  const
    ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
  var
    OperIdx: Integer;
  begin
    Result := False;
    for OperIdx := 0 to p.ops - 1 do
      if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) and
        { The shift and rotate instructions can only use CL }
        not (
          (OperIdx = 0) and
          { This second condition just helps to avoid unnecessarily
            calling MatchInstruction for 10 different opcodes }
          (p.oper[0]^.reg = NR_CL) and
          MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
        ) then
        Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  end;
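
{ Returns true for references that are considered safe to dereference:
  index-free references based on the stack pointer, the current frame
  pointer, or (on x86-64) RIP-relative addresses. }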
class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean; inline;
  begin
    Result :=
      (ref^.index = NR_NO) and
      (
{$ifdef x86_64}
        (
          (ref^.base = NR_RIP) and
          (ref^.refaddr in [addr_pic, addr_pic_no_got])
        ) or
{$endif x86_64}
        (ref^.base = NR_STACK_POINTER_REG) or
        (ref^.base = current_procinfo.framepointer)
      );
  end;
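
{ Tries to turn "lea offset(%reg),%reg" into a plain ALU instruction, e.g.
    leal 1(%eax),%eax   ->  incl %eax   (if inc/dec is preferred)
    leal -8(%ecx),%ecx  ->  subl $8,%ecx
    leal 16(%edx),%edx  ->  addl $16,%edx
  The stack pointer is left alone unless optimizing for size. }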
function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
  var
    l: asizeint;
  begin
    Result := False;

    { Should have been checked previously }
    if p.opcode <> A_LEA then
      InternalError(2020072501);

    { do not mess with the stack pointer, as adjusting it by lea is recommended,
      except when optimizing for size }
    if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
      not(cs_opt_size in current_settings.optimizerswitches) then
      exit;

    with p.oper[0]^.ref^ do
      begin
        if (base <> p.oper[1]^.reg) or
          (index <> NR_NO) or
          assigned(symbol) then
          exit;
        l:=offset;
        if (l=1) and UseIncDec then
          begin
            p.opcode:=A_INC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
          end
        else if (l=-1) and UseIncDec then
          begin
            p.opcode:=A_DEC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
          end
        else
          begin
            if (l<0) and (l<>-2147483648) then
              begin
                p.opcode:=A_SUB;
                p.loadConst(0,-l);
                DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
              end
            else
              begin
                p.opcode:=A_ADD;
                p.loadConst(0,l);
                DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
              end;
          end;
      end;
    Result := True;
  end;
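
{ Given "mov %reg1,%reg2" in p_mov, tries to replace reads of %reg2 in the
  later instruction hp with %reg1 in order to shorten dependency chains, e.g.
    movl %eax,%edx             movl %eax,%edx
    addl %edx,%ecx      ->     addl %eax,%ecx }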
function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  var
    CurrentReg, ReplaceReg: TRegister;
    SubReg: TSubRegister;
  begin
    Result := False;
    ReplaceReg := taicpu(p_mov).oper[0]^.reg;
    CurrentReg := taicpu(p_mov).oper[1]^.reg;
    case hp.opcode of
      A_FSTSW, A_FNSTSW,
      A_IN,   A_INS,  A_OUT,  A_OUTS,
      A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
        { These routines have explicit operands, but they are restricted in
          what they can be (e.g. IN and OUT can only read from AL, AX or
          EAX). }
        Exit;
      A_IMUL:
        begin
          { The 1-operand version writes to implicit registers.
            The 2-operand version reads from the first operand, and reads
            from and writes to the second (equivalent to Ch_ROp1, Ch_RWOp2).
            The 3-operand version reads from a register that it doesn't write to. }
          case hp.ops of
            1:
              if (
                (
                  (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                ) or
                not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
              ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            2:
              { Only modify the first parameter }
              if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            3:
              { Only modify the second parameter }
              if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            else
              InternalError(2020012901);
          end;
        end;
      else
        if (hp.ops > 0) and
          ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
          begin
            Result := True;
            DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
            AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
          end;
    end;
  end;
function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  var
    hp1, hp2, hp3: tai;

  procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
    begin
      if taicpu(hp1).opcode = signed_movop then
        begin
          if taicpu(p).oper[0]^.val > max_value shr 1 then
            taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
        end
      else
        taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
    end;

  var
    GetNextInstruction_p, TempRegUsed: Boolean;
    PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
    NewSize: topsize;
    CurrentReg: TRegister;
  begin
    Result:=false;
    GetNextInstruction_p:=GetNextInstruction(p, hp1);

    { remove mov reg1,reg1? }
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
        { take care of the register (de)allocs following p }
        RemoveCurrentP(p, hp1);
        Result:=true;
        exit;
      end;

    { All the next optimisations require a next instruction }
    if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
      Exit;

    { Look for:
        mov %reg1,%reg2
        ??? %reg2,r/m
      Change to:
        mov %reg1,%reg2
        ??? %reg1,r/m }
    if MatchOpType(taicpu(p), top_reg, top_reg) then
      begin
        CurrentReg := taicpu(p).oper[1]^.reg;
        if RegReadByInstruction(CurrentReg, hp1) and
          DeepMOVOpt(taicpu(p), taicpu(hp1)) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
            if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
              { Just in case something didn't get modified (e.g. an
                implicit register) }
              not RegReadByInstruction(CurrentReg, hp1) then
              begin
                { We can remove the original MOV }
                DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
                Asml.Remove(p);
                p.Free;
                p := hp1;
                { TmpUsedRegs contains the results of "UpdateUsedRegs(tai(p.Next))" already,
                  so just restore it to UsedRegs instead of calculating it again }
                RestoreUsedRegs(TmpUsedRegs);
                Result := True;
                Exit;
              end;

            { If we know a MOV instruction has become a null operation, we might as well
              get rid of it now to save time. }
            if (taicpu(hp1).opcode = A_MOV) and
              (taicpu(hp1).oper[1]^.typ = top_reg) and
              SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
              { Just being a register is enough to confirm it's a null operation }
              (taicpu(hp1).oper[0]^.typ = top_reg) then
              begin
                Result := True;
                { Speed-up to reduce a pipeline stall... if we had something like...
                    movl %eax,%edx
                    movw %dx,%ax
                  ... the second instruction would change to movw %ax,%ax, but
                  given that it is now %ax that's active rather than %eax,
                  penalties might occur due to a partial register write, so instead,
                  change it to a MOVZX instruction when optimising for speed. }
                if not (cs_opt_size in current_settings.optimizerswitches) and
                  IsMOVZXAcceptable and
                  (taicpu(hp1).opsize < taicpu(p).opsize)
{$ifdef x86_64}
                  { 32-bit operations already implicitly zero the upper 32 bits }
                  and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
{$endif x86_64}
                  then
                  begin
                    CurrentReg := taicpu(hp1).oper[1]^.reg;
                    DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
                    case taicpu(p).opsize of
                      S_W:
                        if taicpu(hp1).opsize = S_B then
                          taicpu(hp1).opsize := S_BL
                        else
                          InternalError(2020012911);
                      S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
                        case taicpu(hp1).opsize of
                          S_B:
                            taicpu(hp1).opsize := S_BL;
                          S_W:
                            taicpu(hp1).opsize := S_WL;
                          else
                            InternalError(2020012912);
                        end;
                      else
                        InternalError(2020012910);
                    end;
                    taicpu(hp1).opcode := A_MOVZX;
                    taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
                  end
                else
                  begin
                    GetNextInstruction_p := GetNextInstruction(hp1, hp2);
                    DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
                    asml.remove(hp1);
                    hp1.free;
                    { The instruction after what was hp1 is now the immediate next instruction,
                      so we can continue to make optimisations if it's present }
                    if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
                      Exit;
                    hp1 := hp2;
                  end;
              end;
          end;
      end;
    { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
      overwrites the original destination register, e.g.
        movl   ###,%reg2d
        movslq ###,%reg2q (### doesn't have to be the same as the first one)
      In this case, we can remove the MOV (Go to "Mov2Nop 5" below) }
    if (taicpu(p).oper[1]^.typ = top_reg) and
      MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
      begin
        if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
          begin
            if (taicpu(hp1).oper[0]^.typ = top_reg) then
              case taicpu(p).oper[0]^.typ of
                top_const:
                  { We have something like:
                      movb   $x,%regb
                      movzbl %regb,%regd
                    Change to:
                      movl   $x,%regd }
                  begin
                    case taicpu(hp1).opsize of
                      S_BW:
                        begin
                          convert_mov_value(A_MOVSX, $FF);
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
                          taicpu(p).opsize := S_W;
                        end;
                      S_BL:
                        begin
                          convert_mov_value(A_MOVSX, $FF);
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                          taicpu(p).opsize := S_L;
                        end;
                      S_WL:
                        begin
                          convert_mov_value(A_MOVSX, $FFFF);
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                          taicpu(p).opsize := S_L;
                        end;
{$ifdef x86_64}
                      S_BQ:
                        begin
                          convert_mov_value(A_MOVSX, $FF);
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                          taicpu(p).opsize := S_Q;
                        end;
                      S_WQ:
                        begin
                          convert_mov_value(A_MOVSX, $FFFF);
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                          taicpu(p).opsize := S_Q;
                        end;
                      S_LQ:
                        begin
                          convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
                          setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                          taicpu(p).opsize := S_Q;
                        end;
{$endif x86_64}
                      else
                        { If hp1 was a MOV instruction, it should have been
                          optimised already }
                        InternalError(2020021001);
                    end;
                    DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
                    asml.Remove(hp1);
                    hp1.Free;
                    Result := True;
                    Exit;
                  end;
                top_ref:
                  { We have something like:
                      movb   mem,%regb
                      movzbl %regb,%regd
                    Change to:
                      movzbl mem,%regd }
                  if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
                      taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
                      RemoveCurrentP(p, hp1);
                      Result:=True;
                      Exit;
                    end;
                else
                  if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
                    { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
                    Exit;
              end;
          end
        { The RegInOp check makes sure that "movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
          and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
          optimised }
        else
          begin
            DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
            RemoveCurrentP(p, hp1);
            Result := True;
            Exit;
          end;
      end;

    if (taicpu(hp1).opcode = A_AND) and
      (taicpu(p).oper[1]^.typ = top_reg) and
      MatchOpType(taicpu(hp1),top_const,top_reg) then
      begin
        if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
          begin
            case taicpu(p).opsize of
              S_L:
                if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                  begin
                    { Optimize out:
                        mov x, %reg
                        and ffffffffh, %reg }
                    DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
                    asml.remove(hp1);
                    hp1.free;
                    Result:=true;
                    exit;
                  end;
              S_Q: { TODO: Confirm if this is even possible }
                if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
                  begin
                    { Optimize out:
                        mov x, %reg
                        and ffffffffffffffffh, %reg }
                    DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
                    asml.remove(hp1);
                    hp1.free;
                    Result:=true;
                    exit;
                  end;
              else
                ;
            end;
            if ((taicpu(p).oper[0]^.typ=top_reg) or
              ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
              GetNextInstruction(hp1,hp2) and
              MatchInstruction(hp2,A_TEST,[taicpu(p).opsize]) and
              MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) and
              MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) and
              GetNextInstruction(hp2,hp3) and
              MatchInstruction(hp3,A_Jcc,A_Setcc,[S_NO]) and
              (taicpu(hp3).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
                    taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
                    taicpu(hp1).opcode:=A_TEST;
                    asml.Remove(hp2);
                    hp2.free;
                    RemoveCurrentP(p, hp1);
                    Result:=true;
                    exit;
                  end;
              end;
          end
        else if IsMOVZXAcceptable and
          (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
          (taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
          (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
          begin
            InputVal := debug_operstr(taicpu(p).oper[0]^);
            MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
            case taicpu(p).opsize of
              S_B:
                if (taicpu(hp1).oper[0]^.val = $ff) then
                  begin
                    { Convert:
                        movb x, %regl        movb x, %regl
                        andw ffh, %regw      andl ffh, %regd
                      To:
                        movzbw x, %regd      movzbl x, %regd
                      (Identical registers, just different sizes) }
                    RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
                    RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
                    case taicpu(hp1).opsize of
                      S_W: NewSize := S_BW;
                      S_L: NewSize := S_BL;
{$ifdef x86_64}
                      S_Q: NewSize := S_BQ;
{$endif x86_64}
                      else
                        InternalError(2018011510);
                    end;
                  end
                else
                  NewSize := S_NO;
              S_W:
                if (taicpu(hp1).oper[0]^.val = $ffff) then
                  begin
                    { Convert:
                        movw x, %regw
                        andl ffffh, %regd
                      To:
                        movzwl x, %regd
                      (Identical registers, just different sizes) }
                    RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
                    RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
                    case taicpu(hp1).opsize of
                      S_L: NewSize := S_WL;
{$ifdef x86_64}
                      S_Q: NewSize := S_WQ;
{$endif x86_64}
                      else
                        InternalError(2018011511);
                    end;
                  end
                else
                  NewSize := S_NO;
              else
                NewSize := S_NO;
            end;
            if NewSize <> S_NO then
              begin
                PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
                { The actual optimization }
                taicpu(p).opcode := A_MOVZX;
                taicpu(p).changeopsize(NewSize);
                taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
                { Safeguard if "and" is followed by a conditional command }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                  begin
                    { At this point, the "and" command is effectively equivalent to
                      "test %reg,%reg". This will be handled separately by the
                      Peephole Optimizer. [Kit] }
                    DebugMsg(SPeepholeOptimization + PreMessage +
                      ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                  end
                else
                  begin
                    DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
                      ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                    asml.Remove(hp1);
                    hp1.Free;
                  end;
                Result := True;
                Exit;
              end;
          end;
      end;
  2133. { Next instruction is also a MOV ? }
  2134. if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
  2135. begin
  2136. if (taicpu(p).oper[1]^.typ = top_reg) and
  2137. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2138. begin
  2139. CurrentReg := taicpu(p).oper[1]^.reg;
  2140. TransferUsedRegs(TmpUsedRegs);
  2141. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2142. { we have
  2143. mov x, %treg
  2144. mov %treg, y
  2145. }
  2146. if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
  2147. if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
  2148. { we've got
  2149. mov x, %treg
  2150. mov %treg, y
  2151. with %treg is not used after }
  2152. case taicpu(p).oper[0]^.typ Of
  2153. { top_reg is covered by DeepMOVOpt }
  2154. top_const:
  2155. begin
  2156. { change
  2157. mov const, %treg
  2158. mov %treg, y
  2159. to
  2160. mov const, y
  2161. }
  2162. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  2163. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2164. begin
  2165. if taicpu(hp1).oper[1]^.typ=top_reg then
  2166. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2167. taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
  2168. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
  2169. asml.remove(hp1);
  2170. hp1.free;
  2171. Result:=true;
  2172. Exit;
  2173. end;
  2174. end;
  2175. top_ref:
  2176. if (taicpu(hp1).oper[1]^.typ = top_reg) then
  2177. begin
  2178. { change
  2179. mov mem, %treg
  2180. mov %treg, %reg
  2181. to
  2182. mov mem, %reg"
  2183. }
  2184. taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
  2185. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
  2186. asml.remove(hp1);
  2187. hp1.free;
  2188. Result:=true;
  2189. Exit;
  2190. end;
  2191. else
  2192. ;
  2193. end
  2194. else
  2195. { %treg is used afterwards, but all eventualities
  2196. other than the first MOV instruction being a constant
  2197. are covered by DeepMOVOpt, so only check for that }
  2198. if (taicpu(p).oper[0]^.typ = top_const) and
  2199. (
  2200. { For MOV operations, a size saving is only made if the register/const is byte-sized }
  2201. not (cs_opt_size in current_settings.optimizerswitches) or
  2202. (taicpu(hp1).opsize = S_B)
  2203. ) and
  2204. (
  2205. (taicpu(hp1).oper[1]^.typ = top_reg) or
  2206. ((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
  2207. ) then
  2208. begin
  2209. DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
  2210. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  2211. end;
  2212. end;
  2213. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2214. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2215. { mov reg1, mem1 or mov mem1, reg1
  2216. mov mem2, reg2 mov reg2, mem2}
  2217. begin
  2218. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2219. { mov reg1, mem1 or mov mem1, reg1
  2220. mov mem2, reg1 mov reg2, mem1}
  2221. begin
  2222. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2223. { Removes the second statement from
  2224. mov reg1, mem1/reg2
  2225. mov mem1/reg2, reg1 }
  2226. begin
  2227. if taicpu(p).oper[0]^.typ=top_reg then
  2228. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2229. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
  2230. asml.remove(hp1);
  2231. hp1.free;
  2232. Result:=true;
  2233. exit;
  2234. end
  2235. else
  2236. begin
  2237. TransferUsedRegs(TmpUsedRegs);
  2238. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2239. if (taicpu(p).oper[1]^.typ = top_ref) and
  2240. { mov reg1, mem1
  2241. mov mem2, reg1 }
  2242. (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
  2243. GetNextInstruction(hp1, hp2) and
  2244. MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
  2245. OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2246. OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
  2247. not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
  2248. { change to
  2249. mov reg1, mem1 mov reg1, mem1
  2250. mov mem2, reg1 cmp reg1, mem2
  2251. cmp mem1, reg1
  2252. }
  2253. begin
  2254. asml.remove(hp2);
  2255. hp2.free;
  2256. taicpu(hp1).opcode := A_CMP;
  2257. taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
  2258. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2259. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2260. DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
  2261. end;
  2262. end;
  2263. end
  2264. else if (taicpu(p).oper[1]^.typ=top_ref) and
  2265. OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2266. begin
  2267. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2268. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2269. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
  2270. end
  2271. else
  2272. begin
  2273. TransferUsedRegs(TmpUsedRegs);
  2274. if GetNextInstruction(hp1, hp2) and
  2275. MatchOpType(taicpu(p),top_ref,top_reg) and
  2276. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2277. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2278. MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
  2279. MatchOpType(taicpu(hp2),top_ref,top_reg) and
  2280. RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2281. if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
  2282. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
  2283. { mov mem1, %reg1
  2284. mov %reg1, mem2
  2285. mov mem2, reg2
  2286. to:
  2287. mov mem1, reg2
  2288. mov reg2, mem2}
  2289. begin
  2290. AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
  2291. DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
  2292. taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
  2293. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  2294. asml.remove(hp2);
  2295. hp2.free;
  2296. end
  2297. {$ifdef i386}
  2298. { this is enabled for i386 only, as the rules to create the reg sets below
  2299. are too complicated for x86-64, so this makes this code too error prone
  2300. on x86-64
  2301. }
  2302. else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
  2303. not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
  2304. not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
  2305. { mov mem1, reg1 mov mem1, reg1
  2306. mov reg1, mem2 mov reg1, mem2
  2307. mov mem2, reg2 mov mem2, reg1
  2308. to: to:
  2309. mov mem1, reg1 mov mem1, reg1
  2310. mov mem1, reg2 mov reg1, mem2
  2311. mov reg1, mem2
  2312. or (if mem1 depends on reg1
  2313. and/or if mem2 depends on reg2)
  2314. to:
  2315. mov mem1, reg1
  2316. mov reg1, mem2
  2317. mov reg1, reg2
  2318. }
  2319. begin
  2320. taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
  2321. taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
  2322. taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
  2323. taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
  2324. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2325. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
  2326. (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2327. AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
  2328. if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  2329. (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2330. AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
  2331. end
  2332. else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
  2333. begin
  2334. taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
  2335. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2336. end
  2337. else
  2338. begin
  2339. asml.remove(hp2);
  2340. hp2.free;
  2341. end
  2342. {$endif i386}
  2343. ;
  2344. end;
  2345. end
  2346. { movl [mem1],reg1
  2347. movl [mem1],reg2
  2348. to
  2349. movl [mem1],reg1
  2350. movl reg1,reg2
  2351. }
          else if MatchOpType(taicpu(p),top_ref,top_reg) and
            MatchOpType(taicpu(hp1),top_ref,top_reg) and
            (taicpu(p).opsize = taicpu(hp1).opsize) and
            RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
            (taicpu(p).oper[0]^.ref^.volatility=[]) and
            (taicpu(hp1).oper[0]^.ref^.volatility=[]) and
            not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
            not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
            begin
              DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
              taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
            end;
          { movl const1,[mem1]
            movl [mem1],reg1
            to
            movl const1,reg1
            movl reg1,[mem1]
          }
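          { For illustration (operands invented): assuming mem1 = (%ebx) and
            reg1 = %eax,
                movl $3,(%ebx)               movl $3,%eax
                movl (%ebx),%eax    become   movl %eax,(%ebx)
            which removes the load's dependency on the preceding store. }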
          if MatchOpType(Taicpu(p),top_const,top_ref) and
            MatchOpType(Taicpu(hp1),top_ref,top_reg) and
            (taicpu(p).opsize = taicpu(hp1).opsize) and
            RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
            not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
            begin
              AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
              taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
              taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
              taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
              taicpu(hp1).fileinfo := taicpu(p).fileinfo;
              DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
              Result:=true;
              exit;
            end;
          { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
        end;
        { search further than the next instruction for a mov }
        if
          { check as much as possible before the expensive GetNextInstructionUsingReg call }
          (taicpu(p).oper[1]^.typ = top_reg) and
          (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
          not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) and
          { we work with hp2 here, so hp1 can still be used later on when
            checking for GetNextInstruction_p }
          { GetNextInstructionUsingReg only searches one instruction ahead unless -O3 is specified }
          GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[1]^.reg) and
          (hp2.typ=ait_instruction) then
          begin
            case taicpu(hp2).opcode of
              A_MOV:
                if MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^.reg) and
                  ((taicpu(p).oper[0]^.typ=top_const) or
                   ((taicpu(p).oper[0]^.typ=top_reg) and
                    not(RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp2))
                   )
                  ) then
                  begin
                    { we have
                        mov x, %treg
                        mov %treg, y
                    }
                    TransferUsedRegs(TmpUsedRegs);
                    TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next));
                    { We don't need to call UpdateUsedRegs for every instruction between
                      p and hp2 because the register we're concerned about will not
                      become deallocated (otherwise GetNextInstructionUsingReg would
                      have stopped at an earlier instruction). [Kit] }
                    TempRegUsed :=
                      RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or
                      RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1);
                    case taicpu(p).oper[0]^.typ Of
                      top_reg:
                        begin
                          { change
                              mov %reg, %treg
                              mov %treg, y
                            to
                              mov %reg, y
                          }
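                          { A sketch with invented registers: if %treg = %edx
                            is not otherwise needed,
                                movl %eax,%edx
                                ...
                                movl %edx,y
                            collapses to "movl %eax,y"; the first MOV is only
                            kept while %edx is still live. }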
                          CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
                          RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
                          if taicpu(hp2).oper[1]^.reg = CurrentReg then
                            begin
                              { %reg = y - remove hp2 completely (doing it here instead of relying on
                                the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
                              if TempRegUsed then
                                begin
                                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b)',hp2);
                                  AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
                                  asml.remove(hp2);
                                  hp2.Free;
                                end
                              else
                                begin
                                  asml.remove(hp2);
                                  hp2.Free;
                                  { We can remove the original MOV too }
                                  DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
                                  RemoveCurrentP(p, hp1);
                                  Result:=true;
                                  Exit;
                                end;
                            end
                          else
                            begin
                              AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
                              taicpu(hp2).loadReg(0, CurrentReg);
                              if TempRegUsed then
                                begin
                                  { Don't remove the first instruction if the temporary register is in use }
                                  DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a)',hp2);
                                  { No need to set Result to True. If there's another instruction later on
                                    that can be optimised, it will be detected when the main Pass 1 loop
                                    reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
                                end
                              else
                                begin
                                  DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
                                  RemoveCurrentP(p, hp1);
                                  Result:=true;
                                  Exit;
                                end;
                            end;
                        end;
                      top_const:
                        if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
                          begin
                            { change
                                mov const, %treg
                                mov %treg, y
                              to
                                mov const, y
                            }
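                            { Sketch with invented operands:
                                  movl $100,%edx
                                  ...
                                  movl %edx,y
                              becomes "movl $100,y", provided the constant fits
                              (hence the longint range check below for the case
                              where y is a reference). }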
                            if (taicpu(hp2).oper[1]^.typ=top_reg) or
                              ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
                              begin
                                RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
                                taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
                                if TempRegUsed then
                                  begin
                                    { Don't remove the first instruction if the temporary register is in use }
                                    DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
                                    { No need to set Result to True. If there's another instruction later on
                                      that can be optimised, it will be detected when the main Pass 1 loop
                                      reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
                                  end
                                else
                                  begin
                                    DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
                                    RemoveCurrentP(p, hp1);
                                    Result:=true;
                                    Exit;
                                  end;
                              end;
                          end;
                      else
                        Internalerror(2019103001);
                    end;
                  end;
              A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
                if MatchOpType(taicpu(hp2), top_reg, top_reg) and
                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
                  SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
                  begin
                    {
                      Change from:
                        mov    ###, %reg
                        ...
                        movs/z %reg,%reg  (Same register, just different sizes)
                      To:
                        movs/z ###, %reg  (Longer version)
                        ...
                        (remove)
                    }
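                    { A hypothetical instance:
                          movb (%ecx),%al
                          ...
                          movzbl %al,%eax
                      becomes a single "movzbl (%ecx),%eax"; for a constant
                      source only the operand size of the MOV is widened. }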
                    DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
                    taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
                    { Keep the first instruction as mov if ### is a constant }
                    if taicpu(p).oper[0]^.typ = top_const then
                      taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
                    else
                      begin
                        taicpu(p).opcode := taicpu(hp2).opcode;
                        taicpu(p).opsize := taicpu(hp2).opsize;
                      end;
                    DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
                    AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
                    AsmL.Remove(hp2);
                    hp2.Free;
                    Result := True;
                    Exit;
                  end;
              else
                ;
            end;
          end;
        if (aoc_MovAnd2Mov_3 in OptsToCheck) and
          (taicpu(p).oper[1]^.typ = top_reg) and
          (taicpu(p).opsize = S_L) and
          GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
          (taicpu(hp2).opcode = A_AND) and
          (MatchOpType(taicpu(hp2),top_const,top_reg) or
           (MatchOpType(taicpu(hp2),top_reg,top_reg) and
            MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
          ) then
          begin
            if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
              begin
                if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
                  ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
                  begin
                    { Optimize out:
                        mov x, %reg
                        and ffffffffh, %reg
                    }
                    DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
                    asml.remove(hp2);
                    hp2.free;
                    Result:=true;
                    exit;
                  end;
              end;
          end;
        { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
          x >= RetOffset) as it doesn't do anything (it writes either to a
          parameter or to the temporary storage room for the function
          result)
        }
        if IsExitCode(hp1) and
          (taicpu(p).oper[1]^.typ = top_ref) and
          (taicpu(p).oper[1]^.ref^.index = NR_NO) and
          (
           (
            (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
            not (
                 assigned(current_procinfo.procdef.funcretsym) and
                 (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
                )
           ) or
           { Also discard writes to the stack that are below the base pointer,
             as this is temporary storage rather than a function result on the
             stack, say. }
           (
            (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
            (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
           )
          ) then
          begin
            asml.remove(p);
            p.free;
            p:=hp1;
            DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
            RemoveLastDeallocForFuncRes(p);
            Result:=true;
            exit;
          end;
        if MatchOpType(taicpu(p),top_reg,top_ref) and
          MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) and
          (taicpu(hp1).oper[1]^.typ = top_ref) and
          RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
          begin
            { change
                mov      reg1, mem1
                test/cmp x, mem1
              to
                mov      reg1, mem1
                test/cmp x, reg1
            }
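            { For example (operands invented): with reg1 = %eax and
              mem1 = (%edx),
                  movl %eax,(%edx)             movl %eax,(%edx)
                  cmpl $1,(%edx)      become   cmpl $1,%eax
              avoiding a second memory access while leaving the flags
              unchanged. }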
            taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
            DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
            AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
            exit;
          end;
        if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
          { If the flags register is in use, don't change the instruction to an
            ADD otherwise this will scramble the flags. [Kit] }
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
          begin
            if MatchOpType(Taicpu(p),top_ref,top_reg) and
              ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
                (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
               ) or
               (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
                (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
               )
              ) then
              { mov reg1,ref
                lea reg2,[reg1,reg2]
                to
                add reg2,ref }
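              { Illustrative sketch (registers invented): with reg1 = %eax,
                reg2 = %edx and ref = 8(%ebp),
                    movl 8(%ebp),%eax
                    leal (%eax,%edx),%edx     ->    addl 8(%ebp),%edx
                valid only because %eax dies after the LEA and the flags
                written by ADD are known to be unused here (checked above). }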
              begin
                TransferUsedRegs(TmpUsedRegs);
                { reg1 may not be used afterwards }
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
                  begin
                    Taicpu(hp1).opcode:=A_ADD;
                    Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
                    DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
                    RemoveCurrentp(p, hp1);
                    result:=true;
                    exit;
                  end;
              end;
            { If the LEA instruction can be converted into an arithmetic instruction,
              it may be possible to then fold it in the next optimisation, otherwise
              there's nothing more that can be optimised here. }
            if not ConvertLEA(taicpu(hp1)) then
              Exit;
          end;
        if (taicpu(p).oper[1]^.typ = top_reg) and
          (hp1.typ = ait_instruction) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2,A_MOV,[]) and
          (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
          (
            IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
          {$ifdef x86_64}
            or
            (
              (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
              IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
            )
          {$endif x86_64}
          ) then
          begin
            if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
              (taicpu(hp2).oper[0]^.typ=top_reg) then
              { change   movsX/movzX    reg/ref, reg2
                         add/sub/or/... reg3/$const, reg2
                         mov            reg2, reg/ref
                         dealloc        reg2
                to
                         add/sub/or/... reg3/$const, reg/ref }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    { by example:
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decl    %eax            addl    %edx,%eax     hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                      ->
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decw    %eax            addw    %edx,%eax     hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                    }
                    DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
                          debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
                          debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
                          debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
                    taicpu(hp1).changeopsize(taicpu(hp2).opsize);
                    {
                      ->
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decw    %si             addw    %dx,%si       hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                    }
                    case taicpu(hp1).ops of
                      1:
                        begin
                          taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
                          if taicpu(hp1).oper[0]^.typ=top_reg then
                            setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                        end;
                      2:
                        begin
                          taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                          if (taicpu(hp1).oper[0]^.typ=top_reg) and
                            (taicpu(hp1).opcode<>A_SHL) and
                            (taicpu(hp1).opcode<>A_SHR) and
                            (taicpu(hp1).opcode<>A_SAR) then
                            setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                        end;
                      else
                        internalerror(2008042701);
                    end;
                    {
                      ->
                        decw    %si             addw    %dx,%si       p
                    }
                    asml.remove(hp2);
                    hp2.Free;
                    RemoveCurrentP(p, hp1);
                    Result:=True;
                    Exit;
                  end;
              end;
            if MatchOpType(taicpu(hp2),top_reg,top_reg) and
              not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
              ((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
               { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
               ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
              )
              {$ifdef i386}
              { byte registers of esi, edi, ebp, esp are not available on i386 }
              and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
              and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
              {$endif i386}
              then
              { change   movsX/movzX    reg/ref, reg2
                         add/sub/or/... regX/$const, reg2
                         mov            reg2, reg3
                         dealloc        reg2
                to
                         movsX/movzX    reg/ref, reg3
                         add/sub/or/... reg3/$const, reg3
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    { by example:
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decl    %eax            addl    %edx,%eax     hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                      ->
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decw    %eax            addw    %edx,%eax     hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                    }
                    DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
                          debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
                          debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
                          debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
                    { limit size of constants as well to avoid assembler errors, but
                      check opsize to avoid overflow when left shifting the 1 }
                    if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
                      taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
                    taicpu(hp1).changeopsize(taicpu(hp2).opsize);
                    taicpu(p).changeopsize(taicpu(hp2).opsize);
                    if taicpu(p).oper[0]^.typ=top_reg then
                      setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                    taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
                    AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
                    {
                      ->
                        movswl  %si,%eax        movswl  %si,%eax      p
                        decw    %si             addw    %dx,%si       hp1
                        movw    %ax,%si         movw    %ax,%si       hp2
                    }
                    case taicpu(hp1).ops of
                      1:
                        begin
                          taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
                          if taicpu(hp1).oper[0]^.typ=top_reg then
                            setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                        end;
                      2:
                        begin
                          taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                          if (taicpu(hp1).oper[0]^.typ=top_reg) and
                            (taicpu(hp1).opcode<>A_SHL) and
                            (taicpu(hp1).opcode<>A_SHR) and
                            (taicpu(hp1).opcode<>A_SAR) then
                            setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                        end;
                      else
                        internalerror(2018111801);
                    end;
                    {
                      ->
                        decw    %si             addw    %dx,%si       p
                    }
                    asml.remove(hp2);
                    hp2.Free;
                  end;
              end;
          end;
        if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
          MatchOperand(Taicpu(p).oper[0]^,0) and
          (Taicpu(p).oper[1]^.typ = top_reg) and
          MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
          MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
          { mov reg1,0
            bts reg1,operand1             -->      mov reg1,operand2
            or  reg1,operand2                      bts reg1,operand1 }
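          { Worked AT&T form (register invented): with reg1 = %eax,
                movl $0,%eax                  movl op2,%eax
                btsl op1,%eax          -->    btsl op1,%eax
                orl  op2,%eax
            i.e. OR-ing into a freshly zeroed register is just a MOV, and the
            BTS is then re-applied to the moved value. }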
          begin
            Taicpu(hp2).opcode:=A_MOV;
            asml.remove(hp1);
            insertllitem(hp2,hp2.next,hp1);
            asml.remove(p);
            p.free;
            p:=hp1;
            Result:=true;
            exit;
          end;
      end;

    function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        Result:=false;
        if taicpu(p).ops <> 2 then
          exit;
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
          (taicpu(hp1).ops = 2) then
          begin
            if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
              (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
              { movXX reg1, mem1     or     movXX mem1, reg1
                movXX mem2, reg2            movXX reg2, mem2 }
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
                  { movXX reg1, mem1     or     movXX mem1, reg1
                    movXX mem2, reg1            movXX reg2, mem1 }
                  begin
                    if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                      begin
                        { Removes the second statement from
                            movXX reg1, mem1/reg2
                            movXX mem1/reg2, reg1
                        }
                        if taicpu(p).oper[0]^.typ=top_reg then
                          AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                        { Removes the second statement from
                            movXX mem1/reg1, reg2
                            movXX reg2, mem1/reg1
                        }
                        if (taicpu(p).oper[1]^.typ=top_reg) and
                          not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
                          begin
                            asml.remove(p);
                            p.free;
                            GetNextInstruction(hp1,p);
                            DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
                          end
                        else
                          DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MovXX 1 done',p);
                        asml.remove(hp1);
                        hp1.free;
                        Result:=true;
                        exit;
                      end
                  end;
              end;
          end;
      end;

    function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            <Op>X    %mreg1,%mreg2  // Op in [ADD,MUL]
            MovX     %mreg2,%mreg1
            dealloc  %mreg2
          by
            <Op>X    %mreg2,%mreg1
          ?
        }
        if GetNextInstruction(p,hp1) and
          { we mix single and double operations here because we assume that the compiler
            generates vmovapd only after double operations and vmovaps only after single operations }
          MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
          (taicpu(p).oper[0]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
                asml.Remove(hp1);
                hp1.Free;
                result:=true;
              end;
          end;
      end;

    function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
      var
        hp1, hp2, hp3: tai;
        l : ASizeInt;
        ref: Integer;
        saveref: treference;
      begin
        Result:=false;
        { removes seg register prefixes from LEA operations, as they
          don't do anything }
        taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
        { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
        if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
          (taicpu(p).oper[0]^.ref^.index = NR_NO) and
          { do not mess with leas accessing the stack pointer }
          (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
          (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
          begin
            if (taicpu(p).oper[0]^.ref^.offset = 0) then
              begin
                if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
                  begin
                    hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
                      taicpu(p).oper[1]^.reg);
                    InsertLLItem(p.previous,p.next, hp1);
                    DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
                    p.free;
                    p:=hp1;
                  end
                else
                  begin
                    DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
                    RemoveCurrentP(p);
                  end;
                Result:=true;
                exit;
              end
            else if (
              { continue to use lea to adjust the stack pointer,
                it is the recommended way, but only if not optimizing for size }
              (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
              (cs_opt_size in current_settings.optimizerswitches)
             ) and
             { If the flags register is in use, don't change the instruction
               to an ADD otherwise this will scramble the flags. [Kit] }
             not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
             ConvertLEA(taicpu(p)) then
              begin
                Result:=true;
                exit;
              end;
          end;
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
          MatchOpType(Taicpu(hp1),top_reg,top_reg) and
          (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
                asml.Remove(hp1);
                hp1.Free;
                result:=true;
              end;
          end;
        { changes
            lea offset1(regX), reg1
            lea offset2(reg1), reg1
          to
            lea offset1+offset2(regX), reg1 }
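        { A simple hedged instance (offsets and registers invented):
              leal 4(%ebx),%eax
              leal 8(%eax),%eax     ->    leal 12(%ebx),%eax
          the intermediate result is folded away; when reg1 is used as the
          second LEA's index, its offset is scaled accordingly. }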
        { for now, we do not mess with the stack pointer, though it might be useful to remove
          unneeded lea sequences on the stack pointer; it needs to be tested in detail }
        if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
          GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
          MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
          (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
          (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
          (taicpu(p).oper[0]^.ref^.symbol=nil) and
          (((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
            (taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) and
            (taicpu(p).oper[0]^.ref^.index=NR_NO) and
            (taicpu(p).oper[0]^.ref^.index=taicpu(hp1).oper[0]^.ref^.index) and
            (taicpu(p).oper[0]^.ref^.scalefactor=taicpu(hp1).oper[0]^.ref^.scalefactor)
           ) or
           ((taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg) and
            (taicpu(p).oper[0]^.ref^.index=NR_NO)
           ) or
           ((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
            (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) and
            (taicpu(p).oper[0]^.ref^.base=NR_NO) and
            not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1)))
          ) and
          not(RegUsedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1)) and
          (taicpu(p).oper[0]^.ref^.relsymbol=taicpu(hp1).oper[0]^.ref^.relsymbol) and
          (taicpu(p).oper[0]^.ref^.segment=taicpu(hp1).oper[0]^.ref^.segment) and
          (taicpu(p).oper[0]^.ref^.symbol=taicpu(hp1).oper[0]^.ref^.symbol) then
          begin
            DebugMsg(SPeepholeOptimization + 'LeaLea2Lea done',p);
            if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
              begin
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                { if the register is used as index and base, we have to increase for base as well
                  and adapt base }
                if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                  begin
                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                  end;
              end
            else
              begin
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
              end;
            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
              begin
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
              end;
            RemoveCurrentP(p);
            result:=true;
            exit;
          end;
        { changes
            lea <ref1>, reg1
            <op> ...,<ref. with reg1>,...
          to
            <op> ...,<ref1>,... }
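        { Sketch with invented operands: if %eax is unused afterwards,
              leal 4(%ebx),%eax
              movl $1,(%eax)        ->    movl $1,4(%ebx)
          i.e. the address computation is folded into the memory operand of
          the following instruction. }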
        if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
          (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
          GetNextInstruction(p,hp1) and
          (hp1.typ=ait_instruction) and
          not(MatchInstruction(hp1,A_LEA,[])) then
          begin
            { find a reference which uses reg1 }
            if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
              ref:=0
            else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
              ref:=1
            else
              ref:=-1;
            if (ref<>-1) and
              { reg1 must be either the base or the index }
              ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
              begin
                { reg1 can be removed from the reference }
                saveref:=taicpu(hp1).oper[ref]^.ref^;
                if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
                else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
                else
                  Internalerror(2019111201);
                { check if we can insert all data of the lea into the second instruction }
                if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                  ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
                  ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
                  ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
                  ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
                  ((taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                  (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
                  {$ifdef x86_64}
                  and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
                  and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                       ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                      )
                  {$endif x86_64}
                  then
                  begin
                    { reg1 might not be used by the second instruction after it is removed from the reference }
                    if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                      begin
                        TransferUsedRegs(TmpUsedRegs);
                        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                        { reg1 is not updated so it might not be used afterwards }
                        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                            if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                            if taicpu(p).oper[0]^.ref^.symbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                            if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                            if not(taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
                              taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                            inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                            RemoveCurrentP(p, hp1);
                            result:=true;
                            exit;
                          end
                      end;
                  end;
                { recover }
                taicpu(hp1).oper[ref]^.ref^:=saveref;
              end;
          end;
      end;

    function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
      var
        hp1 : tai;
      begin
        DoSubAddOpt := False;
        if GetLastInstruction(p, hp1) and
          (hp1.typ = ait_instruction) and
          (taicpu(hp1).opsize = taicpu(p).opsize) then
          case taicpu(hp1).opcode Of
            A_DEC:
              if (taicpu(hp1).oper[0]^.typ = top_reg) and
                MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
                  asml.remove(hp1);
                  hp1.free;
                end;
            A_SUB:
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                begin
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
                  asml.remove(hp1);
                  hp1.free;
                end;
            A_ADD:
              begin
                if MatchOpType(taicpu(hp1),top_const,top_reg) and
                  MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                  begin
                    taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
                    asml.remove(hp1);
                    hp1.free;
                    if (taicpu(p).oper[0]^.val = 0) then
                      begin
                        hp1 := tai(p.next);
                        asml.remove(p);
                        p.free;
                        if not GetLastInstruction(hp1, p) then
                          p := hp1;
                        DoSubAddOpt := True;
                      end
                  end;
              end;
            else
              ;
          end;
      end;

    function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
      {$ifdef i386}
      var
        hp1 : tai;
      {$endif i386}
      begin
        Result:=false;
        { * change "subl $2, %esp; pushw x" to "pushl x" }
        { * change "sub/add const1, reg" or "dec reg" followed by
            "sub const2, reg" to one "sub ..., reg" }
        if MatchOpType(taicpu(p),top_const,top_reg) then
          begin
            {$ifdef i386}
            if (taicpu(p).oper[0]^.val = 2) and
              (taicpu(p).oper[1]^.reg = NR_ESP) and
              { Don't do the sub/push optimization if the sub }
              { comes from setting up the stack frame (JM)    }
              (not(GetLastInstruction(p,hp1)) or
               not(MatchInstruction(hp1,A_MOV,[S_L]) and
                   MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
                   MatchOperand(taicpu(hp1).oper[0]^,NR_EBP))) then
              begin
                hp1 := tai(p.next);
                while Assigned(hp1) and
                  (tai(hp1).typ in [ait_instruction]+SkipInstr) and
                  not RegReadByInstruction(NR_ESP,hp1) and
                  not RegModifiedByInstruction(NR_ESP,hp1) do
                  hp1 := tai(hp1.next);
                if Assigned(hp1) and
                  MatchInstruction(hp1,A_PUSH,[S_W]) then
                  begin
                    taicpu(hp1).changeopsize(S_L);
                    if taicpu(hp1).oper[0]^.typ=top_reg then
                      setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
                    hp1 := tai(p.next);
                    asml.remove(p);
                    p.free;
                    p := hp1;
                    Result:=true;
                    exit;
                  end;
              end;
            {$endif i386}
            if DoSubAddOpt(p) then
              Result:=true;
          end;
      end;

    function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
      var
        TmpBool1,TmpBool2 : Boolean;
        tmpref : treference;
        hp1,hp2: tai;
        mask: tcgint;
      begin
        Result:=false;
        { All these optimisations work on "shl/sal const,%reg" }
        if not MatchOpType(taicpu(p),top_const,top_reg) then
          Exit;
        if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
          (taicpu(p).oper[0]^.val <= 3) then
          { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
          begin
            { should we check the next instruction? }
            TmpBool1 := True;
            { have we found an add/sub which could be
              integrated in the lea? }
            TmpBool2 := False;
            reference_reset(tmpref,2,[]);
            TmpRef.index := taicpu(p).oper[1]^.reg;
            TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
            while TmpBool1 and
              GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              ((((taicpu(hp1).opcode = A_ADD) or
                 (taicpu(hp1).opcode = A_SUB)) and
                (taicpu(hp1).oper[1]^.typ = Top_Reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
               (((taicpu(hp1).opcode = A_INC) or
                 (taicpu(hp1).opcode = A_DEC)) and
                (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
               ((taicpu(hp1).opcode = A_LEA) and
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
              (not GetNextInstruction(hp1,hp2) or
               not instrReadsFlags(hp2)) Do
              begin
                TmpBool1 := False;
                if taicpu(hp1).opcode=A_LEA then
                  begin
                    if (TmpRef.base = NR_NO) and
                      (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                      (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                      (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
                      ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                       (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                      begin
                        TmpBool1 := True;
                        TmpBool2 := True;
                        inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                        if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                          tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                        TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                        asml.remove(hp1);
                        hp1.free;
                      end
                  end
                else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
                  begin
                    TmpBool1 := True;
                    TmpBool2 := True;
                    case taicpu(hp1).opcode of
                      A_ADD:
                        inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                      A_SUB:
                        dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                      else
                        internalerror(2019050536);
                    end;
                    asml.remove(hp1);
                    hp1.free;
                  end
                else
                  if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                    (((taicpu(hp1).opcode = A_ADD) and
                      (TmpRef.base = NR_NO)) or
                     (taicpu(hp1).opcode = A_INC) or
                     (taicpu(hp1).opcode = A_DEC)) then
                    begin
                      TmpBool1 := True;
                      TmpBool2 := True;
                      case taicpu(hp1).opcode of
                        A_ADD:
                          TmpRef.base := taicpu(hp1).oper[0]^.reg;
                        A_INC:
                          inc(TmpRef.offset);
                        A_DEC:
                          dec(TmpRef.offset);
                        else
                          internalerror(2019050535);
                      end;
                      asml.remove(hp1);
                      hp1.free;
                    end;
              end;
            if TmpBool2
              {$ifndef x86_64}
              or
              ((current_settings.optimizecputype < cpu_Pentium2) and
               (taicpu(p).oper[0]^.val <= 3) and
               not(cs_opt_size in current_settings.optimizerswitches))
              {$endif x86_64}
              then
              begin
                if not(TmpBool2) and
                  (taicpu(p).oper[0]^.val=1) then
                  begin
                    hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                      taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
                  end
                else
                  hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
                    taicpu(p).oper[1]^.reg);
                DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end;
          end
        {$ifndef x86_64}
        else if (current_settings.optimizecputype < cpu_Pentium2) then
          begin
            { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
              but faster on a 486, and pairable in both U and V pipes on the Pentium
              (unlike shl, which is only pairable in the U pipe) }
            if taicpu(p).oper[0]^.val=1 then
              begin
                hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                  taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end
            { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
              "shl $3, %reg" to "lea (,%reg,8), %reg" }
            else if (taicpu(p).opsize = S_L) and
              (taicpu(p).oper[0]^.val<= 3) then
              begin
                reference_reset(tmpref,2,[]);
                TmpRef.index := taicpu(p).oper[1]^.reg;
                TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
                hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous, p.next, hp1);
                p.free;
                p := hp1;
              end;
          end
        {$endif x86_64}
        else if
          GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
          (
            (
              MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
              SetAndTest(hp1, hp2)
            {$ifdef x86_64}
            ) or
            (
              MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
              MatchOpType(taicpu(hp2), top_reg, top_reg) and
              (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
            {$endif x86_64}
            )
          ) and
          (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
          begin
            { Change:
                shl x, %reg1
                mov -(1<<x), %reg2
                and %reg2, %reg1
              Or:
                shl x, %reg1
                and -(1<<x), %reg1
              To just:
                shl x, %reg1
              Since the and operation only zeroes bits that are already zero from the shl operation
            }
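            { Numeric illustration (values invented): after "shlq $8,%rax" the
              low 8 bits of %rax are already zero, so a following
                  andq $0xFFFFFFFFFFFFFF00,%rax
              clears nothing new and can be dropped, as the mask table below
              encodes. }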
            case taicpu(p).oper[0]^.val of
              8:
                mask:=$FFFFFFFFFFFFFF00;
              16:
                mask:=$FFFFFFFFFFFF0000;
              32:
                mask:=$FFFFFFFF00000000;
              63:
                { Constant pre-calculated to prevent overflow errors with Int64 }
                mask:=$8000000000000000;
              else
                begin
                  if taicpu(p).oper[0]^.val >= 64 then
                    { Shouldn't happen realistically, since the register
                      is guaranteed to be set to zero at this point }
                    mask := 0
                  else
                    mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
                end;
            end;
            if taicpu(hp1).oper[0]^.val = mask then
              begin
                { Everything checks out, perform the optimisation, as long as
                  the FLAGS register isn't being used }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                {$ifdef x86_64}
                if (hp1 <> hp2) then
                  begin
                    { "shl/mov/and" version }
                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                    { Don't do the optimisation if the FLAGS register is in use }
                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
                        { Don't remove the 'mov' instruction if its register is used elsewhere }
                        if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
                          begin
                            asml.Remove(hp1);
                            hp1.Free;
                            Result := True;
                          end;
                        { Only set Result to True if the 'mov' instruction was removed }
                        asml.Remove(hp2);
                        hp2.Free;
                      end;
                  end
                else
                {$endif x86_64}
                  begin
                    { "shl/and" version }
                    { Don't do the optimisation if the FLAGS register is in use }
                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
                        asml.Remove(hp1);
                        hp1.Free;
                        Result := True;
                      end;
                  end;
                Exit;
              end
            else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
              begin
                { Even if the mask doesn't allow for its removal, we might be
                  able to optimise the mask for the "shl/and" version, which
                  may permit other peephole optimisations }
                {$ifdef DEBUG_AOPTCPU}
                mask := taicpu(hp1).oper[0]^.val and mask;
                if taicpu(hp1).oper[0]^.val <> mask then
                  begin
                    DebugMsg(
                      SPeepholeOptimization +
                      'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
                      ' to $' + debug_tostr(mask) +
                      ' based on previous instruction (ShlAnd2ShlAnd)', hp1);
                    taicpu(hp1).oper[0]^.val := mask;
                  end;
                {$else DEBUG_AOPTCPU}
                { If debugging is off, just set the operand even if it's the same }
                taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
                {$endif DEBUG_AOPTCPU}
              end;
          end;
      end;

    function TX86AsmOptimizer.OptPass1SETcc(var p: tai): boolean;
      var
        hp1,hp2,next: tai; SetC, JumpC: TAsmCond; Unconditional: Boolean;
      begin
        Result:=false;
        if MatchOpType(taicpu(p),top_reg) and
          GetNextInstruction(p, hp1) and
          ((MatchInstruction(hp1, A_TEST, [S_B]) and
            MatchOpType(taicpu(hp1),top_reg,top_reg) and
            (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg)) or
           (MatchInstruction(hp1, A_CMP, [S_B]) and
            MatchOpType(taicpu(hp1),top_const,top_reg) and
            (taicpu(hp1).oper[0]^.val=0))
          ) and
          (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_Jcc, []) then
          { Change from:                To:
             set(C) %reg                 j(~C) label
             test   %reg,%reg/cmp $0,%reg
             je     label

             set(C) %reg                 j(C)  label
             test   %reg,%reg/cmp $0,%reg
             jne    label
          }
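          { Concrete sketch (condition and register invented): when %al is not
            used afterwards,
                sete  %al
                testb %al,%al         ->    je label
                jne   label
            the jump takes its condition from the SETcc (inverted when the
            original jump was JE). }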
          begin
            next := tai(p.Next);
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, next);
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
            JumpC := taicpu(hp2).condition;
            Unconditional := False;
            if conditions_equal(JumpC, C_E) then
              SetC := inverse_cond(taicpu(p).condition)
            else if conditions_equal(JumpC, C_NE) then
              SetC := taicpu(p).condition
            else
              { We've got something weird here (and inefficient) }
              begin
                DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                SetC := C_NONE;
                { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                if condition_in(C_AE, JumpC) then
                  Unconditional := True
                else
                  { Not sure what to do with this jump - drop out }
                  Exit;
              end;
            asml.Remove(hp1);
            hp1.Free;
            if Unconditional then
              MakeUnconditional(taicpu(hp2))
            else
              begin
                if SetC = C_NONE then
                  InternalError(2018061401);
                taicpu(hp2).SetCondition(SetC);
              end;
            if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then
              begin
                asml.Remove(p);
                UpdateUsedRegs(next);
                p.Free;
                Result := True;
                p := hp2;
              end;
            DebugMsg(SPeepholeOptimization + 'SETcc/TESTCmp/Jcc -> Jcc',p);
          end;
      end;

    function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
      { returns true if a "continue" should be done after this optimization }
      var
        hp1, hp2: tai;
      begin
        Result := false;
        if MatchOpType(taicpu(p),top_ref) and
          GetNextInstruction(p, hp1) and
          (hp1.typ = ait_instruction) and
          (((taicpu(hp1).opcode = A_FLD) and
            (taicpu(p).opcode = A_FSTP)) or
           ((taicpu(p).opcode = A_FISTP) and
            (taicpu(hp1).opcode = A_FILD))) and
          MatchOpType(taicpu(hp1),top_ref) and
          (taicpu(hp1).opsize = taicpu(p).opsize) and
          RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
          begin
            { replacing fstp f;fld f by fst f is only valid for extended because of rounding }
            if (taicpu(p).opsize=S_FX) and
              GetNextInstruction(hp1, hp2) and
              (hp2.typ = ait_instruction) and
              IsExitCode(hp2) and
              (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
              not(assigned(current_procinfo.procdef.funcretsym) and
                  (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
              (taicpu(p).oper[0]^.ref^.index = NR_NO) then
              begin
                asml.remove(p);
                asml.remove(hp1);
                p.free;
                hp1.free;
                p := hp2;
                RemoveLastDeallocForFuncRes(p);
                Result := true;
              end
            (* can't be done because the store operation rounds
            else
              { fst can't store an extended value! }
              if (taicpu(p).opsize <> S_FX) and
                (taicpu(p).opsize <> S_IQ) then
                begin
                  if (taicpu(p).opcode = A_FSTP) then
                    taicpu(p).opcode := A_FST
                  else
                    taicpu(p).opcode := A_FIST;
                  asml.remove(hp1);
                  hp1.free;
                end
            *)
          end;
      end;

    function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
      var
        hp1, hp2: tai;
      begin
        result:=false;
        if MatchOpType(taicpu(p),top_reg) and
          GetNextInstruction(p, hp1) and
          (hp1.typ = Ait_Instruction) and
          MatchOpType(taicpu(hp1),top_reg,top_reg) and
          (taicpu(hp1).oper[0]^.reg = NR_ST) and
          (taicpu(hp1).oper[1]^.reg = NR_ST1) then
          { change to
              fld   reg               fxxx reg,st
              fxxxp st, st1 (hp1)
            Remark: non-commutative operations must be reversed!
          }
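          { A hedged example: with reg = %st(2),
                fld   %st(2)
                faddp %st,%st(1)      ->    fadd %st(2),%st
            for the non-commutative FSUBP/FDIVP the reversed opcode
            (FSUBR/FDIVR) is chosen, as the case statement below does. }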
          begin
            case taicpu(hp1).opcode Of
              A_FMULP,A_FADDP,
              A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
                begin
                  case taicpu(hp1).opcode Of
                    A_FADDP: taicpu(hp1).opcode := A_FADD;
                    A_FMULP: taicpu(hp1).opcode := A_FMUL;
                    A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
                    A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
                    A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
                    A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
                    else
                      internalerror(2019050534);
                  end;
                  taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
                  taicpu(hp1).oper[1]^.reg := NR_ST;
                  asml.remove(p);
                  p.free;
                  p := hp1;
                  Result:=true;
                  exit;
                end;
              else
                ;
            end;
          end
        else
          if MatchOpType(taicpu(p),top_ref) and
            GetNextInstruction(p, hp2) and
            (hp2.typ = Ait_Instruction) and
            MatchOpType(taicpu(hp2),top_reg,top_reg) and
            (taicpu(p).opsize in [S_FS, S_FL]) and
            (taicpu(hp2).oper[0]^.reg = NR_ST) and
            (taicpu(hp2).oper[1]^.reg = NR_ST1) then
            if GetLastInstruction(p, hp1) and
              MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
              MatchOpType(taicpu(hp1),top_ref) and
              RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
              if ((taicpu(hp2).opcode = A_FMULP) or
                  (taicpu(hp2).opcode = A_FADDP)) then
                { change to
                    fld/fst mem1  (hp1)      fld/fst mem1
                    fld     mem1  (p)        fadd/
                    faddp/                   fmul    st, st
                    fmulp   st, st1 (hp2) }
                begin
                  asml.remove(p);
                  p.free;
                  p := hp1;
                  if (taicpu(hp2).opcode = A_FADDP) then
                    taicpu(hp2).opcode := A_FADD
                  else
                    taicpu(hp2).opcode := A_FMUL;
                  taicpu(hp2).oper[1]^.reg := NR_ST;
                end
              else
                { change to
                    fld/fst mem1 (hp1)   fld/fst mem1
                    fld     mem1 (p)     fld     st }
                begin
                  taicpu(p).changeopsize(S_FL);
                  taicpu(p).loadreg(0,NR_ST);
                end
            else
              begin
                case taicpu(hp2).opcode Of
                  A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
                    { change to
                        fld/fst mem1 (hp1)      fld/fst mem1
                        fld     mem2 (p)        fxxx    mem2
                        fxxxp   st, st1 (hp2) }
                    begin
                      case taicpu(hp2).opcode Of
                        A_FADDP: taicpu(p).opcode := A_FADD;
                        A_FMULP: taicpu(p).opcode := A_FMUL;
                        A_FSUBP: taicpu(p).opcode := A_FSUBR;
                        A_FSUBRP: taicpu(p).opcode := A_FSUB;
                        A_FDIVP: taicpu(p).opcode := A_FDIVR;
                        A_FDIVRP: taicpu(p).opcode := A_FDIV;
                        else
                          internalerror(2019050533);
                      end;
                      asml.remove(hp2);
                      hp2.free;
                    end
                  else
                    ;
                end
              end
      end;

    function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
      var
        v: TCGInt;
        hp1, hp2: tai;
      begin
        Result:=false;
        if taicpu(p).oper[0]^.typ = top_const then
          begin
            { Though GetNextInstruction can be factored out, it is an expensive
              call, so delay calling it until we have first checked cheaper
              conditions that are independent of it. }
            if (taicpu(p).oper[0]^.val = 0) and
              (taicpu(p).oper[1]^.typ = top_reg) and
              GetNextInstruction(p, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
              begin
                hp2 := p;
                { When dealing with "cmp $0,%reg", only ZF and SF contain
                  anything meaningful once it's converted to "test %reg,%reg";
                  additionally, some jumps will always (or never) branch, so
                  evaluate every jump immediately following the
                  comparison, optimising the conditions if possible.
                  Similarly with SETcc... those that are always set to 0 or 1
                  are changed to MOV instructions }
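                { For instance (register invented): "cmpb $0,%al; jb @lbl" can
                  never branch, since an unsigned value is never below zero,
                  while "cmpb $0,%al; jae @lbl" always branches; BE/A collapse
                  to E/NE, as handled case by case below. }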
                while GetNextInstruction(hp2, hp1) and
                  MatchInstruction(hp1,A_Jcc,A_SETcc,[]) do
                  begin
                    case taicpu(hp1).condition of
                      C_B, C_C, C_NAE, C_O:
                        { For B/NAE:
                            Will never branch since an unsigned integer can never be below zero
                          For C/O:
                            Result cannot overflow because 0 is being subtracted
                        }
                        begin
                          if taicpu(hp1).opcode = A_Jcc then
                            begin
                              DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                              TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                              AsmL.Remove(hp1);
                              hp1.Free;
                              { Since hp1 was deleted, hp2 must not be updated }
                              Continue;
                            end
                          else
                            begin
                              DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                              { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                              taicpu(hp1).opcode := A_MOV;
                              taicpu(hp1).ops := 2;
                              taicpu(hp1).condition := C_None;
                              taicpu(hp1).opsize := S_B;
                              taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                              taicpu(hp1).loadconst(0, 0);
                            end;
                        end;
                      C_BE, C_NA:
                        begin
                          { Will only branch if equal to zero }
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                          taicpu(hp1).condition := C_E;
                        end;
                      C_A, C_NBE:
                        begin
                          { Will only branch if not equal to zero }
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                          taicpu(hp1).condition := C_NE;
                        end;
                      C_AE, C_NB, C_NC, C_NO:
                        begin
                          { Will always branch }
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                          if taicpu(hp1).opcode = A_Jcc then
                            begin
                              MakeUnconditional(taicpu(hp1));
                              { Any jumps/set that follow will now be dead code }
                              RemoveDeadCodeAfterJump(taicpu(hp1));
                              Break;
                            end
                          else
                            begin
                              { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                              taicpu(hp1).opcode := A_MOV;
                              taicpu(hp1).ops := 2;
                              taicpu(hp1).condition := C_None;
                              taicpu(hp1).opsize := S_B;
                              taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                              taicpu(hp1).loadconst(0, 1);
                            end;
                        end;
                      C_None:
                        InternalError(2020012201);
                      C_P, C_PE, C_NP, C_PO:
                        { We can't handle parity checks and they should never be generated
                          after a general-purpose CMP (it's used in some floating-point
                          comparisons that don't use CMP) }
                        InternalError(2020012202);
                      else
                        { Zero/Equality, Sign, their complements and all of the
                          signed comparisons do not need to be converted };
                    end;
                    hp2 := hp1;
                  end;
                { Convert the instruction to a TEST }
                taicpu(p).opcode := A_TEST;
                taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                Result := True;
                Exit;
              end
            else if (taicpu(p).oper[0]^.val = 1) and
              GetNextInstruction(p, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
              (taicpu(hp1).condition in [C_L, C_NGE]) then
              begin
                { Convert;          To:
                    cmp $1,r/m          cmp $0,r/m
                    jl  @lbl            jle @lbl
                }
                DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
                taicpu(p).oper[0]^.val := 0;
                taicpu(hp1).condition := C_LE;
                { If the instruction is now "cmp $0,%reg", convert it to a
                  TEST (and effectively do the work of the "cmp $0,%reg" in
                  the block above)
                  If it's a reference, we can get away with not setting
                  Result to True because we haven't evaluated the jump
                  in this pass yet.
                }
                if (taicpu(p).oper[1]^.typ = top_reg) then
                  begin
                    taicpu(p).opcode := A_TEST;
                    taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                    Result := True;
                  end;
                Exit;
              end
        else if (taicpu(p).oper[1]^.typ = top_reg) then
          begin
            { cmp register,$8000            neg register
              je  target              -->   jo  target
              .... only if register is deallocated before jump. }
            case Taicpu(p).opsize of
              S_B: v:=$80;
              S_W: v:=$8000;
              S_L: v:=qword($80000000);
              { S_Q will never happen: cmp with 64 bit constants is not possible }
              S_Q:
                Exit;
              else
                internalerror(2013112905);
            end;
            if (taicpu(p).oper[0]^.val=v) and
              GetNextInstruction(p, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
              (Taicpu(hp1).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                    Taicpu(p).opcode:=A_NEG;
                    Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                    Taicpu(p).clearop(1);
                    Taicpu(p).ops:=1;
                    if Taicpu(hp1).condition=C_E then
                      Taicpu(hp1).condition:=C_O
                    else
                      Taicpu(hp1).condition:=C_NO;
                    Result:=true;
                    exit;
                  end;
              end;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
  var
    hp1: tai;
  begin
    {
      remove the second (v)pxor from
        pxor reg,reg
        ...
        pxor reg,reg
    }
    Result:=false;
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      MatchOpType(taicpu(p),top_reg,top_reg) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
      MatchInstruction(taicpu(hp1),taicpu(p).opcode,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
        asml.Remove(hp1);
        hp1.Free;
        Result:=true;
        Exit;
      end;
  end;
function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
  var
    hp1: tai;
  begin
    {
      remove the second (v)pxor from
        (v)pxor reg,reg
        ...
        (v)pxor reg,reg
    }
    Result:=false;
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
      MatchInstruction(taicpu(hp1),taicpu(p).opcode,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'VPXorVPXor2PXor done',hp1);
        asml.Remove(hp1);
        hp1.Free;
        Result:=true;
        Exit;
      end;
  end;
function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

  function IsXCHGAcceptable: Boolean; inline;
    begin
      { Always accept if optimising for size }
      Result := (cs_opt_size in current_settings.optimizerswitches) or
        (
{$ifdef x86_64}
          { XCHG takes 3 cycles on AMD Athlon64 }
          (current_settings.optimizecputype >= cpu_core_i)
{$else x86_64}
          { From the Pentium M onwards, XCHG only has a latency of 2 rather
            than 3, so it becomes a saving compared to three MOVs with two of
            them able to execute simultaneously. [Kit] }
          (current_settings.optimizecputype >= cpu_PentiumM)
{$endif x86_64}
        );
    end;

  var
    NewRef: TReference;
    hp1,hp2,hp3: tai;
{$ifndef x86_64}
    hp4: tai;
    OperIdx: Integer;
{$endif x86_64}
  begin
    Result:=false;
    if not GetNextInstruction(p, hp1) then
      Exit;
    if MatchInstruction(hp1, A_JMP, [S_NO]) then
      begin
        { Sometimes the MOVs that OptPass2JMP produces can be improved
          further, but we can't just put this jump optimisation in pass 1
          because it tends to perform worse when conditional jumps are
          nearby (e.g. when converting CMOV instructions). [Kit] }
        if OptPass2JMP(hp1) then
          { call OptPass1MOV once to potentially merge any MOVs that were created }
          Result := OptPass1MOV(p)
          { OptPass2MOV will now exit but will be called again if OptPass1MOV
            returned True and the instruction is still a MOV, thus checking
            the optimisations below }
          { If OptPass2JMP returned False, no optimisations were done to
            the jump and there are no further optimisations that can be done
            to the MOV instruction on this pass }
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
      MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp1),top_const,top_reg) and
      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
      { be lazy, checking separately for sub would be slightly better }
      (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
      begin
        { Change:
            movl/q %reg1,%reg2        movl/q %reg1,%reg2
            addl/q $x,%reg2           subl/q $x,%reg2
          To:
            leal/q x(%reg1),%reg2     leal/q -x(%reg1),%reg2
        }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
        if not GetNextInstruction(hp1, hp2) or
          (
            { The FLAGS register isn't always tracked properly, so do not
              perform this optimisation if a conditional statement follows }
            not RegReadByInstruction(NR_DEFAULTFLAGS, hp2) and
            not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)
          ) then
          begin
            reference_reset(NewRef, 1, []);
            NewRef.base := taicpu(p).oper[0]^.reg;
            NewRef.scalefactor := 1;
            if taicpu(hp1).opcode = A_ADD then
              begin
                DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
                NewRef.offset := taicpu(hp1).oper[0]^.val;
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
                NewRef.offset := -taicpu(hp1).oper[0]^.val;
              end;
            taicpu(p).opcode := A_LEA;
            taicpu(p).loadref(0, NewRef);
            Asml.Remove(hp1);
            hp1.Free;
            Result := True;
            Exit;
          end;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
{$ifdef x86_64}
      MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
{$else x86_64}
      MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
{$endif x86_64}
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
      { mov       reg1, reg2                mov       reg1, reg2
        movzx/sx  reg2, reg3      to        movzx/sx  reg1, reg3 }
      begin
        taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
        DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
        { Don't remove the MOV command without first checking that reg2 isn't
          used afterwards, unless supreg(reg3) = supreg(reg2). [Kit] }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
          not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
        then
          begin
            asml.remove(p);
            p.free;
            p := hp1;
            Result:=true;
          end;
        exit;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      IsXCHGAcceptable and
      { this optimisation is not applied to 8-bit registers }
      (taicpu(p).opsize <> S_B) and
      MatchInstruction(hp1, A_MOV, []) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2, A_MOV, []) and
      { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
      begin
        { mov %reg1,%reg2
          mov %reg3,%reg1        ->        xchg %reg3,%reg1
          mov %reg2,%reg3
          (%reg2 not used afterwards)
          Note that xchg takes 3 cycles to execute, and generally mov's take
          only one cycle apiece, but the first two mov's can be executed in
          parallel, only taking 2 cycles overall.  Older processors should
          therefore only optimise for size. [Kit]
        }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
        if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
            AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
            taicpu(hp1).opcode := A_XCHG;
            asml.Remove(p);
            asml.Remove(hp2);
            p.Free;
            hp2.Free;
            p := hp1;
            Result := True;
            Exit;
          end;
      end
    else if MatchOpType(taicpu(p),top_reg,top_reg) and
      MatchInstruction(hp1, A_SAR, []) then
      begin
        if MatchOperand(taicpu(hp1).oper[0]^, 31) then
          begin
            { the use of %edx also covers the opsize being S_L }
            if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
              begin
                { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
                if (taicpu(p).oper[0]^.reg = NR_EAX) and
                  (taicpu(p).oper[1]^.reg = NR_EDX) then
                  begin
                    { Change:
                        movl %eax,%edx
                        sarl $31,%edx
                      To:
                        cltd
                    }
                    DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
                    Asml.Remove(hp1);
                    hp1.Free;
                    taicpu(p).opcode := A_CDQ;
                    taicpu(p).opsize := S_NO;
                    taicpu(p).clearop(1);
                    taicpu(p).clearop(0);
                    taicpu(p).ops:=0;
                    Result := True;
                  end
                else if (cs_opt_size in current_settings.optimizerswitches) and
                  (taicpu(p).oper[0]^.reg = NR_EDX) and
                  (taicpu(p).oper[1]^.reg = NR_EAX) then
                  begin
                    { Change:
                        movl %edx,%eax
                        sarl $31,%edx
                      To:
                        movl %edx,%eax
                        cltd
                      Note that this creates a dependency between the two instructions,
                      so only perform if optimising for size.
                    }
                    DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
                    taicpu(hp1).opcode := A_CDQ;
                    taicpu(hp1).opsize := S_NO;
                    taicpu(hp1).clearop(1);
                    taicpu(hp1).clearop(0);
                    taicpu(hp1).ops:=0;
                  end;
{$ifndef x86_64}
              end
            { Don't bother if CMOV is supported, because a more optimal
              sequence would have been generated for the Abs() intrinsic }
            else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
              { the use of %eax also covers the opsize being S_L }
              MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
              (taicpu(p).oper[0]^.reg = NR_EAX) and
              (taicpu(p).oper[1]^.reg = NR_EDX) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_XOR, [S_L]) and
              MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
              MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
              GetNextInstruction(hp2, hp3) and
              MatchInstruction(hp3, A_SUB, [S_L]) and
              MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
              MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
              begin
                { Change:
                    movl %eax,%edx
                    sarl $31,%eax
                    xorl %eax,%edx
                    subl %eax,%edx
                    (Instruction that uses %edx)
                    (%eax deallocated)
                    (%edx deallocated)
                  To:
                    cltd
                    xorl %edx,%eax  <-- Note the registers have swapped
                    subl %edx,%eax
                    (Instruction that uses %eax)  <-- %eax rather than %edx
                }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
                  begin
                    if GetNextInstruction(hp3, hp4) and
                      not RegModifiedByInstruction(NR_EDX, hp4) and
                      not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
                        taicpu(p).opcode := A_CDQ;
                        taicpu(p).clearop(1);
                        taicpu(p).clearop(0);
                        taicpu(p).ops:=0;
                        AsmL.Remove(hp1);
                        hp1.Free;
                        taicpu(hp2).loadreg(0, NR_EDX);
                        taicpu(hp2).loadreg(1, NR_EAX);
                        taicpu(hp3).loadreg(0, NR_EDX);
                        taicpu(hp3).loadreg(1, NR_EAX);
                        AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
                        { Convert references in the following instruction (hp4) from %edx to %eax }
                        for OperIdx := 0 to taicpu(hp4).ops - 1 do
                          with taicpu(hp4).oper[OperIdx]^ do
                            case typ of
                              top_reg:
                                if getsupreg(reg) = RS_EDX then
                                  reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
                              top_ref:
                                begin
                                  if getsupreg(ref^.base) = RS_EDX then
                                    ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.base));
                                  if getsupreg(ref^.index) = RS_EDX then
                                    ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.index));
                                end;
                              else
                                ;
                            end;
                      end;
                  end;
{$else x86_64}
              end;
          end
        else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
          { the use of %rdx also covers the opsize being S_Q }
          MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
          begin
            { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
            if (taicpu(p).oper[0]^.reg = NR_RAX) and
              (taicpu(p).oper[1]^.reg = NR_RDX) then
              begin
                { Change:
                    movq %rax,%rdx
                    sarq $63,%rdx
                  To:
                    cqto
                }
                DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
                Asml.Remove(hp1);
                hp1.Free;
                taicpu(p).opcode := A_CQO;
                taicpu(p).opsize := S_NO;
                taicpu(p).clearop(1);
                taicpu(p).clearop(0);
                taicpu(p).ops:=0;
                Result := True;
              end
            else if (cs_opt_size in current_settings.optimizerswitches) and
              (taicpu(p).oper[0]^.reg = NR_RDX) and
              (taicpu(p).oper[1]^.reg = NR_RAX) then
              begin
                { Change:
                    movq %rdx,%rax
                    sarq $63,%rdx
                  To:
                    movq %rdx,%rax
                    cqto
                  Note that this creates a dependency between the two instructions,
                  so only perform if optimising for size.
                }
                DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
                taicpu(hp1).opcode := A_CQO;
                taicpu(hp1).opsize := S_NO;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
{$endif x86_64}
              end;
          end;
      end
    else if MatchInstruction(hp1, A_MOV, []) and
      (taicpu(hp1).oper[1]^.typ = top_reg) then
      { Though "GetNextInstruction" could be factored out, along with
        the instructions that depend on hp2, it is an expensive call that
        should be delayed for as long as possible, hence we do cheaper
        checks first that are likely to be False. [Kit] }
      begin
        if MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
          (
            (
              (taicpu(hp1).oper[1]^.reg = NR_EAX) and
              (
                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
              )
            ) or
            (
              (taicpu(hp1).oper[1]^.reg = NR_EDX) and
              (
                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
              )
            )
          ) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_SAR, []) and
          MatchOperand(taicpu(hp2).oper[0]^, 31) then
          begin
            if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
              begin
                { Change:
                    movl r/m,%edx         movl r/m,%eax         movl r/m,%edx         movl r/m,%eax
                    movl %edx,%eax   or   movl %eax,%edx   or   movl r/m,%eax   or    movl r/m,%edx
                    sarl $31,%edx         sarl $31,%edx         sarl $31,%edx         sarl $31,%edx
                  To:
                    movl r/m,%eax    <- Note the change in register
                    cltd
                }
                DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
                AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
                taicpu(p).loadreg(1, NR_EAX);
                taicpu(hp1).opcode := A_CDQ;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
                AsmL.Remove(hp2);
                hp2.Free;
(*
{$ifdef x86_64}
              end
            else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
              { This code sequence does not get generated - however it might become useful
                if and when 128-bit signed integer types make an appearance, so the code
                is kept here for when it is eventually needed. [Kit] }
              (
                (
                  (taicpu(hp1).oper[1]^.reg = NR_RAX) and
                  (
                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                    MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
                  )
                ) or
                (
                  (taicpu(hp1).oper[1]^.reg = NR_RDX) and
                  (
                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                    MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
                  )
                )
              ) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_SAR, [S_Q]) and
              MatchOperand(taicpu(hp2).oper[0]^, 63) and
              MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
              begin
                { Change:
                    movq r/m,%rdx         movq r/m,%rax         movq r/m,%rdx         movq r/m,%rax
                    movq %rdx,%rax   or   movq %rax,%rdx   or   movq r/m,%rax   or    movq r/m,%rdx
                    sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx
                  To:
                    movq r/m,%rax    <- Note the change in register
                    cqto
                }
                DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
                AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
                taicpu(p).loadreg(1, NR_RAX);
                taicpu(hp1).opcode := A_CQO;
                taicpu(hp1).clearop(1);
                taicpu(hp1).clearop(0);
                taicpu(hp1).ops:=0;
                AsmL.Remove(hp2);
                hp2.Free;
{$endif x86_64}
*)
              end;
          end;
{$ifdef x86_64}
      end
    else if (taicpu(p).opsize = S_L) and
      (taicpu(p).oper[1]^.typ = top_reg) and
      (
        MatchInstruction(hp1, A_MOV,[]) and
        (taicpu(hp1).opsize = S_L) and
        (taicpu(hp1).oper[1]^.typ = top_reg)
      ) and (
        GetNextInstruction(hp1, hp2) and
        (tai(hp2).typ=ait_instruction) and
        (taicpu(hp2).opsize = S_Q) and
        (
          (
            MatchInstruction(hp2, A_ADD,[]) and
            (taicpu(hp2).opsize = S_Q) and
            (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
            (
              (
                (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            )
          ) or (
            MatchInstruction(hp2, A_LEA,[]) and
            (taicpu(hp2).oper[0]^.ref^.offset = 0) and
            (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
            (
              (
                (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            ) and (
              (
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
              ) or (
                (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
              )
            )
          )
        )
      ) and (
        GetNextInstruction(hp2, hp3) and
        MatchInstruction(hp3, A_SHR,[]) and
        (taicpu(hp3).opsize = S_Q) and
        (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
        (taicpu(hp3).oper[0]^.val = 1) and
        (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
      ) then
      begin
        { Change   movl x,    reg1d               movl x,    reg1d
                   movl y,    reg2d               movl y,    reg2d
                   addq reg2q,reg1q       or      leaq (reg1q,reg2q),reg1q
                   shrq $1,   reg1q               shrq $1,   reg1q
          ( reg1d and reg2d can be switched around in the first two instructions )
          To       movl x,    reg1d
                   addl y,    reg1d
                   rcrl $1,   reg1d
          This corresponds to the common expression (x + y) shr 1, where
          x and y are Cardinals (replacing "shr 1" with "div 2" produces
          smaller code, but won't account for x + y causing an overflow). [Kit]
        }
        if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
          { Change first MOV command to have the same register as the final output }
          taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
        else
          taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
        { Change second MOV command to an ADD command. This is easier than
          converting the existing command because it means we don't have to
          touch 'y', which might be a complicated reference, and also the
          fact that the third command might either be ADD or LEA. [Kit] }
        taicpu(hp1).opcode := A_ADD;
        { Delete old ADD/LEA instruction }
        asml.remove(hp2);
        hp2.free;
        { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
        taicpu(hp3).opcode := A_RCR;
        taicpu(hp3).changeopsize(S_L);
        setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{$endif x86_64}
      end;
  end;
function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    if (taicpu(p).ops >= 2) and
      ((taicpu(p).oper[0]^.typ = top_const) or
       ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
      (taicpu(p).oper[1]^.typ = top_reg) and
      ((taicpu(p).ops = 2) or
       ((taicpu(p).oper[2]^.typ = top_reg) and
        (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
      GetLastInstruction(p,hp1) and
      MatchInstruction(hp1,A_MOV,[]) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
          ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
          { change
              mov  reg1,reg2
              imul y,reg2      to      imul y,reg1,reg2 }
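          { Illustrative example (not from the original source), assuming the
            register-usage checks above pass:
              movl  %esi,%eax
              imull $5,%eax         -->     imull $5,%esi,%eax }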
          begin
            taicpu(p).ops := 3;
            taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
            taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
            DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
            asml.remove(hp1);
            hp1.free;
            result:=true;
          end;
      end;
  end;
procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
  var
    ThisLabel: TAsmLabel;
  begin
    ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
    ThisLabel.decrefs;
    taicpu(p).opcode := A_RET;
    taicpu(p).is_jmp := false;
    taicpu(p).ops := taicpu(ret_p).ops;
    case taicpu(ret_p).ops of
      0:
        taicpu(p).clearop(0);
      1:
        taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
      else
        internalerror(2016041301);
    end;
    { If the original label is now dead, it might turn out that the label
      immediately follows p.  As a result, everything beyond it, which will
      be just some final register configuration and a RET instruction, is
      now dead code. [Kit] }
    { NOTE: This is much faster than introducing an OptPass2RET routine and
      running RemoveDeadCodeAfterJump for each RET instruction, because
      this optimisation rarely happens and most RETs appear at the end of
      routines where there is nothing that can be stripped. [Kit] }
    if not ThisLabel.is_used then
      RemoveDeadCodeAfterJump(p);
  end;
function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
  var
    hp1, hp2, hp3: tai;
    OperIdx: Integer;
  begin
    result:=false;
    if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
      begin
        hp1:=getlabelwithsym(tasmlabel(taicpu(p).oper[0]^.ref^.symbol));
        if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
          begin
            case taicpu(hp1).opcode of
              A_RET:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         ret
                  into
                         ret
                }
                begin
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end;
              A_MOV:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         mov ##, ##
                         ret
                  into
                         mov ##, ##
                         ret
                }
                { This optimisation tends to increase code size if the pass 1 MOV optimisations aren't
                  re-run, so only do this particular optimisation if optimising for speed or when
                  optimisations are very in-depth. [Kit] }
                if (current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size] then
                  begin
                    GetNextInstruction(hp1, hp2);
                    if not Assigned(hp2) then
                      Exit;
                    if (hp2.typ in [ait_label, ait_align]) then
                      SkipLabels(hp2,hp2);
                    if Assigned(hp2) and MatchInstruction(hp2, A_RET, [S_NO]) then
                      begin
                        { Duplicate the MOV instruction }
                        hp3:=tai(hp1.getcopy);
                        asml.InsertBefore(hp3, p);
                        { Make sure the compiler knows about any final registers written here }
                        for OperIdx := 0 to 1 do
                          with taicpu(hp3).oper[OperIdx]^ do
                            begin
                              case typ of
                                top_ref:
                                  begin
                                    if (ref^.base <> NR_NO) {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);
                                    if (ref^.index <> NR_NO) {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                                  end;
                                top_reg:
                                  AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                                else
                                  ;
                              end;
                            end;
                        { Now change the jump into a RET instruction }
                        ConvertJumpToRET(p, hp2);
                        result:=true;
                      end;
                  end;
              else
                ;
            end;
          end;
      end;
  end;
class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
  begin
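    { Illustrative examples (not from the original source) of what this
      predicate accepts or rejects:
        accepted:  movl %edx,%eax          (register source)
                   movq sym(%rip),%rax     (safe RIP-relative reference)
        rejected:  movl (%rax),%ecx        (may fault if the move is not taken)
                   movb %dl,%al            (no byte-sized CMOV exists) }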
    CanBeCMOV:=assigned(p) and
      MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
      { we can't use cmov ref,reg because
        ref could be nil and cmov still throws an exception
        if ref=nil but the mov isn't done (FK)
        or ((taicpu(p).oper[0]^.typ = top_ref) and
            (taicpu(p).oper[0]^.ref^.refaddr = addr_no))
      }
      (taicpu(p).oper[1]^.typ = top_reg) and
      (
        (taicpu(p).oper[0]^.typ = top_reg) or
        { allow references, but only pure symbols or GOT-relative addressing
          with RIP as base, since these are not expected to cause a
          segmentation violation }
        (
          (taicpu(p).oper[0]^.typ = top_ref) and
          IsRefSafe(taicpu(p).oper[0]^.ref)
        )
      );
  end;
function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  var
    hp1,hp2,hp3,hp4,hpmov2: tai;
    carryadd_opcode : TAsmOp;
    l : Longint;
    condition : TAsmCond;
    symbol: TAsmSymbol;
    reg: tsuperregister;
    regavailable: Boolean;
  begin
    result:=false;
    symbol:=nil;
    if GetNextInstruction(p,hp1) then
      begin
        symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        if (hp1.typ=ait_instruction) and
          GetNextInstruction(hp1,hp2) and
          ((hp2.typ=ait_label) or
           { trick to skip align }
           ((hp2.typ=ait_align) and GetNextInstruction(hp2,hp2) and (hp2.typ=ait_label))
          ) and
          (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
          {      jb    @@1                        cmc
                 inc/dec operand          -->     adc/sbb operand,0
            @@1:
            ... and ...
                 jnb   @@1
                 inc/dec operand          -->     adc/sbb operand,0
            @@1: }
          begin
            carryadd_opcode:=A_NONE;
            if Taicpu(p).condition in [C_NAE,C_B,C_C] then
              begin
                if (Taicpu(hp1).opcode=A_INC) or
                  ((Taicpu(hp1).opcode=A_ADD) and
                   MatchOptype(Taicpu(hp1),top_const,top_reg) and
                   (Taicpu(hp1).oper[0]^.val=1)
                  ) then
                  carryadd_opcode:=A_ADC;
                if (Taicpu(hp1).opcode=A_DEC) or
                  ((Taicpu(hp1).opcode=A_SUB) and
                   MatchOptype(Taicpu(hp1),top_const,top_reg) and
                   (Taicpu(hp1).oper[0]^.val=1)
                  ) then
                  carryadd_opcode:=A_SBB;
                if carryadd_opcode<>A_NONE then
                  begin
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=0;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_CMC;
                    Taicpu(p).condition:=C_NONE;
                    DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                    Taicpu(hp1).ops:=2;
                    if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                    else
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                    Taicpu(hp1).loadconst(0,0);
                    Taicpu(hp1).opcode:=carryadd_opcode;
                    result:=true;
                    exit;
                  end;
              end
            else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
              begin
                if (Taicpu(hp1).opcode=A_INC) or
                  ((Taicpu(hp1).opcode=A_ADD) and
                   MatchOptype(Taicpu(hp1),top_const,top_reg) and
                   (Taicpu(hp1).oper[0]^.val=1)
                  ) then
                  carryadd_opcode:=A_ADC;
                if (Taicpu(hp1).opcode=A_DEC) or
                  ((Taicpu(hp1).opcode=A_SUB) and
                   MatchOptype(Taicpu(hp1),top_const,top_reg) and
                   (Taicpu(hp1).oper[0]^.val=1)
                  ) then
                  carryadd_opcode:=A_SBB;
                if carryadd_opcode<>A_NONE then
                  begin
                    Taicpu(hp1).ops:=2;
                    DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                    if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                    else
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                    Taicpu(hp1).loadconst(0,0);
                    Taicpu(hp1).opcode:=carryadd_opcode;
                    RemoveCurrentP(p, hp1);
                    result:=true;
                    exit;
                  end;
              end
            {
                 jcc   @@1                        setcc  tmpreg
                 inc/dec/add/sub operand    ->    (movzx tmpreg)
              @@1:                                add/sub tmpreg,operand
              While this increases code size slightly, it makes the code much faster if the
              jump is unpredictable
            }
            else if not(cs_opt_size in current_settings.optimizerswitches) and
              ((((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
                (Taicpu(hp1).oper[0]^.typ=top_const) and
                (Taicpu(hp1).oper[1]^.typ=top_reg) and
                (Taicpu(hp1).oper[0]^.val=1)) or
               ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
              ) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                { search for an available register which is volatile }
                regavailable:=false;
                for reg in tcpuregisterset do
                  begin
                    if (reg in paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption)) and
                      not(reg in TmpUsedRegs[R_INTREGISTER].GetUsedRegs) and
                      not(RegInInstruction(newreg(R_INTREGISTER,reg,R_SUBL),hp1))
{$ifdef i386}
                      and (reg in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
{$endif i386}
                    then
                      begin
                        regavailable:=true;
                        break;
                      end;
                  end;
                if regavailable then
                  begin
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=1;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_SETcc;
                    DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                    Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                    Taicpu(p).loadreg(0,newreg(R_INTREGISTER,reg,R_SUBL));
                    if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                      begin
                        case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                          R_SUBW:
                            hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,newreg(R_INTREGISTER,reg,R_SUBL),
                              newreg(R_INTREGISTER,reg,R_SUBW));
                          R_SUBD,
                          R_SUBQ:
                            hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,newreg(R_INTREGISTER,reg,R_SUBL),
                              newreg(R_INTREGISTER,reg,R_SUBD));
                          else
                            Internalerror(2020030601);
                        end;
                        taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                        asml.InsertAfter(hp2,p);
                      end;
                    if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                      begin
                        Taicpu(hp1).ops:=2;
                        Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                      end;
                    Taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)));
                    AllocRegBetween(newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)),p,hp1,UsedRegs);
                  end;
              end;
          end;
        { Detect the following:
              jmp<cond>  @Lbl1
              jmp        @Lbl2
              ...
          @Lbl1:
              ret
          Change to:
              jmp<inv_cond> @Lbl2
              ret
        }
        if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2:=getlabelwithsym(TAsmLabel(symbol));
            if Assigned(hp2) and SkipLabels(hp2,hp2) and
              MatchInstruction(hp2,A_RET,[S_NO]) then
              begin
                taicpu(p).condition := inverse_cond(taicpu(p).condition);
                { Change label address to that of the unconditional jump }
                taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
                TAsmLabel(symbol).DecRefs;
                taicpu(hp1).opcode := A_RET;
                taicpu(hp1).is_jmp := false;
                taicpu(hp1).ops := taicpu(hp2).ops;
                DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                case taicpu(hp2).ops of
                  0:
                    taicpu(hp1).clearop(0);
                  1:
                    taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                  else
                    internalerror(2016041302);
                end;
              end;
          end;
      end;
{$ifndef i8086}
    if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
      begin
        { check for
               jCC   xxx
               <several movs>
            xxx:
        }
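        { Illustrative example (not from the original source):
              jne  .L1                      cmovel %edx,%eax
              movl %edx,%eax         -->
           .L1:
          (the MOV only executes when the branch falls through, i.e. when the
           inverse condition E holds, so it becomes a CMOVE) }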
        l:=0;
        GetNextInstruction(p, hp1);
        while assigned(hp1) and
          CanBeCMOV(hp1) and
          { stop on labels }
          not(hp1.typ=ait_label) do
          begin
            inc(l);
            GetNextInstruction(hp1,hp1);
          end;
        if assigned(hp1) then
          begin
            if FindLabel(tasmlabel(symbol),hp1) then
              begin
                if (l<=4) and (l>0) then
                  begin
                    condition:=inverse_cond(taicpu(p).condition);
                    GetNextInstruction(p,hp1);
                    repeat
                      if not Assigned(hp1) then
                        InternalError(2018062900);
                      taicpu(hp1).opcode:=A_CMOVcc;
                      taicpu(hp1).condition:=condition;
                      UpdateUsedRegs(hp1);
                      GetNextInstruction(hp1,hp1);
                    until not(CanBeCMOV(hp1));
                    { Remember what hp1 is in case there are multiple aligns to get rid of }
                    hp2 := hp1;
                    repeat
                      if not Assigned(hp2) then
                        InternalError(2018062910);
                      case hp2.typ of
                        ait_label:
                          { What we expected - break out of the loop (it won't be a dead label at the top of
                            a cluster because that was optimised at an earlier stage) }
                          Break;
                        ait_align:
                          { Go to the next entry until a label is found (may be multiple aligns before it) }
                          begin
                            hp2 := tai(hp2.Next);
                            Continue;
                          end;
                        else
                          begin
                            { Might be a comment or temporary allocation entry }
                            if not (hp2.typ in SkipInstr) then
                              InternalError(2018062911);
                            hp2 := tai(hp2.Next);
                            Continue;
                          end;
                      end;
                    until False;
                    { Now we can safely decrement the reference count }
                    tasmlabel(symbol).decrefs;
                    DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
                    { Remove the original jump }
                    asml.Remove(p);
                    p.Free;
                    GetNextInstruction(hp2, p); { Instruction after the label }
                    { Remove the label if this is its final reference }
                    if (tasmlabel(symbol).getrefs=0) then
                      StripLabelFast(hp1);
                    if Assigned(p) then
                      begin
                        UpdateUsedRegs(p);
                        result:=true;
                      end;
                    exit;
                  end;
              end
            else
              begin
                { check further for
                       jCC   xxx
                       <several movs 1>
                       jmp   yyy
                    xxx:
                       <several movs 2>
                    yyy:
                }
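                { Illustrative example (not from the original source):
                       jne  .L1                     cmovel  %edx,%eax
                       movl %edx,%eax               cmovnel %ecx,%eax
                       jmp  .L2              -->
                    .L1:
                       movl %ecx,%eax
                    .L2:
                }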
                { hp2 points to jmp yyy }
                hp2:=hp1;
                { skip hp1 to xxx (or an align right before it) }
                GetNextInstruction(hp1, hp1);
                if assigned(hp2) and
                  assigned(hp1) and
                  (l<=3) and
                  (hp2.typ=ait_instruction) and
                  (taicpu(hp2).is_jmp) and
                  (taicpu(hp2).condition=C_None) and
                  { real label and jump, no further references to the
                    label are allowed }
                  (tasmlabel(symbol).getrefs=1) and
                  FindLabel(tasmlabel(symbol),hp1) then
                  begin
                    l:=0;
                    { skip hp1 to <several moves 2> }
                    if (hp1.typ = ait_align) then
                      GetNextInstruction(hp1, hp1);
                    GetNextInstruction(hp1, hpmov2);
                    hp1 := hpmov2;
                    while assigned(hp1) and
                      CanBeCMOV(hp1) do
                      begin
                        inc(l);
                        GetNextInstruction(hp1, hp1);
                      end;
                    { hp1 points to yyy (or an align right before it) }
                    hp3 := hp1;
                    if assigned(hp1) and
                      FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
                      begin
                        condition:=inverse_cond(taicpu(p).condition);
                        GetNextInstruction(p,hp1);
                        repeat
                          taicpu(hp1).opcode:=A_CMOVcc;
                          taicpu(hp1).condition:=condition;
                          UpdateUsedRegs(hp1);
                          GetNextInstruction(hp1,hp1);
                        until not(assigned(hp1)) or
                          not(CanBeCMOV(hp1));
                        condition:=inverse_cond(condition);
                        hp1 := hpmov2;
                        { hp1 is now at <several movs 2> }
                        while Assigned(hp1) and CanBeCMOV(hp1) do
                          begin
                            taicpu(hp1).opcode:=A_CMOVcc;
                            taicpu(hp1).condition:=condition;
                            UpdateUsedRegs(hp1);
                            GetNextInstruction(hp1,hp1);
                          end;
                        hp1 := p;
                        { Get first instruction after label }
                        GetNextInstruction(hp3, p);
                        if assigned(p) and (hp3.typ = ait_align) then
                          GetNextInstruction(p, p);
                        { Don't dereference yet, as doing so will cause
                          GetNextInstruction to skip the label and
                          optional align marker. [Kit] }
                        GetNextInstruction(hp2, hp4);
                        DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
                        { remove jCC }
                        asml.remove(hp1);
                        hp1.free;
                        { Now we can safely decrement it }
                        tasmlabel(symbol).decrefs;
                        { Remove label xxx (it will have a ref of zero due to the initial check) }
                        StripLabelFast(hp4);
                        { remove jmp }
                        symbol := taicpu(hp2).oper[0]^.ref^.symbol;
                        asml.remove(hp2);
                        hp2.free;
                        { As before, now we can safely decrement it }
                        tasmlabel(symbol).decrefs;
                        { Remove label yyy (and the optional alignment) if its reference falls to zero }
                        if tasmlabel(symbol).getrefs = 0 then
                          StripLabelFast(hp3);
                        if Assigned(p) then
                          begin
                            UpdateUsedRegs(p);
                            result:=true;
                          end;
                        exit;
                      end;
                  end;
              end;
          end;
      end;
{$endif i8086}
  end;
function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  var
    hp1,hp2: tai;
    reg_and_hp1_is_instr: Boolean;
  begin
    result:=false;
    reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
      GetNextInstruction(p,hp1) and
      (hp1.typ = ait_instruction);
    if reg_and_hp1_is_instr and
      (
        (taicpu(hp1).opcode <> A_LEA) or
        { If the LEA instruction can be converted into an arithmetic instruction,
          it may be possible to then fold it. }
        (
          { If the flags register is in use, don't change the instruction
            to an ADD otherwise this will scramble the flags. [Kit] }
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
          ConvertLEA(taicpu(hp1))
        )
      ) and
      IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_MOV,[]) and
      (taicpu(hp2).oper[0]^.typ = top_reg) and
      OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
      ((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B) or
       (taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W)) and
{$ifdef i386}
      { not all registers have byte size sub registers on i386 }
      ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
      (((taicpu(hp1).ops=2) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
       ((taicpu(hp1).ops=1) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
      not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
      begin
        { change   movsX/movzX    reg/ref, reg2
                   add/sub/or/... reg3/$const, reg2
                   mov            reg2, reg/ref
          to       add/sub/or/... reg3/$const, reg/ref }
        { by example:
            movswl %si,%eax        movswl %si,%eax      p
            decl   %eax            addl   %edx,%eax     hp1
            movw   %ax,%si         movw   %ax,%si       hp2
          ->
            movswl %si,%eax        movswl %si,%eax      p
            decw   %eax            addw   %edx,%eax     hp1
            movw   %ax,%si         movw   %ax,%si       hp2
        }
        taicpu(hp1).changeopsize(taicpu(hp2).opsize);
        {
          ->
            movswl %si,%eax        movswl %si,%eax      p
            decw   %si             addw   %dx,%si       hp1
            movw   %ax,%si         movw   %ax,%si       hp2
        }
        case taicpu(hp1).ops of
          1:
            taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
          2:
            begin
              taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
              if (taicpu(hp1).oper[0]^.typ = top_reg) then
                setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
            end;
          else
            internalerror(2008042701);
        end;
        {
          ->
            decw   %si             addw   %dx,%si       p
        }
        DebugMsg(SPeepholeOptimization + 'var3',p);
        asml.remove(p);
        asml.remove(hp2);
        p.free;
        hp2.free;
        p:=hp1;
      end
    else if reg_and_hp1_is_instr and
      (taicpu(hp1).opcode = A_MOV) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
{$ifdef x86_64}
       { check for implicit extension to 64 bit }
       or
       ((taicpu(p).opsize in [S_BL,S_WL]) and
        (taicpu(hp1).opsize=S_Q) and
        SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
       )
{$endif x86_64}
      )
      then
      begin
        { change
            movx %reg1,%reg2
            mov  %reg2,%reg3
            dealloc %reg2
          into
            movx %reg,%reg3
        }
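        { Illustrative 64-bit example (not from the original source):
            movzbl %al,%ecx             movzbl %al,%edx
            movq   %rcx,%rdx     -->
          (writing to a 32-bit register implicitly zero-extends into the
           upper half of the 64-bit register, so the MOV can be absorbed) }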
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
{$ifdef x86_64}
            if (taicpu(p).opsize in [S_BL,S_WL]) and
              (taicpu(hp1).opsize=S_Q) then
              taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
            else
{$endif x86_64}
              taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
            asml.remove(hp1);
            hp1.Free;
          end;
      end
    else if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (
        (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
      ) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      { Minimum shift value allowed is the bit difference between the sizes }
      (taicpu(hp1).oper[0]^.val >=
        { Multiply by 8 because tcgsize2size returns bytes, not bits }
        8 * (
          tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
          tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
        )
      ) then
      begin
        { For:
            movsx/movzx %reg1,%reg1   (same register, just different sizes)
            shl/sal     ##,%reg1
          Remove the movsx/movzx instruction if the shift overwrites the
          extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax)
        }
        DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
        RemoveCurrentP(p, hp1);
        Result := True;
        Exit;
      end
    else if taicpu(p).opcode=A_MOVZX then
      begin
        { removes superfluous And's after movzx's }
        if reg_and_hp1_is_instr and
          (taicpu(hp1).opcode = A_AND) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
          begin
            case taicpu(p).opsize Of
              S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'var4',p);
                    asml.remove(hp1);
                    hp1.free;
                  end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'var5',p);
                    asml.remove(hp1);
                    hp1.free;
                  end;
{$ifdef x86_64}
              S_LQ:
                if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                  begin
                    if (cs_asm_source in current_settings.globalswitches) then
                      asml.insertbefore(tai_comment.create(strpnew(SPeepholeOptimization + 'var6')),p);
                    asml.remove(hp1);
                    hp1.Free;
                  end;
{$endif x86_64}
              else
                ;
            end;
          end;
        { changes some movzx constructs to faster synonyms (all examples
          are given with eax/ax, but are also valid for other registers) }
        if MatchOpType(taicpu(p),top_reg,top_reg) then
          begin
            case taicpu(p).opsize of
              { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
                (the machine code is equivalent to movzbl %al,%eax), but the
                code generator still generates that assembler instruction and
                it is silently converted.  This should probably be checked.
                [Kit] }
              S_BW:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                    (
                      not IsMOVZXAcceptable
                      { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
                      or (
                        (cs_opt_size in current_settings.optimizerswitches) and
                        (taicpu(p).oper[1]^.reg = NR_AX)
                      )
                    ) then
                    { Change "movzbw %al, %ax" to "andw $0x0ffh, %ax" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var7',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_W);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                    GetNextInstruction(p, hp1) and
                    (tai(hp1).typ = ait_instruction) and
                    (taicpu(hp1).opcode = A_AND) and
                    MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbw %reg1, %reg2; andw $const, %reg2"
                      to "movw %reg1, reg2; andw $(const1 and $ff), %reg2" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var8',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_W);
                      setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax) }
              S_BL:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                    (
                      not IsMOVZXAcceptable
                      { and $0xff,%eax has a smaller encoding but risks a partial write penalty }
                      or (
                        (cs_opt_size in current_settings.optimizerswitches) and
                        (taicpu(p).oper[1]^.reg = NR_EAX)
                      )
                    ) then
                    { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var9',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_L);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                    GetNextInstruction(p, hp1) and
                    (tai(hp1).typ = ait_instruction) and
                    (taicpu(hp1).opcode = A_AND) and
                    MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbl %reg1, %reg2; andl $const, %reg2"
                      to "movl %reg1, reg2; andl $(const1 and $ff), %reg2" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var10',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_L);
                      { do not use R_SUBWHOLE
                        as movl %rdx,%eax
                        is invalid in assembler PM }
                      setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$endif i8086}
              S_WL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var11',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ffff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                      (tai(hp1).typ = ait_instruction) and
                      (taicpu(hp1).opcode = A_AND) and
                      (taicpu(hp1).oper[0]^.typ = top_const) and
                      (taicpu(hp1).oper[1]^.typ = top_reg) and
                      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzwl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var12',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Result := True;
                      end;
                  end;
              else
                InternalError(2017050705);
            end;
          end
        else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
          begin
            if GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              (taicpu(hp1).opcode = A_AND) and
              MatchOpType(taicpu(hp1),top_const,top_reg) and
              (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
              begin
                //taicpu(p).opcode := A_MOV;
                case taicpu(p).opsize Of
                  S_BL:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var13',p);
                      taicpu(hp1).changeopsize(S_L);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                    end;
                  S_WL:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var14',p);
                      taicpu(hp1).changeopsize(S_L);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                    end;
                  S_BW:
                    begin
                      DebugMsg(SPeepholeOptimization + 'var15',p);
                      taicpu(hp1).changeopsize(S_W);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                    end;
                  else
                    Internalerror(2017050704)
                end;
                Result := True;
              end;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
  var
    hp1 : tai;
    MaskLength : Cardinal;
  begin
    Result:=false;
    if GetNextInstruction(p, hp1) then
      begin
        if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_AND,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
          { the second register must contain the first one, so compare their subreg types }
          (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
          (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
          { change
              and const1, reg
              and const2, reg
            to
              and (const1 and const2), reg }
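          { Worked example (illustrative, not from the original source):
              andl $0xff00,%eax              andl $0x0f00,%eax
              andl $0x0ff0,%eax      -->
            since $ff00 and $0ff0 = $0f00 }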
          begin
            taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
            DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
            asml.remove(p);
            p.Free;
            p:=hp1;
            Result:=true;
            exit;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_MOVZX,[]) and
          (taicpu(hp1).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
          (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
          (((taicpu(p).opsize=S_W) and
            (taicpu(hp1).opsize=S_BW)) or
           ((taicpu(p).opsize=S_L) and
            (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
           or
           ((taicpu(p).opsize=S_Q) and
            (taicpu(hp1).opsize in [S_BQ,S_WQ]))
{$endif x86_64}
          ) then
          begin
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
            then
              begin
                { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                  32-bit register to a 64-bit register, or even a version called MOVZXD, so
                  code that tests for the presence of AND 0xffffffff followed by MOVZX is
                  wasted, and would be indicative of a compiler bug if it were ever
                  triggered. [Kit]
                  NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                }
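                { Illustrative example (not from the original source):
                    andl   $0x7f,%eax              andl $0x7f,%eax
                    movzbl %al,%eax        -->
                  (the AND has already cleared every bit that the MOVZX
                   would clear) }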
                DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                asml.remove(hp1);
                hp1.free;
                Exit;
              end;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_SHL,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
          begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
            { get length of potential and mask, i.e. the number of
              significant bits in the constant }
            MaskLength:=BsrQWord(taicpu(p).oper[0]^.val)+1;
            { really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
            if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
              { unmasked part shifted out? }
              ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
  5426. begin
  5427. DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
  5428. RemoveCurrentP(p, hp1);
  5429. Result:=true;
  5430. exit;
  5431. end;
  5432. end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_MOVSX{$ifdef x86_64},A_MOVSXD{$endif x86_64},[]) and
          (taicpu(hp1).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
          (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
          (((taicpu(p).opsize=S_W) and
            (taicpu(hp1).opsize=S_BW)) or
           ((taicpu(p).opsize=S_L) and
            (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
           or
           ((taicpu(p).opsize=S_Q) and
            (taicpu(hp1).opsize in [S_BQ,S_WQ,S_LQ]))
{$endif x86_64}
          ) then
          begin
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7f)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7fff)=taicpu(p).oper[0]^.val))
{$ifdef x86_64}
               or
               (((taicpu(hp1).opsize)=S_LQ) and
                ((taicpu(p).oper[0]^.val and $7fffffff)=taicpu(p).oper[0]^.val)
               )
{$endif x86_64}
            then
              begin
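                { e.g. (illustrative) in "andl $0x7f,%eax; movsbl %al,%eax"
                  the mask clears the sign bit of the low byte, so sign
                  extension cannot change the value and the movsbl can be
                  removed }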
                DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                asml.remove(hp1);
                hp1.free;
                Exit;
              end;
          end
        else if (taicpu(p).oper[1]^.typ = top_reg) and
          (hp1.typ = ait_instruction) and
          (taicpu(hp1).is_jmp) and
          (taicpu(hp1).opcode<>A_JMP) and
          not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
          begin
            { change
                and x, reg
                jxx
              to
                test x, reg
                jxx
              if reg is deallocated before the jump, but only if it's a
              conditional jump (PFV)
            }
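            { TEST computes the same AND result for the flags but does not
              write the register, so the dead write to reg is dropped }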
            taicpu(p).opcode := A_TEST;
            Exit;
          end;
      end;
    { Lone AND tests }
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        {
          - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
          - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
          - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
        }
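        { the register-register form sets the same flags and leaves the
          value unchanged, but needs no immediate operand, so it encodes
          shorter }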
        if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
          ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
          ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
          begin
            taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
            if taicpu(p).opsize = S_L then
              begin
                Include(OptsToCheck,aoc_MovAnd2Mov_3);
                Result := True;
              end;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  begin
    Result:=false;
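    { a sketch of the two cases handled below:
        lea (%reg1,%reg2),%reg1 -> add %reg2,%reg1
        lea (%reg1,%reg2),%reg2 -> add %reg1,%reg2
      this is only safe while the flags are not live, because lea leaves
      them untouched but add rewrites them }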
    if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
      MatchReference(taicpu(p).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
      (taicpu(p).oper[0]^.ref^.index<>NR_NO) then
      begin
        taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.base);
        taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.index);
        taicpu(p).opcode:=A_ADD;
        DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
        result:=true;
      end
    else if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
      MatchReference(taicpu(p).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) and
      (taicpu(p).oper[0]^.ref^.base<>NR_NO) then
      begin
        taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
        taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.base);
        taicpu(p).opcode:=A_ADD;
        DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
        result:=true;
      end;
  end;
function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
  var
    hp1: tai;
    NewRef: TReference;
  begin
    { Change:
        subl/q $x,%reg1
        movl/q %reg1,%reg2
      To:
        leal/q $-x(%reg1),%reg2
        subl/q $x,%reg1
      This breaks the dependency chain and potentially permits the removal
      of a CMP instruction if one follows.
    }
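    { e.g. (illustrative, x86_64):
        subq $8,%rax            leaq -8(%rax),%rdx
        movq %rax,%rdx     ->   subq $8,%rax }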
    Result := False;
    if not (cs_opt_size in current_settings.optimizerswitches) and
      (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
      MatchOpType(taicpu(p),top_const,top_reg) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
      begin
        { Change the MOV instruction to a LEA instruction, and update the
          first operand }
        reference_reset(NewRef, 1, []);
        NewRef.base := taicpu(p).oper[1]^.reg;
        NewRef.scalefactor := 1;
        NewRef.offset := -taicpu(p).oper[0]^.val;
        taicpu(hp1).opcode := A_LEA;
        taicpu(hp1).loadref(0, NewRef);
        { Move what is now the LEA instruction to before the SUB instruction }
        Asml.Remove(hp1);
        Asml.InsertBefore(hp1, p);
        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
        DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
        Result := True;
      end;
  end;
function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
  begin
    { we can skip all instructions not messing with the stack pointer }
    while assigned(hp1) and
      {MatchInstruction(taicpu(hp1),[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
         A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
         A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
         A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
      ({(taicpu(hp1).ops=0) or }
       ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
          (MatchOpType(taicpu(hp1),top_ref,top_reg))
         ) and }
        not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
       )
      ) do
      GetNextInstruction(hp1,hp1);
    Result:=assigned(hp1);
  end;
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
    { replace
        leal(q) x(<stackpointer>),<stackpointer>
        call procname
        leal(q) -x(<stackpointer>),<stackpointer>
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_ref,top_reg) and
      (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and
      { the -8 or -24 are not required, but bail out early if possible,
        higher values are unlikely }
      ((taicpu(p).oper[0]^.ref^.offset=-8) or
       (taicpu(p).oper[0]^.ref^.offset=-24)) and
      (taicpu(p).oper[0]^.ref^.symbol=nil) and
      (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_ref,top_reg) and
      (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
      (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
      (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
        RemoveCurrentP(p, hp4);
        AsmL.Remove(hp2);
        hp2.free;
        AsmL.Remove(hp3);
        hp3.free;
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
{$ifdef x86_64}
    { replace
        push %rax
        call procname
        pop %rcx
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces.
      It relies on the fact that the push rax/pop rcx sequence is used only
      for stack alignment, since rcx is volatile in all supported calling
      conventions.
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_reg) and
      (taicpu(p).oper[0]^.reg=NR_RAX) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_reg) and
      (taicpu(hp2).oper[0]^.reg=NR_RCX) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
        RemoveCurrentP(p, hp4);
        AsmL.Remove(hp2);
        hp2.free;
        AsmL.Remove(hp3);
        hp3.free;
        Result:=true;
      end;
{$endif x86_64}
  end;
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
  var
    Value, RegName: string;
  begin
    Result:=false;
    if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
      begin
        case taicpu(p).oper[0]^.val of
          0:
            { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
            if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $0,%reg" into "xor %reg,%reg" }
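                { besides needing no immediate operand (and thus encoding
                  shorter), xor of a register with itself is a recognised
                  zeroing idiom that breaks dependencies on the old value }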
                taicpu(p).opcode := A_XOR;
                taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
          $1..$FFFFFFFF:
            begin
              { Code size reduction by J. Gareth "Kit" Moreton }
              { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
              case taicpu(p).opsize of
                S_Q:
                  begin
                    RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                    Value := debug_tostr(taicpu(p).oper[0]^.val);
                    { The actual optimization }
                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                    taicpu(p).changeopsize(S_L);
                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                    Result := True;
                  end;
                else
                  { Do nothing };
              end;
            end;
          -1:
            { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
            if (cs_opt_size in current_settings.optimizerswitches) and
              (taicpu(p).opsize <> S_B) and
              not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $-1,%reg" into "or $-1,%reg" }
                { NOTES:
                  - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                  - This operation creates a false dependency on the register, so only do it when optimising for size
                  - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                }
                taicpu(p).opcode := A_OR;
                Result := True;
              end;
        end;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  begin
    Result := False;
    if not MatchOpType(taicpu(p), top_reg, top_reg) then
      Exit;
    { Convert:
        movswl %ax,%eax  -> cwtl
        movslq %eax,%rax -> cdqe

      NOTE: Don't convert movsbw %al,%ax to cbw, because cbw and cwde
        refer to the same opcode; which one is meant depends only on the
        assembler's current operand-size attribute. [Kit]
    }
    with taicpu(p) do
      case opsize of
        S_WL:
          if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
            begin
              DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
              opcode := A_CWDE;
              clearop(0);
              clearop(1);
              ops := 0;
              Result := True;
            end;
{$ifdef x86_64}
        S_LQ:
          if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
            begin
              DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
              opcode := A_CDQE;
              clearop(0);
              clearop(1);
              ops := 0;
              Result := True;
            end;
{$endif x86_64}
        else
          ;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  begin
    Result:=false;
    { change "cmp $0, %reg" to "test %reg, %reg" }
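    { both set ZF/SF identically (and clear CF/OF) when comparing against
      zero, and test needs no immediate operand, so it encodes shorter }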
    if MatchOpType(taicpu(p),top_const,top_reg) and
      (taicpu(p).oper[0]^.val = 0) then
      begin
        taicpu(p).opcode := A_TEST;
        taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
  var
    IsTestConstX : Boolean;
    hp1,hp2 : tai;
  begin
    Result:=false;
    { removes the line marked with (x) from the sequence
        and/or/xor/add/sub/... $x, %y
        test/or %y, %y | test $-1, %y    (x)
        j(n)z _Label
      as the first instruction already adjusts the ZF;
      the %y operand may also be a reference }
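    { e.g. (illustrative):
        andl $3,%eax            andl $3,%eax
        testl %eax,%eax    ->
        jz .Lskip               jz .Lskip }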
    IsTestConstX:=(taicpu(p).opcode=A_TEST) and
      MatchOperand(taicpu(p).oper[0]^,-1);
    if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
      GetLastInstruction(p, hp1) and
      (tai(hp1).typ = ait_instruction) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
      case taicpu(hp1).opcode Of
        A_ADD, A_SUB, A_OR, A_XOR, A_AND:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
               ((taicpu(hp1).opcode <> A_ADD) and
                (taicpu(hp1).opcode <> A_SUB))) then
              begin
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end;
        A_SHL, A_SAL, A_SHR, A_SAR:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { SHL/SAL/SHR/SAR with a value of 0 do not change the flags,
                therefore it's only safe to do this optimization for
                shifts by a (nonzero) constant }
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (taicpu(hp1).oper[0]^.val <> 0) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end;
        A_DEC, A_INC, A_NEG:
          begin
            if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                case taicpu(hp1).opcode of
                  A_DEC, A_INC:
                    { replace inc/dec with add/sub 1, because inc/dec doesn't set the carry flag }
                    begin
                      case taicpu(hp1).opcode Of
                        A_DEC: taicpu(hp1).opcode := A_SUB;
                        A_INC: taicpu(hp1).opcode := A_ADD;
                        else
                          ;
                      end;
                      taicpu(hp1).loadoper(1,taicpu(hp1).oper[0]^);
                      taicpu(hp1).loadConst(0,1);
                      taicpu(hp1).ops:=2;
                    end;
                  else
                    ;
                end;
                hp1 := tai(p.next);
                asml.remove(p);
                p.free;
                p := tai(hp1);
                Result:=true;
              end;
          end
        else
          { change "test $-1,%reg" into "test %reg,%reg" }
          if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
            taicpu(p).loadoper(0,taicpu(p).oper[1]^);
      end { case }
    { change "test $-1,%reg" into "test %reg,%reg" }
    else if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
      taicpu(p).loadoper(0,taicpu(p).oper[1]^);
  end;
function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
  var
    hp1 : tai;
{$ifndef x86_64}
    hp2 : taicpu;
{$endif x86_64}
  begin
    Result:=false;
{$ifndef x86_64}
    { don't do this on modern CPUs, this really hurts them due to
      broken call/ret pairing }
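    { replace
        call procname
        jmp  somewhere
      by
        push somewhere
        jmp  procname
      so the ret inside procname continues directly at "somewhere" }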
    if (current_settings.optimizecputype < cpu_Pentium2) and
      not(cs_create_pic in current_settings.moduleswitches) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_JMP,[S_NO]) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
      begin
        hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
        InsertLLItem(p.previous, p, hp2);
        taicpu(p).opcode := A_JMP;
        taicpu(p).is_jmp := true;
        asml.remove(hp1);
        hp1.free;
        Result:=true;
      end
    else
{$endif x86_64}
    { replace
        call procname
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces;
      if instead the subroutine is marked as no return, only the ret is
      removed }
    if ((cs_opt_level4 in current_settings.optimizerswitches) or
      (po_noreturn in current_procinfo.procdef.procoptions)) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_RET,[S_NO]) and
      (taicpu(hp1).ops=0) then
      begin
        if (cs_opt_level4 in current_settings.optimizerswitches) and
          { we might destroy stack alignment here if we do not do a call }
          (target_info.stackalign<=sizeof(SizeUInt)) then
          begin
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
          end
        else
          DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
        asml.remove(hp1);
        hp1.free;
        Result:=true;
      end;
  end;
{$ifdef x86_64}
function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
  var
    PreMessage: string;
  begin
    Result := False;
    { Code size reduction by J. Gareth "Kit" Moreton }
    { Convert MOVZBQ and MOVZWQ to MOVZBL and MOVZWL respectively if it removes the REX prefix }
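    { e.g. (illustrative) "movzbq %cl,%rbx" -> "movzbl %cl,%ebx": writing
      the 32-bit register implicitly zeroes the upper 32 bits, so the result
      is identical and the REX.W prefix can be dropped }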
    if (taicpu(p).opsize in [S_BQ, S_WQ]) and
      (getsupreg(taicpu(p).oper[1]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP]) then
      begin
        { Has 64-bit register name and opcode suffix }
        PreMessage := 'movz' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' -> movz';
        { The actual optimization }
        setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
        if taicpu(p).opsize = S_BQ then
          taicpu(p).changeopsize(S_BL)
        else
          taicpu(p).changeopsize(S_WL);
        DebugMsg(SPeepholeOptimization + PreMessage +
          debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (removes REX prefix)', p);
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  var
    PreMessage, RegName: string;
  begin
    { Code size reduction by J. Gareth "Kit" Moreton }
    { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
      as this removes the REX prefix }
    Result := False;
    if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
      Exit;
    if taicpu(p).oper[0]^.typ <> top_reg then
      { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
      InternalError(2018011500);
    case taicpu(p).opsize of
      S_Q:
        begin
          if (getsupreg(taicpu(p).oper[0]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP]) then
            begin
              RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
              PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
              { The actual optimization }
              setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
              setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
              taicpu(p).changeopsize(S_L);
              RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
              DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (removes REX prefix)', p);
            end;
        end;
      else
        ;
    end;
  end;
{$endif x86_64}
class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  var
    OperIdx: Integer;
  begin
    for OperIdx := 0 to p.ops - 1 do
      if p.oper[OperIdx]^.typ = top_ref then
        optimize_ref(p.oper[OperIdx]^.ref^, False);
  end;

end.