aoptx86.pas 475 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
102211023110241102511026110271102811029110301103111032110331103411035110361103711038110391104011041110421104311044110451104611047110481104911050110511105211053110541105511056110571105811059110601106111062110631106411065110661106711068110691107011071110721107311074110751107611077
  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe
  3. This unit contains the peephole optimizer.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit aoptx86;
  18. {$i fpcdefs.inc}
  19. {$ifdef EXTDEBUG}
  20. {$define DEBUG_AOPTCPU}
  21. {$endif EXTDEBUG}
  22. interface
  23. uses
  24. globtype,
  25. cpubase,
  26. aasmtai,aasmcpu,
  27. cgbase,cgutils,
  28. aopt,aoptobj;
type
  { Flags set by the pre-optimisation pass to mark expensive peephole
    checks as worth performing in the later passes }
  TOptsToCheck = (
    aoc_MovAnd2Mov_3
  );

  { x86-specific peephole optimizer, specialising the generic TAsmOptimizer }
  TX86AsmOptimizer = class(TAsmOptimizer)
    { some optimizations are very expensive to check, so the
      pre opt pass can be used to set some flags, depending on the found
      instructions if it is worth to check a certain optimization }
    OptsToCheck : set of TOptsToCheck;
    function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
    function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
    function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
    function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
    { This version of GetNextInstructionUsingReg will look across conditional jumps,
      potentially allowing further optimisation (although it might need to know if
      it crossed a conditional jump. }
    function GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean;
    {
      In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
      the use of a register by allocs/dealloc, so it can ignore calls.
      In the following example, GetNextInstructionUsingReg will return the second movq,
      GetNextInstructionUsingRegTrackingUse won't.
        movq %rdi,%rax
        # Register rdi released
        # Register rdi allocated
        movq %rax,%rdi
      While in this example:
        movq %rdi,%rax
        call proc
        movq %rdi,%rax
      GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
      won't.
    }
    function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
  private
    function SkipSimpleInstructions(var hp1: tai): Boolean;
  protected
    class function IsMOVZXAcceptable: Boolean; static; inline;
    { Attempts to allocate a volatile integer register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
    { Attempts to allocate a volatile MM register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
    { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
    function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
    { checks whether reading the value in reg1 depends on the value of reg2. This
      is very similar to SuperRegisterEquals, except it takes into account that
      R_SUBH and R_SUBL are independendent (e.g. reading from AL does not
      depend on the value in AH). }
    function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
    { Replaces all references to AOldReg in a memory reference to ANewReg }
    class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an operand to ANewReg }
    class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Replaces all references to AOldReg in an instruction to ANewReg,
      except where the register is being written }
    class function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean; static;
    { Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
      or writes to a global symbol }
    class function IsRefSafe(const ref: PReference): Boolean; static;
    { Returns true if the given MOV instruction can be safely converted to CMOV }
    class function CanBeCMOV(p : tai) : boolean; static;
    { Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
      conversion was successful }
    function ConvertLEA(const p : taicpu): Boolean;
    function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    procedure DebugMsg(const s : string; p : tai);inline;
    class function IsExitCode(p : tai) : boolean; static;
    class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
    procedure RemoveLastDeallocForFuncRes(p : tai);
    function DoSubAddOpt(var p : tai) : Boolean;
    { pre-peephole optimisations, applied before the numbered passes }
    function PrePeepholeOptSxx(var p : tai) : boolean;
    function PrePeepholeOptIMUL(var p : tai) : boolean;
    function PrePeepholeOptAND(var p : tai) : boolean;
    { pass-1 optimisations, dispatched by opcode of p }
    function OptPass1Test(var p: tai): boolean;
    function OptPass1Add(var p: tai): boolean;
    function OptPass1AND(var p : tai) : boolean;
    function OptPass1_V_MOVAP(var p : tai) : boolean;
    function OptPass1VOP(var p : tai) : boolean;
    function OptPass1MOV(var p : tai) : boolean;
    function OptPass1Movx(var p : tai) : boolean;
    function OptPass1MOVXX(var p : tai) : boolean;
    function OptPass1OP(var p : tai) : boolean;
    function OptPass1LEA(var p : tai) : boolean;
    function OptPass1Sub(var p : tai) : boolean;
    function OptPass1SHLSAL(var p : tai) : boolean;
    function OptPass1FSTP(var p : tai) : boolean;
    function OptPass1FLD(var p : tai) : boolean;
    function OptPass1Cmp(var p : tai) : boolean;
    function OptPass1PXor(var p : tai) : boolean;
    function OptPass1VPXor(var p: tai): boolean;
    function OptPass1Imul(var p : tai) : boolean;
    function OptPass1Jcc(var p : tai) : boolean;
    function OptPass1SHXX(var p: tai): boolean;
    function OptPass1VMOVDQ(var p: tai): Boolean;
    function OptPass1_V_Cvtss2sd(var p: tai): boolean;
    { pass-2 optimisations }
    function OptPass2Movx(var p : tai): Boolean;
    function OptPass2MOV(var p : tai) : boolean;
    function OptPass2Imul(var p : tai) : boolean;
    function OptPass2Jmp(var p : tai) : boolean;
    function OptPass2Jcc(var p : tai) : boolean;
    function OptPass2Lea(var p: tai): Boolean;
    function OptPass2SUB(var p: tai): Boolean;
    function OptPass2ADD(var p : tai): Boolean;
    function OptPass2SETcc(var p : tai) : boolean;
    function CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
    { post-peephole optimisations, applied after the numbered passes }
    function PostPeepholeOptMov(var p : tai) : Boolean;
    function PostPeepholeOptMovzx(var p : tai) : Boolean;
{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
    function PostPeepholeOptXor(var p : tai) : Boolean;
{$endif x86_64}
    function PostPeepholeOptAnd(var p : tai) : boolean;
    function PostPeepholeOptMOVSX(var p : tai) : boolean;
    function PostPeepholeOptCmp(var p : tai) : Boolean;
    function PostPeepholeOptTestOr(var p : tai) : Boolean;
    function PostPeepholeOptCall(var p : tai) : Boolean;
    function PostPeepholeOptLea(var p : tai) : Boolean;
    function PostPeepholeOptPush(var p: tai): Boolean;
    function PostPeepholeOptShr(var p : tai) : boolean;
    function PostPeepholeOptADDSUB(var p : tai) : Boolean;
    function PostPeepholeOptVPXOR(var p: tai): Boolean;
    procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
    function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
    function TrySwapMovCmp(var p, hp1: tai): Boolean;
    { Processor-dependent reference optimisation }
    class procedure OptimizeRefs(var p: taicpu); static;
  end;

{ True if instr is an instruction with one of the given opcodes and a size
  contained in opsize (an empty opsize set matches any size) }
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;
{ True if the operand matches the given register / constant / other operand }
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{$if max_operands>2}
function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
{$endif max_operands>2}
function RefsEqual(const r1, r2: treference): boolean;
function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
{ returns true, if ref is a reference using only the registers passed as base and index
  and having an offset }
function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  176. implementation
  177. uses
  178. cutils,verbose,
  179. systems,
  180. globals,
  181. cpuinfo,
  182. procinfo,
  183. paramgr,
  184. aasmbase,
  185. aoptbase,aoptutils,
  186. symconst,symsym,
  187. cgx86,
  188. itcpugas;
{$ifdef DEBUG_AOPTCPU}
  { prefix prepended to every debug message emitted by DebugMsg }
  const
    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
  { Empty strings help the optimizer to remove string concatenations that won't
    ever appear to the user on release builds. [Kit] }
  const
    SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  { step size used when scanning instruction lists - TODO confirm against the
    code further down the file that consumes this constant }
  LIST_STEP_SIZE = 4;
  199. function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  200. begin
  201. result :=
  202. (instr.typ = ait_instruction) and
  203. (taicpu(instr).opcode = op) and
  204. ((opsize = []) or (taicpu(instr).opsize in opsize));
  205. end;
  206. function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  207. begin
  208. result :=
  209. (instr.typ = ait_instruction) and
  210. ((taicpu(instr).opcode = op1) or
  211. (taicpu(instr).opcode = op2)
  212. ) and
  213. ((opsize = []) or (taicpu(instr).opsize in opsize));
  214. end;
  215. function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  216. begin
  217. result :=
  218. (instr.typ = ait_instruction) and
  219. ((taicpu(instr).opcode = op1) or
  220. (taicpu(instr).opcode = op2) or
  221. (taicpu(instr).opcode = op3)
  222. ) and
  223. ((opsize = []) or (taicpu(instr).opsize in opsize));
  224. end;
  225. function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
  226. const opsize : topsizes) : boolean;
  227. var
  228. op : TAsmOp;
  229. begin
  230. result:=false;
  231. if (instr.typ <> ait_instruction) or
  232. ((opsize <> []) and not(taicpu(instr).opsize in opsize)) then
  233. exit;
  234. for op in ops do
  235. begin
  236. if taicpu(instr).opcode = op then
  237. begin
  238. result:=true;
  239. exit;
  240. end;
  241. end;
  242. end;
  243. function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  244. begin
  245. result := (oper.typ = top_reg) and (oper.reg = reg);
  246. end;
  247. function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  248. begin
  249. result := (oper.typ = top_const) and (oper.val = a);
  250. end;
  251. function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
  252. begin
  253. result := oper1.typ = oper2.typ;
  254. if result then
  255. case oper1.typ of
  256. top_const:
  257. Result:=oper1.val = oper2.val;
  258. top_reg:
  259. Result:=oper1.reg = oper2.reg;
  260. top_ref:
  261. Result:=RefsEqual(oper1.ref^, oper2.ref^);
  262. else
  263. internalerror(2013102801);
  264. end
  265. end;
  266. function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
  267. begin
  268. result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);
  269. if result then
  270. case oper1.typ of
  271. top_const:
  272. Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
  273. top_reg:
  274. Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
  275. top_ref:
  276. Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
  277. else
  278. internalerror(2020052401);
  279. end
  280. end;
  281. function RefsEqual(const r1, r2: treference): boolean;
  282. begin
  283. RefsEqual :=
  284. (r1.offset = r2.offset) and
  285. (r1.segment = r2.segment) and (r1.base = r2.base) and
  286. (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
  287. (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
  288. (r1.relsymbol = r2.relsymbol) and
  289. (r1.volatility=[]) and
  290. (r2.volatility=[]);
  291. end;
  292. function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  293. begin
  294. Result:=(ref.offset=0) and
  295. (ref.scalefactor in [0,1]) and
  296. (ref.segment=NR_NO) and
  297. (ref.symbol=nil) and
  298. (ref.relsymbol=nil) and
  299. ((base=NR_INVALID) or
  300. (ref.base=base)) and
  301. ((index=NR_INVALID) or
  302. (ref.index=index)) and
  303. (ref.volatility=[]);
  304. end;
  305. function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  306. begin
  307. Result:=(ref.scalefactor in [0,1]) and
  308. (ref.segment=NR_NO) and
  309. (ref.symbol=nil) and
  310. (ref.relsymbol=nil) and
  311. ((base=NR_INVALID) or
  312. (ref.base=base)) and
  313. ((index=NR_INVALID) or
  314. (ref.index=index)) and
  315. (ref.volatility=[]);
  316. end;
  317. function InstrReadsFlags(p: tai): boolean;
  318. begin
  319. InstrReadsFlags := true;
  320. case p.typ of
  321. ait_instruction:
  322. if InsProp[taicpu(p).opcode].Ch*
  323. [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
  324. Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
  325. Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
  326. exit;
  327. ait_label:
  328. exit;
  329. else
  330. ;
  331. end;
  332. InstrReadsFlags := false;
  333. end;
  334. function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
  335. begin
  336. Next:=Current;
  337. repeat
  338. Result:=GetNextInstruction(Next,Next);
  339. until not (Result) or
  340. not(cs_opt_level3 in current_settings.optimizerswitches) or
  341. (Next.typ<>ait_instruction) or
  342. RegInInstruction(reg,Next) or
  343. is_calljmp(taicpu(Next).opcode);
  344. end;
  345. function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean;
  346. begin
  347. { Note, CrossJump keeps its input value if a conditional jump is not found - it doesn't get set to False }
  348. Next := Current;
  349. repeat
  350. Result := GetNextInstruction(Next,Next);
  351. if Result and (Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) then
  352. if is_calljmpuncondret(taicpu(Next).opcode) then
  353. begin
  354. Result := False;
  355. Exit;
  356. end
  357. else
  358. CrossJump := True;
  359. until not Result or
  360. not (cs_opt_level3 in current_settings.optimizerswitches) or
  361. (Next.typ <> ait_instruction) or
  362. RegInInstruction(reg,Next);
  363. end;
{ Walks forward from Current looking for an instruction (other than a CALL)
  that uses reg. Unlike GetNextInstructionUsingReg, the search also stops
  at regalloc markers for reg's super-register and at non-skippable labels,
  which lets it step over calls safely (see the comment in the interface
  section). Returns True with Next set when such an instruction is found. }
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { below -O3 this degenerates to a plain single-step lookup }
    if not(cs_opt_level3 in current_settings.optimizerswitches) then
      begin
        Result:=GetNextInstruction(Current,Next);
        exit;
      end;
    Next:=tai(Current.Next);
    Result:=false;
    while assigned(Next) do
      begin
        { barriers: any call/jump except A_CALL, an alloc/dealloc marker for
          reg's super-register, or a label that cannot be skipped }
        if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
          ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
          ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
          exit
        { a non-CALL instruction that mentions reg is the hit we want }
        else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
          begin
            Result:=true;
            exit;
          end;
        Next:=tai(Next.Next);
      end;
  end;
{ Override of the generic optimizer hook: on x86 "loads from reg" is
  answered by the detailed per-opcode analysis in RegReadByInstruction. }
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
  begin
    Result:=RegReadByInstruction(reg,hp);
  end;
{ Returns True if instruction hp reads register reg - either through an
  explicit operand, through a memory reference, or implicitly (the fixed
  registers of MUL/DIV etc., or individual FLAGS bits via the instruction
  property table insprop). }
function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
  var
    p: taicpu;
    opcount: longint;
  begin
    RegReadByInstruction := false;
    { Only real instructions read registers. }
    if hp.typ <> ait_instruction then
      exit;
    p := taicpu(hp);
    case p.opcode of
      A_CALL:
        { Conservative: assume a call reads everything. }
        regreadbyinstruction := true;
      A_IMUL:
        case p.ops of
          1:
            { One-operand IMUL implicitly reads the accumulator; for byte
              size the implicit source is AL, so AH (R_SUBH) is excluded. }
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
              (
                ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
                ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
              );
          2,3:
            { Two/three-operand IMUL only reads the first two operands
              (the last operand of the 3-op form is destination-only). }
            regReadByInstruction :=
              reginop(reg,p.oper[0]^) or
              reginop(reg,p.oper[1]^);
          else
            InternalError(2019112801);
        end;
      A_MUL:
        begin
          { MUL implicitly reads the accumulator (AH excluded for S_B,
            as with one-operand IMUL above). }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
              ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
              ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
            );
        end;
      A_IDIV,A_DIV:
        begin
          { Division reads the dividend from EDX:EAX; for byte size the
            dividend is AX only, hence EDX is excluded when opsize=S_B. }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
              (getregtype(reg)=R_INTREGISTER) and
              (
                (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
              )
            );
        end;
      else
        begin
          { LEA only computes an address; a segment register attached to
            the reference is not actually read. }
          if (p.opcode=A_LEA) and is_segment_reg(reg) then
            begin
              RegReadByInstruction := false;
              exit;
            end;
          { Any register appearing inside a memory reference is read. }
          for opcount := 0 to p.ops-1 do
            if (p.oper[opCount]^.typ = top_ref) and
               RegInRef(reg,p.oper[opcount]^.ref^) then
              begin
                RegReadByInstruction := true;
                exit
              end;
          { special handling for SSE MOVSD }
          if (p.opcode=A_MOVSD) and (p.ops>0) then
            begin
              if p.ops<>2 then
                internalerror(2017042702);
              { Register-to-register MOVSD merges into the low half of the
                destination, so the destination register is read as well. }
              regReadByInstruction := reginop(reg,p.oper[0]^) or
                (
                  (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
                );
              exit;
            end;
          { Fall back to the per-opcode change information (x86ins.dat). }
          with insprop[p.opcode] do
            begin
              case getregtype(reg) of
                R_INTREGISTER:
                  begin
                    { Implicit reads of the fixed integer registers. }
                    case getsupreg(reg) of
                      RS_EAX:
                        if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ECX:
                        if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDX:
                        if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBX:
                        if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESP:
                        if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EBP:
                        if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_ESI:
                        if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                      RS_EDI:
                        if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                    end;
                  end;
                R_MMREGISTER:
                  begin
                    { Only XMM0 can be an implicit operand (e.g. blend
                      instructions). }
                    case getsupreg(reg) of
                      RS_XMM0:
                        if [Ch_RXMM0,Ch_RWXMM0,Ch_MXMM0]*Ch<>[] then
                          begin
                            RegReadByInstruction := true;
                            exit
                          end;
                    end;
                  end;
                else
                  ;
              end;
              if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
                begin
                  { For conditional instructions, only the flag bits that
                    the condition code actually tests count as read -
                    unless the whole flags register was asked about. }
                  if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                    begin
                      case p.condition of
                        C_A,C_NBE, { CF=0 and ZF=0 }
                        C_BE,C_NA: { CF=1 or ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                        C_AE,C_NB,C_NC, { CF=0 }
                        C_B,C_NAE,C_C: { CF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                        C_NE,C_NZ, { ZF=0 }
                        C_E,C_Z: { ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                        C_G,C_NLE, { ZF=0 and SF=OF }
                        C_LE,C_NG: { ZF=1 or SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_GE,C_NL, { SF=OF }
                        C_L,C_NGE: { SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_NO, { OF=0 }
                        C_O: { OF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                        C_NP,C_PO, { PF=0 }
                        C_P,C_PE: { PF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                        C_NS, { SF=0 }
                        C_S: { SF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                        else
                          internalerror(2017042701);
                      end;
                      if RegReadByInstruction then
                        exit;
                    end;
                  { Non-conditional flag reads, per individual flag bit. }
                  case getsubreg(reg) of
                    R_SUBW,R_SUBD,R_SUBQ:
                      RegReadByInstruction :=
                        [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                         Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                         Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                    R_SUBFLAGCARRY:
                      RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGPARITY:
                      RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGAUXILIARY:
                      RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGZERO:
                      RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGSIGN:
                      RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGOVERFLOW:
                      RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGINTERRUPT:
                      RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGDIRECTION:
                      RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    else
                      internalerror(2017042601);
                  end;
                  exit;
                end;
              { e.g. "xor %reg,%reg" does not really depend on %reg's old
                value even though %reg appears as a source operand. }
              if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                 (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                 (p.oper[0]^.reg=p.oper[1]^.reg) then
                exit;
              { Finally, explicit operands the table marks as read (R),
                read-write (RW) or modified (M). }
              if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
            end;
        end;
    end;
  end;
{ Returns True if instruction p1 uses register reg in any way - read,
  write or modify, explicitly or implicitly (including individual FLAGS
  bits and implicit XMM0/FPU usage). }
function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
  begin
    result:=false;
    if p1.typ<>ait_instruction then
      exit;
    { Ch_All: the instruction may touch any register at all. }
    if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    if (getregtype(reg)=R_INTREGISTER) and
       { change information for xmm movsd are not correct }
       ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
      begin
        { Implicit use of a fixed integer register, in any access mode
          (R/W/RW/M), for either the 32- or 64-bit name. }
        case getsupreg(reg) of
          { RS_EAX = RS_RAX on x86-64 }
          RS_EAX:
            result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ECX:
            result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDX:
            result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBX:
            result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESP:
            result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBP:
            result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESI:
            { Ch_RMemEDI covers string instructions that read through ESI. }
            result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDI:
            { Ch_WMemEDI covers string instructions that write through EDI. }
            result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if getregtype(reg)=R_MMREGISTER then
      begin
        { Only XMM0 appears as an implicit operand. }
        case getsupreg(reg) of
          RS_XMM0:
            result:=([Ch_RXMM0,Ch_WXMM0,Ch_RWXMM0,Ch_MXMM0]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
      begin
        { Whole-flags access (or a condition-code read) counts for any
          flags sub-register. }
        if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
          exit(true);
        { Otherwise match the individual flag bit asked about. }
        case getsubreg(reg) of
          R_SUBFLAGCARRY:
            Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGPARITY:
            Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGAUXILIARY:
            Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGZERO:
            Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGSIGN:
            Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGOVERFLOW:
            Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGINTERRUPT:
            Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGDIRECTION:
            Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBW,R_SUBD,R_SUBQ:
            { Everything except the direction bits }
            Result:=
              ([Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
                Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
                Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
                Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag
               ]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    { Fall back to the generic operand scan of the base optimizer. }
    Result:=inherited RegInInstruction(Reg, p1);
  end;
{ Returns True if instruction p1 (potentially) writes register reg,
  either through an explicit destination operand, an implicit fixed
  register, or by changing FLAGS bits. }
function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
  const
    { For operand n, the change flags that mean "operand n is written". }
    WriteOps: array[0..3] of set of TInsChange =
      ([CH_RWOP1,CH_WOP1,CH_MOP1],
       [Ch_RWOP2,Ch_WOP2,Ch_MOP2],
       [Ch_RWOP3,Ch_WOP3,Ch_MOP3],
       [Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
  var
    OperIdx: Integer;
  begin
    Result := False;
    if p1.typ <> ait_instruction then
      exit;
    with insprop[taicpu(p1).opcode] do
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          { Flag writes per individual flag bit; the whole-register
            sub-registers match a write to any bit. }
          case getsubreg(reg) of
            R_SUBW,R_SUBD,R_SUBQ:
              Result :=
                [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                 Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                 Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGCARRY:
              Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGPARITY:
              Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGAUXILIARY:
              Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGZERO:
              Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGSIGN:
              Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGOVERFLOW:
              Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGINTERRUPT:
              Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGDIRECTION:
              Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            else
              internalerror(2017042602);
          end;
          exit;
        end;
    { Opcodes whose write behaviour is not (fully) described by the
      change-information table. }
    case taicpu(p1).opcode of
      A_CALL:
        { We could potentially set Result to False if the register in
          question is non-volatile for the subroutine's calling convention,
          but this would require detecting the calling convention in use and
          also assuming that the routine doesn't contain malformed assembly
          language, for example... so it could only be done under -O4 as it
          would be considered a side-effect. [Kit] }
        Result := True;
      A_MOVSD:
        { special handling for SSE MOVSD }
        if (taicpu(p1).ops>0) then
          begin
            if taicpu(p1).ops<>2 then
              internalerror(2017042703);
            Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
          end;
      { VMOVSS and VMOVSD has two and three operand flavours, this cannot modelled by x86ins.dat
        so fix it here (FK)
      }
      A_VMOVSS,
      A_VMOVSD:
        begin
          Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
          exit;
        end;
      A_IMUL:
        { IMUL always writes its last operand, whatever the form. }
        Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
      else
        ;
    end;
    if Result then
      exit;
    with insprop[taicpu(p1).opcode] do
      begin
        if getregtype(reg)=R_INTREGISTER then
          begin
            { Implicit writes to the fixed integer registers. }
            case getsupreg(reg) of
              RS_EAX:
                if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ECX:
                if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDX:
                if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBX:
                if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESP:
                if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBP:
                if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESI:
                if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDI:
                if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
            end;
          end;
        { Finally, explicit register operands marked as written. }
        for OperIdx := 0 to taicpu(p1).ops - 1 do
          if (WriteOps[OperIdx]*Ch<>[]) and
             { The register doesn't get modified inside a reference }
             (taicpu(p1).oper[OperIdx]^.typ = top_reg) and
             SuperRegistersEqual(reg,taicpu(p1).oper[OperIdx]^.reg) then
            begin
              Result := true;
              exit
            end;
      end;
  end;
{$ifdef DEBUG_AOPTCPU}
  { Inserts s as a comment into the assembler list just before p, so the
    optimization shows up in the generated assembler output. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
    begin
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
    end;

  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := tostr(i);
    end;

  { Returns the register name with a '%' prefix (AT&T syntax). }
  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '%' + std_regname(r);
    end;

  { Debug output function - creates a string representation of an operator }
  function debug_operstr(oper: TOper): string;
    begin
      case oper.typ of
        top_const:
          Result := '$' + debug_tostr(oper.val);
        top_reg:
          Result := debug_regname(oper.reg);
        top_ref:
          { AT&T-style reference: offset(base,index,scale). }
          begin
            if oper.ref^.offset <> 0 then
              Result := debug_tostr(oper.ref^.offset) + '('
            else
              Result := '(';
            if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
              begin
                Result := Result + debug_regname(oper.ref^.base);
                if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                  Result := Result + ',' + debug_regname(oper.ref^.index);
              end
            else
              { Index without base. }
              if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                Result := Result + debug_regname(oper.ref^.index);
            if (oper.ref^.scalefactor > 1) then
              Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
            else
              Result := Result + ')';
          end;
        else
          Result := '[UNKNOWN]';
      end;
    end;

  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := std_op2str[opcode];
    end;

  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := gas_opsize2str[opsize];
    end;

{$else DEBUG_AOPTCPU}
  { Debugging disabled: the helpers compile to no-ops / empty strings so
    call sites need no conditional compilation of their own. }
  procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
    begin
    end;

  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := '';
    end;

  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '';
    end;

  function debug_operstr(oper: TOper): string; inline;
    begin
      Result := '';
    end;

  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := '';
    end;

  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := '';
    end;
{$endif DEBUG_AOPTCPU}
{ Returns True when the peephole optimizer may introduce MOVZX on the
  current target and optimization settings. }
class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
  begin
{$ifdef x86_64}
    { Always fine on x86-64 }
    Result := True;
{$else x86_64}
    Result :=
{$ifdef i8086}
      { MOVZX only exists from the 386 onwards }
      (current_settings.cputype >= cpu_386) and
{$endif i8086}
      (
        { Always accept if optimising for size }
        (cs_opt_size in current_settings.optimizerswitches) or
        { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
        (current_settings.optimizecputype >= cpu_Pentium2)
      );
{$endif x86_64}
  end;
  946. { Attempts to allocate a volatile integer register for use between p and hp,
  947. using AUsedRegs for the current register usage information. Returns NR_NO
  948. if no free register could be found }
  949. function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
  950. var
  951. RegSet: TCPURegisterSet;
  952. CurrentSuperReg: Integer;
  953. CurrentReg: TRegister;
  954. Currentp: tai;
  955. Breakout: Boolean;
  956. begin
  957. { TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
  958. Result := NR_NO;
  959. RegSet := paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption);
  960. for CurrentSuperReg in RegSet do
  961. begin
  962. CurrentReg := newreg(R_INTREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
  963. if not AUsedRegs[R_INTREGISTER].IsUsed(CurrentReg) then
  964. begin
  965. Currentp := p;
  966. Breakout := False;
  967. while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
  968. begin
  969. case Currentp.typ of
  970. ait_instruction:
  971. begin
  972. if RegInInstruction(CurrentReg, Currentp) then
  973. begin
  974. Breakout := True;
  975. Break;
  976. end;
  977. { Cannot allocate across an unconditional jump }
  978. if is_calljmpuncondret(taicpu(Currentp).opcode) then
  979. Exit;
  980. end;
  981. ait_marker:
  982. { Don't try anything more if a marker is hit }
  983. Exit;
  984. ait_regalloc:
  985. if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
  986. begin
  987. Breakout := True;
  988. Break;
  989. end;
  990. else
  991. ;
  992. end;
  993. end;
  994. if Breakout then
  995. { Try the next register }
  996. Continue;
  997. { We have a free register available }
  998. Result := CurrentReg;
  999. AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
  1000. Exit;
  1001. end;
  1002. end;
  1003. end;
  1004. { Attempts to allocate a volatile MM register for use between p and hp,
  1005. using AUsedRegs for the current register usage information. Returns NR_NO
  1006. if no free register could be found }
  1007. function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
  1008. var
  1009. RegSet: TCPURegisterSet;
  1010. CurrentSuperReg: Integer;
  1011. CurrentReg: TRegister;
  1012. Currentp: tai;
  1013. Breakout: Boolean;
  1014. begin
  1015. { TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
  1016. Result := NR_NO;
  1017. RegSet := paramanager.get_volatile_registers_mm(current_procinfo.procdef.proccalloption);
  1018. for CurrentSuperReg in RegSet do
  1019. begin
  1020. CurrentReg := newreg(R_MMREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
  1021. if not AUsedRegs[R_MMREGISTER].IsUsed(CurrentReg) then
  1022. begin
  1023. Currentp := p;
  1024. Breakout := False;
  1025. while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
  1026. begin
  1027. case Currentp.typ of
  1028. ait_instruction:
  1029. begin
  1030. if RegInInstruction(CurrentReg, Currentp) then
  1031. begin
  1032. Breakout := True;
  1033. Break;
  1034. end;
  1035. { Cannot allocate across an unconditional jump }
  1036. if is_calljmpuncondret(taicpu(Currentp).opcode) then
  1037. Exit;
  1038. end;
  1039. ait_marker:
  1040. { Don't try anything more if a marker is hit }
  1041. Exit;
  1042. ait_regalloc:
  1043. if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
  1044. begin
  1045. Breakout := True;
  1046. Break;
  1047. end;
  1048. else
  1049. ;
  1050. end;
  1051. end;
  1052. if Breakout then
  1053. { Try the next register }
  1054. Continue;
  1055. { We have a free register available }
  1056. Result := CurrentReg;
  1057. AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
  1058. Exit;
  1059. end;
  1060. end;
  1061. end;
  1062. function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  1063. begin
  1064. if not SuperRegistersEqual(reg1,reg2) then
  1065. exit(false);
  1066. if getregtype(reg1)<>R_INTREGISTER then
  1067. exit(true); {because SuperRegisterEqual is true}
  1068. case getsubreg(reg1) of
  1069. { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
  1070. higher, it preserves the high bits, so the new value depends on
  1071. reg2's previous value. In other words, it is equivalent to doing:
  1072. reg2 := (reg2 and $ffffff00) or byte(reg1); }
  1073. R_SUBL:
  1074. exit(getsubreg(reg2)=R_SUBL);
  1075. { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
  1076. higher, it actually does a:
  1077. reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
  1078. R_SUBH:
  1079. exit(getsubreg(reg2)=R_SUBH);
  1080. { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
  1081. bits of reg2:
  1082. reg2 := (reg2 and $ffff0000) or word(reg1); }
  1083. R_SUBW:
  1084. exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
  1085. { a write to R_SUBD always overwrites every other subregister,
  1086. because it clears the high 32 bits of R_SUBQ on x86_64 }
  1087. R_SUBD,
  1088. R_SUBQ:
  1089. exit(true);
  1090. else
  1091. internalerror(2017042801);
  1092. end;
  1093. end;
  1094. function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  1095. begin
  1096. if not SuperRegistersEqual(reg1,reg2) then
  1097. exit(false);
  1098. if getregtype(reg1)<>R_INTREGISTER then
  1099. exit(true); {because SuperRegisterEqual is true}
  1100. case getsubreg(reg1) of
  1101. R_SUBL:
  1102. exit(getsubreg(reg2)<>R_SUBH);
  1103. R_SUBH:
  1104. exit(getsubreg(reg2)<>R_SUBL);
  1105. R_SUBW,
  1106. R_SUBD,
  1107. R_SUBQ:
  1108. exit(true);
  1109. else
  1110. internalerror(2017042802);
  1111. end;
  1112. end;
function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  var
    hp1 : tai;
    l : TCGInt;
  begin
    result:=false;
    { changes the code sequence
      shr/sar const1, x
      shl const2, x
      to
      either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
    if GetNextInstruction(p, hp1) and
       MatchInstruction(hp1,A_SHL,[]) and
       (taicpu(p).oper[0]^.typ = top_const) and
       (taicpu(hp1).oper[0]^.typ = top_const) and
       (taicpu(hp1).opsize = taicpu(p).opsize) and
       (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
       OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
      begin
        { NOTE(review): result stays False even when the instructions are
          rewritten below - confirm callers do not rely on the return value
          to detect a change. }
        if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
           not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 > const2 }
            { Keep a reduced shift (const1-const2), then mask off the bits
              the shl would have cleared. }
            taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
            taicpu(hp1).opcode := A_AND;
            { hp1's operand 0 still holds const2 here; the mask clears the
              low const2 bits (inverted within the operand size below). }
            l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
            case taicpu(p).opsize Of
              S_B: taicpu(hp1).loadConst(0,l Xor $ff);
              S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
              S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050703)
            end;
          end
        else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
                not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 < const2 }
            { Keep a reduced shl (const2-const1) in hp1, and turn p into the
              mask that the original shr/shl pair would have produced. }
            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050702)
            end;
          end
        else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 = const2 }
            { The shifts cancel except for clearing the low const1 bits:
              replace the pair by a single AND and drop hp1. }
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050701)
            end;
            RemoveInstruction(hp1);
          end;
      end;
  end;
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  var
    opsize : topsize;
    hp1 : tai;
    tmpref : treference;
    ShiftValue : Cardinal;
    BaseValue : TCGInt;
  begin
    result:=false;
    opsize:=taicpu(p).opsize;
    { changes certain "imul const, %reg"'s to lea sequences }
    if (MatchOpType(taicpu(p),top_const,top_reg) or
        MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
       (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
      if (taicpu(p).oper[0]^.val = 1) then
        if (taicpu(p).ops = 2) then
          { remove "imul $1, reg" }
          begin
            DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
            Result := RemoveCurrentP(p);
          end
        else
          { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
          begin
            hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
            p.free;
            p := hp1;
          end
      else if ((taicpu(p).ops <= 2) or
               (taicpu(p).oper[2]^.typ = Top_Reg)) and
              not(cs_opt_size in current_settings.optimizerswitches) and
              { don't touch the imul if the following Jcc tests the overflow
                flag, which lea/shl do not set }
              (not(GetNextInstruction(p, hp1)) or
               not((tai(hp1).typ = ait_instruction) and
                   ((taicpu(hp1).opcode=A_Jcc) and
                    (taicpu(hp1).condition in [C_O,C_NO])))) then
        begin
          {
            imul X, reg1, reg2 to
            lea (reg1,reg1,Y), reg2
            shl ZZ,reg2
            imul XX, reg1 to
            lea (reg1,reg1,YY), reg1
            shl ZZ,reg2
            This optimization makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
            it does not exist as a separate optimization target in FPC though.
            This optimization can be applied as long as only two bits are set in the constant and those two bits are separated by
            at most two zeros
          }
          reference_reset(tmpref,1,[]);
          if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
            begin
              { Split the constant into BaseValue shl ShiftValue, where
                BaseValue must be 3, 5 or 9 so LEA's scale factor (2, 4, 8)
                can produce it. }
              ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
              BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
              TmpRef.base := taicpu(p).oper[1]^.reg;
              TmpRef.index := taicpu(p).oper[1]^.reg;
              if not(BaseValue in [3,5,9]) then
                Internalerror(2018110101);
              TmpRef.ScaleFactor := BaseValue-1;
              if (taicpu(p).ops = 2) then
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
              else
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
              AsmL.InsertAfter(hp1,p);
              DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
              { Preserve source location info before removing the imul. }
              taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
              RemoveCurrentP(p, hp1);
              { Append the shl after the lea when the low set bit was not
                bit 0. }
              if ShiftValue>0 then
                AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
            end;
        end;
  end;
  1260. function TX86AsmOptimizer.PrePeepholeOptAND(var p : tai) : boolean;
  1261. begin
  1262. Result := False;
  1263. if MatchOperand(taicpu(p).oper[0]^, 0) and
  1264. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
  1265. begin
  1266. DebugMsg(SPeepholeOptimization + 'AND 0 -> MOV 0', p);
  1267. taicpu(p).opcode := A_MOV;
  1268. Result := True;
  1269. end;
  1270. end;
  1271. function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  1272. var
  1273. p: taicpu absolute hp;
  1274. i: Integer;
  1275. begin
  1276. Result := False;
  1277. if not assigned(hp) or
  1278. (hp.typ <> ait_instruction) then
  1279. Exit;
  1280. // p := taicpu(hp);
  1281. Prefetch(insprop[p.opcode]);
  1282. if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
  1283. with insprop[p.opcode] do
  1284. begin
  1285. case getsubreg(reg) of
  1286. R_SUBW,R_SUBD,R_SUBQ:
  1287. Result:=
  1288. RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
  1289. RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
  1290. RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
  1291. RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
  1292. RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
  1293. RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
  1294. R_SUBFLAGCARRY:
  1295. Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
  1296. R_SUBFLAGPARITY:
  1297. Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
  1298. R_SUBFLAGAUXILIARY:
  1299. Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
  1300. R_SUBFLAGZERO:
  1301. Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
  1302. R_SUBFLAGSIGN:
  1303. Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
  1304. R_SUBFLAGOVERFLOW:
  1305. Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
  1306. R_SUBFLAGINTERRUPT:
  1307. Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
  1308. R_SUBFLAGDIRECTION:
  1309. Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
  1310. else
  1311. begin
  1312. writeln(getsubreg(reg));
  1313. internalerror(2017050501);
  1314. end;
  1315. end;
  1316. exit;
  1317. end;
  1318. { Handle special cases first }
  1319. case p.opcode of
  1320. A_MOV, A_MOVZX, A_MOVSX, A_LEA, A_VMOVSS, A_VMOVSD, A_VMOVAPD,
  1321. A_VMOVAPS, A_VMOVQ, A_MOVSS, A_MOVSD, A_MOVQ, A_MOVAPD, A_MOVAPS:
  1322. begin
  1323. Result :=
  1324. (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
  1325. (p.oper[1]^.typ = top_reg) and
  1326. (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
  1327. (
  1328. (p.oper[0]^.typ = top_const) or
  1329. (
  1330. (p.oper[0]^.typ = top_reg) and
  1331. not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))
  1332. ) or (
  1333. (p.oper[0]^.typ = top_ref) and
  1334. not RegInRef(reg,p.oper[0]^.ref^)
  1335. )
  1336. );
  1337. end;
  1338. A_MUL, A_IMUL:
  1339. Result :=
  1340. (
  1341. (p.ops=3) and { IMUL only }
  1342. (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
  1343. (
  1344. (
  1345. (p.oper[1]^.typ=top_reg) and
  1346. not Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg)
  1347. ) or (
  1348. (p.oper[1]^.typ=top_ref) and
  1349. not RegInRef(reg,p.oper[1]^.ref^)
  1350. )
  1351. )
  1352. ) or (
  1353. (
  1354. (p.ops=1) and
  1355. (
  1356. (
  1357. (
  1358. (p.oper[0]^.typ=top_reg) and
  1359. not Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg)
  1360. )
  1361. ) or (
  1362. (p.oper[0]^.typ=top_ref) and
  1363. not RegInRef(reg,p.oper[0]^.ref^)
  1364. )
  1365. ) and (
  1366. (
  1367. (p.opsize=S_B) and
  1368. Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and
  1369. not Reg1ReadDependsOnReg2(NR_AL,reg)
  1370. ) or (
  1371. (p.opsize=S_W) and
  1372. Reg1WriteOverwritesReg2Entirely(NR_DX,reg)
  1373. ) or (
  1374. (p.opsize=S_L) and
  1375. Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)
  1376. {$ifdef x86_64}
  1377. ) or (
  1378. (p.opsize=S_Q) and
  1379. Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)
  1380. {$endif x86_64}
  1381. )
  1382. )
  1383. )
  1384. );
  1385. A_CBW:
  1386. Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg));
  1387. {$ifndef x86_64}
  1388. A_LDS:
  1389. Result := (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1390. A_LES:
  1391. Result := (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^));
  1392. {$endif not x86_64}
  1393. A_LFS:
  1394. Result := (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1395. A_LGS:
  1396. Result := (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1397. A_LSS:
  1398. Result := (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1399. A_LAHF{$ifndef x86_64}, A_AAM{$endif not x86_64}:
  1400. Result := Reg1WriteOverwritesReg2Entirely(NR_AH,reg);
  1401. A_LODSB:
  1402. Result := Reg1WriteOverwritesReg2Entirely(NR_AL,reg);
  1403. A_LODSW:
  1404. Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg);
  1405. {$ifdef x86_64}
  1406. A_LODSQ:
  1407. Result := Reg1WriteOverwritesReg2Entirely(NR_RAX,reg);
  1408. {$endif x86_64}
  1409. A_LODSD:
  1410. Result := Reg1WriteOverwritesReg2Entirely(NR_EAX,reg);
  1411. A_FSTSW, A_FNSTSW:
  1412. Result := (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg);
  1413. else
  1414. begin
  1415. with insprop[p.opcode] do
  1416. begin
  1417. if (
  1418. { xor %reg,%reg etc. is classed as a new value }
  1419. (([Ch_NoReadIfEqualRegs]*Ch)<>[]) and
  1420. MatchOpType(p, top_reg, top_reg) and
  1421. (p.oper[0]^.reg = p.oper[1]^.reg) and
  1422. Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)
  1423. ) then
  1424. begin
  1425. Result := True;
  1426. Exit;
  1427. end;
  1428. { Make sure the entire register is overwritten }
  1429. if (getregtype(reg) = R_INTREGISTER) then
  1430. begin
  1431. if (p.ops > 0) then
  1432. begin
  1433. if RegInOp(reg, p.oper[0]^) then
  1434. begin
  1435. if (p.oper[0]^.typ = top_ref) then
  1436. begin
  1437. if RegInRef(reg, p.oper[0]^.ref^) then
  1438. begin
  1439. Result := False;
  1440. Exit;
  1441. end;
  1442. end
  1443. else if (p.oper[0]^.typ = top_reg) then
  1444. begin
  1445. if ([Ch_ROp1, Ch_RWOp1, Ch_MOp1]*Ch<>[]) then
  1446. begin
  1447. Result := False;
  1448. Exit;
  1449. end
  1450. else if ([Ch_WOp1]*Ch<>[]) then
  1451. begin
  1452. if Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg, reg) then
  1453. Result := True
  1454. else
  1455. begin
  1456. Result := False;
  1457. Exit;
  1458. end;
  1459. end;
  1460. end;
  1461. end;
  1462. if (p.ops > 1) then
  1463. begin
  1464. if RegInOp(reg, p.oper[1]^) then
  1465. begin
  1466. if (p.oper[1]^.typ = top_ref) then
  1467. begin
  1468. if RegInRef(reg, p.oper[1]^.ref^) then
  1469. begin
  1470. Result := False;
  1471. Exit;
  1472. end;
  1473. end
  1474. else if (p.oper[1]^.typ = top_reg) then
  1475. begin
  1476. if ([Ch_ROp2, Ch_RWOp2, Ch_MOp2]*Ch<>[]) then
  1477. begin
  1478. Result := False;
  1479. Exit;
  1480. end
  1481. else if ([Ch_WOp2]*Ch<>[]) then
  1482. begin
  1483. if Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg, reg) then
  1484. Result := True
  1485. else
  1486. begin
  1487. Result := False;
  1488. Exit;
  1489. end;
  1490. end;
  1491. end;
  1492. end;
  1493. if (p.ops > 2) then
  1494. begin
  1495. if RegInOp(reg, p.oper[2]^) then
  1496. begin
  1497. if (p.oper[2]^.typ = top_ref) then
  1498. begin
  1499. if RegInRef(reg, p.oper[2]^.ref^) then
  1500. begin
  1501. Result := False;
  1502. Exit;
  1503. end;
  1504. end
  1505. else if (p.oper[2]^.typ = top_reg) then
  1506. begin
  1507. if ([Ch_ROp3, Ch_RWOp3, Ch_MOp3]*Ch<>[]) then
  1508. begin
  1509. Result := False;
  1510. Exit;
  1511. end
  1512. else if ([Ch_WOp3]*Ch<>[]) then
  1513. begin
  1514. if Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg, reg) then
  1515. Result := True
  1516. else
  1517. begin
  1518. Result := False;
  1519. Exit;
  1520. end;
  1521. end;
  1522. end;
  1523. end;
  1524. if (p.ops > 3) and RegInOp(reg, p.oper[3]^) then
  1525. begin
  1526. if (p.oper[3]^.typ = top_ref) then
  1527. begin
  1528. if RegInRef(reg, p.oper[3]^.ref^) then
  1529. begin
  1530. Result := False;
  1531. Exit;
  1532. end;
  1533. end
  1534. else if (p.oper[3]^.typ = top_reg) then
  1535. begin
  1536. if ([Ch_ROp4, Ch_RWOp4, Ch_MOp4]*Ch<>[]) then
  1537. begin
  1538. Result := False;
  1539. Exit;
  1540. end
  1541. else if ([Ch_WOp4]*Ch<>[]) then
  1542. begin
  1543. if Reg1WriteOverwritesReg2Entirely(p.oper[3]^.reg, reg) then
  1544. Result := True
  1545. else
  1546. begin
  1547. Result := False;
  1548. Exit;
  1549. end;
  1550. end;
  1551. end;
  1552. end;
  1553. end;
  1554. end;
  1555. end;
  1556. { Don't do these ones first in case an input operand is equal to an explicit output registers }
  1557. case getsupreg(reg) of
  1558. RS_EAX:
  1559. if ([Ch_WEAX{$ifdef x86_64},Ch_WRAX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EAX, reg) then
  1560. begin
  1561. Result := True;
  1562. Exit;
  1563. end;
  1564. RS_ECX:
  1565. if ([Ch_WECX{$ifdef x86_64},Ch_WRCX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ECX, reg) then
  1566. begin
  1567. Result := True;
  1568. Exit;
  1569. end;
  1570. RS_EDX:
  1571. if ([Ch_REDX{$ifdef x86_64},Ch_WRDX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDX, reg) then
  1572. begin
  1573. Result := True;
  1574. Exit;
  1575. end;
  1576. RS_EBX:
  1577. if ([Ch_WEBX{$ifdef x86_64},Ch_WRBX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBX, reg) then
  1578. begin
  1579. Result := True;
  1580. Exit;
  1581. end;
  1582. RS_ESP:
  1583. if ([Ch_WESP{$ifdef x86_64},Ch_WRSP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESP, reg) then
  1584. begin
  1585. Result := True;
  1586. Exit;
  1587. end;
  1588. RS_EBP:
  1589. if ([Ch_WEBP{$ifdef x86_64},Ch_WRBP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBP, reg) then
  1590. begin
  1591. Result := True;
  1592. Exit;
  1593. end;
  1594. RS_ESI:
  1595. if ([Ch_WESI{$ifdef x86_64},Ch_WRSI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESI, reg) then
  1596. begin
  1597. Result := True;
  1598. Exit;
  1599. end;
  1600. RS_EDI:
  1601. if ([Ch_WEDI{$ifdef x86_64},Ch_WRDI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDI, reg) then
  1602. begin
  1603. Result := True;
  1604. Exit;
  1605. end;
  1606. else
  1607. ;
  1608. end;
  1609. end;
  1610. end;
  1611. end;
  1612. end;
  1613. end;
{ Returns True if p starts the exit sequence of a routine, i.e. a RET
  instruction, optionally preceded by the frame-teardown code that the
  code generator emits: LEAVE, a stack-pointer restore via LEA, or a
  frame-pointer copy into the stack pointer followed by a POP of the
  frame pointer.  A leading NOP is skipped first. }
class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
  var
    hp2,hp3 : tai;
  begin
    { some x86-64 issue a NOP before the real exit code }
    if MatchInstruction(p,A_NOP,[]) then
      GetNextInstruction(p,p);
    result:=assigned(p) and (p.typ=ait_instruction) and
      { Case 1: a plain RET }
      ((taicpu(p).opcode = A_RET) or
      { Case 2: LEAVE followed by RET }
       ((taicpu(p).opcode=A_LEAVE) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_RET,[S_NO])
       ) or
      { Case 3: "lea x(%esp),%esp" (stack adjustment) followed by RET }
       (((taicpu(p).opcode=A_LEA) and
         MatchOpType(taicpu(p),top_ref,top_reg) and
         (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
         (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
        ) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_RET,[S_NO])
       ) or
      { Case 4: restore stack pointer from the frame pointer (either via
        MOV or via LEA with the frame pointer as base), then POP the
        frame pointer, then RET }
       ((((taicpu(p).opcode=A_MOV) and
          MatchOpType(taicpu(p),top_reg,top_reg) and
          (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
          (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
         ((taicpu(p).opcode=A_LEA) and
          MatchOpType(taicpu(p),top_ref,top_reg) and
          (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
          (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
         )
        ) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
        MatchOpType(taicpu(hp2),top_reg) and
        (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
        GetNextInstruction(hp2,hp3) and
        MatchInstruction(hp3,A_RET,[S_NO])
       )
      );
  end;
  1654. class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  1655. begin
  1656. isFoldableArithOp := False;
  1657. case hp1.opcode of
  1658. A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
  1659. isFoldableArithOp :=
  1660. ((taicpu(hp1).oper[0]^.typ = top_const) or
  1661. ((taicpu(hp1).oper[0]^.typ = top_reg) and
  1662. (taicpu(hp1).oper[0]^.reg <> reg))) and
  1663. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1664. (taicpu(hp1).oper[1]^.reg = reg);
  1665. A_INC,A_DEC,A_NEG,A_NOT:
  1666. isFoldableArithOp :=
  1667. (taicpu(hp1).oper[0]^.typ = top_reg) and
  1668. (taicpu(hp1).oper[0]^.reg = reg);
  1669. else
  1670. ;
  1671. end;
  1672. end;
  1673. procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);
  1674. procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
  1675. var
  1676. hp2: tai;
  1677. begin
  1678. hp2 := p;
  1679. repeat
  1680. hp2 := tai(hp2.previous);
  1681. if assigned(hp2) and
  1682. (hp2.typ = ait_regalloc) and
  1683. (tai_regalloc(hp2).ratype=ra_dealloc) and
  1684. (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
  1685. (getsupreg(tai_regalloc(hp2).reg) = supreg) then
  1686. begin
  1687. RemoveInstruction(hp2);
  1688. break;
  1689. end;
  1690. until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
  1691. end;
  1692. begin
  1693. case current_procinfo.procdef.returndef.typ of
  1694. arraydef,recorddef,pointerdef,
  1695. stringdef,enumdef,procdef,objectdef,errordef,
  1696. filedef,setdef,procvardef,
  1697. classrefdef,forwarddef:
  1698. DoRemoveLastDeallocForFuncRes(RS_EAX);
  1699. orddef:
  1700. if current_procinfo.procdef.returndef.size <> 0 then
  1701. begin
  1702. DoRemoveLastDeallocForFuncRes(RS_EAX);
  1703. { for int64/qword }
  1704. if current_procinfo.procdef.returndef.size = 8 then
  1705. DoRemoveLastDeallocForFuncRes(RS_EDX);
  1706. end;
  1707. else
  1708. ;
  1709. end;
  1710. end;
{ Pass-1 peephole optimisations for (V)MOVAPS/(V)MOVAPD and friends:
  removes no-op moves, merges chained aligned moves, folds an aligned
  move feeding an FMA operand, and shortens movapX/op/movapX triples.
  Returns True when the instruction list was changed. }
function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
  var
    hp1,hp2 : tai;
  begin
    result:=false;
    if MatchOpType(taicpu(p),top_reg,top_reg) then
      begin
        { vmova* reg1,reg1
          =>
          <nop> }
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
          begin
            RemoveCurrentP(p);
            result:=true;
            exit;
          end
        else if GetNextInstruction(p,hp1) then
          begin
            { next instruction is the same kind of aligned move? }
            if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmova* reg2,reg3
                  dealloc reg2
                  =>
                  vmova* reg1,reg3 }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                   not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
                { special case:
                  vmova* reg1,<op>
                  vmova* <op>,reg1
                  =>
                  vmova* reg1,<op>
                  (skipped when <op> is a volatile-read reference, as the
                  second read must then be kept) }
                else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
                   ((taicpu(p).oper[0]^.typ<>top_ref) or
                    (not(vol_read in taicpu(p).oper[0]^.ref^.volatility))
                   ) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
              end
            { aligned move followed by a matching scalar move of the same
              precision }
            else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
                      MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
                     ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
                       MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
                    ) and
                    MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmovs* reg2,<op>
                  dealloc reg2
                  =>
                  vmovs* reg1,reg3 }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                    { turn p into the scalar move directly }
                    taicpu(p).opcode:=taicpu(hp1).opcode;
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
              end;
          end;
        { look further ahead for the first instruction using the move's
          destination register }
        if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
          begin
            { fold "vmova* x,reg2 ; vfma* ...,reg2 ; vmova* reg2,x" so the
              FMA works on x directly }
            if MatchInstruction(hp1,[A_VFMADDPD,
                                     A_VFMADD132PD,
                                     A_VFMADD132PS,
                                     A_VFMADD132SD,
                                     A_VFMADD132SS,
                                     A_VFMADD213PD,
                                     A_VFMADD213PS,
                                     A_VFMADD213SD,
                                     A_VFMADD213SS,
                                     A_VFMADD231PD,
                                     A_VFMADD231PS,
                                     A_VFMADD231SD,
                                     A_VFMADD231SS,
                                     A_VFMADDSUB132PD,
                                     A_VFMADDSUB132PS,
                                     A_VFMADDSUB213PD,
                                     A_VFMADDSUB213PS,
                                     A_VFMADDSUB231PD,
                                     A_VFMADDSUB231PS,
                                     A_VFMSUB132PD,
                                     A_VFMSUB132PS,
                                     A_VFMSUB132SD,
                                     A_VFMSUB132SS,
                                     A_VFMSUB213PD,
                                     A_VFMSUB213PS,
                                     A_VFMSUB213SD,
                                     A_VFMSUB213SS,
                                     A_VFMSUB231PD,
                                     A_VFMSUB231PS,
                                     A_VFMSUB231SD,
                                     A_VFMSUB231SS,
                                     A_VFMSUBADD132PD,
                                     A_VFMSUBADD132PS,
                                     A_VFMSUBADD213PD,
                                     A_VFMSUBADD213PS,
                                     A_VFMSUBADD231PD,
                                     A_VFMSUBADD231PS,
                                     A_VFNMADD132PD,
                                     A_VFNMADD132PS,
                                     A_VFNMADD132SD,
                                     A_VFNMADD132SS,
                                     A_VFNMADD213PD,
                                     A_VFNMADD213PS,
                                     A_VFNMADD213SD,
                                     A_VFNMADD213SS,
                                     A_VFNMADD231PD,
                                     A_VFNMADD231PS,
                                     A_VFNMADD231SD,
                                     A_VFNMADD231SS,
                                     A_VFNMSUB132PD,
                                     A_VFNMSUB132PS,
                                     A_VFNMSUB132SD,
                                     A_VFNMSUB132SS,
                                     A_VFNMSUB213PD,
                                     A_VFNMSUB213PS,
                                     A_VFNMSUB213SD,
                                     A_VFNMSUB213SS,
                                     A_VFNMSUB231PD,
                                     A_VFNMSUB231PS,
                                     A_VFNMSUB231SD,
                                     A_VFNMSUB231SS],[S_NO]) and
               { we mix single and double opperations here because we assume that the compiler
                 generates vmovapd only after double operations and vmovaps only after single operations }
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
               GetNextInstruction(hp1,hp2) and
               MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
               MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                    RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
                    RemoveInstruction(hp2);
                  end;
              end
            else if (hp1.typ = ait_instruction) and
               GetNextInstruction(hp1, hp2) and
               MatchInstruction(hp2,taicpu(p).opcode,[]) and
               OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
               MatchOpType(taicpu(hp2),top_reg,top_reg) and
               MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
               (((taicpu(p).opcode=A_MOVAPS) and
                 ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                  (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
                ((taicpu(p).opcode=A_MOVAPD) and
                 ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                  (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
               ) then
              { change
                movapX reg,reg2
                addsX/subsX/... reg3, reg2
                movapX reg2,reg
                to
                addsX/subsX/... reg3,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                          debug_op2str(taicpu(p).opcode)+' '+
                          debug_op2str(taicpu(hp1).opcode)+' '+
                          debug_op2str(taicpu(hp2).opcode)+') done',p);
                    { we cannot eliminate the first move if
                      the operations uses the same register for source and dest }
                    if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                      RemoveCurrentP(p, nil);
                    { continue optimisation from the arithmetic instruction }
                    p:=hp1;
                    taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                    RemoveInstruction(hp2);
                    result:=true;
                  end;
              end;
          end;
      end;
  end;
  1912. function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
  1913. var
  1914. hp1 : tai;
  1915. begin
  1916. result:=false;
  1917. { replace
  1918. V<Op>X %mreg1,%mreg2,%mreg3
  1919. VMovX %mreg3,%mreg4
  1920. dealloc %mreg3
  1921. by
  1922. V<Op>X %mreg1,%mreg2,%mreg4
  1923. ?
  1924. }
  1925. if GetNextInstruction(p,hp1) and
  1926. { we mix single and double operations here because we assume that the compiler
  1927. generates vmovapd only after double operations and vmovaps only after single operations }
  1928. MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
  1929. MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
  1930. (taicpu(hp1).oper[1]^.typ=top_reg) then
  1931. begin
  1932. TransferUsedRegs(TmpUsedRegs);
  1933. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  1934. if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
  1935. begin
  1936. taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
  1937. DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
  1938. RemoveInstruction(hp1);
  1939. result:=true;
  1940. end;
  1941. end;
  1942. end;
  1943. { Replaces all references to AOldReg in a memory reference to ANewReg }
  1944. class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  1945. begin
  1946. Result := False;
  1947. { For safety reasons, only check for exact register matches }
  1948. { Check base register }
  1949. if (ref.base = AOldReg) then
  1950. begin
  1951. ref.base := ANewReg;
  1952. Result := True;
  1953. end;
  1954. { Check index register }
  1955. if (ref.index = AOldReg) then
  1956. begin
  1957. ref.index := ANewReg;
  1958. Result := True;
  1959. end;
  1960. end;
{ Replaces all references to AOldReg in an operand to ANewReg }
class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  var
    OldSupReg, NewSupReg: TSuperRegister;
    OldSubReg, NewSubReg: TSubRegister;
    OldRegType: TRegisterType;
    ThisOper: POper;
  begin
    ThisOper := p.oper[OperIdx]; { Faster to access overall }
    Result := False;
    { Both registers must be real }
    if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
      InternalError(2020011801);
    OldSupReg := getsupreg(AOldReg);
    OldSubReg := getsubreg(AOldReg);
    OldRegType := getregtype(AOldReg);
    NewSupReg := getsupreg(ANewReg);
    NewSubReg := getsubreg(ANewReg);
    { Old and new registers must agree in type and sub-register size }
    if OldRegType <> getregtype(ANewReg) then
      InternalError(2020011802);
    if OldSubReg <> NewSubReg then
      InternalError(2020011803);
    case ThisOper^.typ of
      top_reg:
        { Match either the exact register, or (for integer registers) any
          sub-register of the same super-register whose sub-register index
          does not exceed AOldReg's }
        if (
            (ThisOper^.reg = AOldReg) or
            (
              (OldRegType = R_INTREGISTER) and
              (getsupreg(ThisOper^.reg) = OldSupReg) and
              (getregtype(ThisOper^.reg) = R_INTREGISTER) and
              (
                (getsubreg(ThisOper^.reg) <= OldSubReg)
{$ifndef x86_64}
                and (
                  { Under i386 and i8086, ESI, EDI, EBP and ESP
                    don't have an 8-bit representation }
                  (getsubreg(ThisOper^.reg) >= R_SUBW) or
                  not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
                )
{$endif x86_64}
              )
            )
          ) then
          begin
            { Keep the operand's own sub-register size when substituting }
            ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
            Result := True;
          end;
      top_ref:
        { Substitute inside memory references as well }
        if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
          Result := True;
      else
        ;
    end;
  end;
  2014. { Replaces all references to AOldReg in an instruction to ANewReg }
  2015. class function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  2016. const
  2017. ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
  2018. var
  2019. OperIdx: Integer;
  2020. begin
  2021. Result := False;
  2022. for OperIdx := 0 to p.ops - 1 do
  2023. if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) then
  2024. begin
  2025. { The shift and rotate instructions can only use CL }
  2026. if not (
  2027. (OperIdx = 0) and
  2028. { This second condition just helps to avoid unnecessarily
  2029. calling MatchInstruction for 10 different opcodes }
  2030. (p.oper[0]^.reg = NR_CL) and
  2031. MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
  2032. ) then
  2033. Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  2034. end
  2035. else if p.oper[OperIdx]^.typ = top_ref then
  2036. { It's okay to replace registers in references that get written to }
  2037. Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  2038. end;
  2039. class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean;
  2040. begin
  2041. with ref^ do
  2042. Result :=
  2043. (index = NR_NO) and
  2044. (
  2045. {$ifdef x86_64}
  2046. (
  2047. (base = NR_RIP) and
  2048. (refaddr in [addr_pic, addr_pic_no_got])
  2049. ) or
  2050. {$endif x86_64}
  2051. (base = NR_STACK_POINTER_REG) or
  2052. (base = current_procinfo.framepointer)
  2053. );
  2054. end;
  2055. function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
  2056. var
  2057. l: asizeint;
  2058. begin
  2059. Result := False;
  2060. { Should have been checked previously }
  2061. if p.opcode <> A_LEA then
  2062. InternalError(2020072501);
  2063. { do not mess with the stack point as adjusting it by lea is recommend, except if we optimize for size }
  2064. if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
  2065. not(cs_opt_size in current_settings.optimizerswitches) then
  2066. exit;
  2067. with p.oper[0]^.ref^ do
  2068. begin
  2069. if (base <> p.oper[1]^.reg) or
  2070. (index <> NR_NO) or
  2071. assigned(symbol) then
  2072. exit;
  2073. l:=offset;
  2074. if (l=1) and UseIncDec then
  2075. begin
  2076. p.opcode:=A_INC;
  2077. p.loadreg(0,p.oper[1]^.reg);
  2078. p.ops:=1;
  2079. DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
  2080. end
  2081. else if (l=-1) and UseIncDec then
  2082. begin
  2083. p.opcode:=A_DEC;
  2084. p.loadreg(0,p.oper[1]^.reg);
  2085. p.ops:=1;
  2086. DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
  2087. end
  2088. else
  2089. begin
  2090. if (l<0) and (l<>-2147483648) then
  2091. begin
  2092. p.opcode:=A_SUB;
  2093. p.loadConst(0,-l);
  2094. DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
  2095. end
  2096. else
  2097. begin
  2098. p.opcode:=A_ADD;
  2099. p.loadConst(0,l);
  2100. DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
  2101. end;
  2102. end;
  2103. end;
  2104. Result := True;
  2105. end;
{ Given "mov %reg1,%reg2" (p_mov) and a later instruction hp that reads
  %reg2, tries to make hp read %reg1 instead in order to break the
  dependency on the MOV.  Returns True if hp was modified; the MOV
  itself is left untouched. }
function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  var
    CurrentReg, ReplaceReg: TRegister;
  begin
    Result := False;
    { ReplaceReg = MOV source, CurrentReg = MOV destination }
    ReplaceReg := taicpu(p_mov).oper[0]^.reg;
    CurrentReg := taicpu(p_mov).oper[1]^.reg;
    case hp.opcode of
      A_FSTSW, A_FNSTSW,
      A_IN, A_INS, A_OUT, A_OUTS,
      A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
        { These routines have explicit operands, but they are restricted in
          what they can be (e.g. IN and OUT can only read from AL, AX or
          EAX.) }
        Exit;
      A_IMUL:
        begin
          { The 1-operand version writes to implicit registers
            The 2-operand version reads from the first operator, and reads
            from and writes to the second (equivalent to Ch_ROp1, Ch_RWOp2).
            the 3-operand version reads from a register that it doesn't write to
          }
          case hp.ops of
            1:
              { don't substitute the implicit accumulator operands:
                AL/AX/EAX always, and DX/EDX for the non-byte sizes }
              if (
                  (
                   (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                  ) or
                   not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
                 ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            2:
              { Only modify the first parameter }
              if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            3:
              { Only modify the second parameter }
              if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            else
              InternalError(2020012901);
          end;
        end;
      else
        { generic case: substitute in every operand flagged as read }
        if (hp.ops > 0) and
           ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
          begin
            Result := True;
            DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
            AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
          end;
    end;
  end;
  2171. function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  2172. var
  2173. hp1, hp2, hp3: tai;
  2174. DoOptimisation, TempBool: Boolean;
  2175. procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
  2176. begin
  2177. if taicpu(hp1).opcode = signed_movop then
  2178. begin
  2179. if taicpu(p).oper[0]^.val > max_value shr 1 then
  2180. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
  2181. end
  2182. else
  2183. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
  2184. end;
  2185. var
  2186. GetNextInstruction_p, TempRegUsed, CrossJump: Boolean;
  2187. PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
  2188. NewSize: topsize;
  2189. CurrentReg, ActiveReg: TRegister;
  2190. SourceRef, TargetRef: TReference;
  2191. MovAligned, MovUnaligned: TAsmOp;
  2192. begin
  2193. Result:=false;
  2194. GetNextInstruction_p:=GetNextInstruction(p, hp1);
  2195. { remove mov reg1,reg1? }
  2196. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^)
  2197. then
  2198. begin
  2199. DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
  2200. { take care of the register (de)allocs following p }
  2201. RemoveCurrentP(p, hp1);
  2202. Result:=true;
  2203. exit;
  2204. end;
  2205. { All the next optimisations require a next instruction }
  2206. if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
  2207. Exit;
  2208. { Look for:
  2209. mov %reg1,%reg2
  2210. ??? %reg2,r/m
  2211. Change to:
  2212. mov %reg1,%reg2
  2213. ??? %reg1,r/m
  2214. }
  2215. if MatchOpType(taicpu(p), top_reg, top_reg) then
  2216. begin
  2217. CurrentReg := taicpu(p).oper[1]^.reg;
  2218. if RegReadByInstruction(CurrentReg, hp1) and
  2219. DeepMOVOpt(taicpu(p), taicpu(hp1)) then
  2220. begin
  2221. { A change has occurred, just not in p }
  2222. Result := True;
  2223. TransferUsedRegs(TmpUsedRegs);
  2224. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2225. if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
  2226. { Just in case something didn't get modified (e.g. an
  2227. implicit register) }
  2228. not RegReadByInstruction(CurrentReg, hp1) then
  2229. begin
  2230. { We can remove the original MOV }
  2231. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
  2232. RemoveCurrentp(p, hp1);
  2233. { UsedRegs got updated by RemoveCurrentp }
  2234. Result := True;
  2235. Exit;
  2236. end;
  2237. { If we know a MOV instruction has become a null operation, we might as well
  2238. get rid of it now to save time. }
  2239. if (taicpu(hp1).opcode = A_MOV) and
  2240. (taicpu(hp1).oper[1]^.typ = top_reg) and
  2241. SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
  2242. { Just being a register is enough to confirm it's a null operation }
  2243. (taicpu(hp1).oper[0]^.typ = top_reg) then
  2244. begin
  2245. Result := True;
  2246. { Speed-up to reduce a pipeline stall... if we had something like...
  2247. movl %eax,%edx
  2248. movw %dx,%ax
  2249. ... the second instruction would change to movw %ax,%ax, but
  2250. given that it is now %ax that's active rather than %eax,
  2251. penalties might occur due to a partial register write, so instead,
  2252. change it to a MOVZX instruction when optimising for speed.
  2253. }
  2254. if not (cs_opt_size in current_settings.optimizerswitches) and
  2255. IsMOVZXAcceptable and
  2256. (taicpu(hp1).opsize < taicpu(p).opsize)
  2257. {$ifdef x86_64}
  2258. { operations already implicitly set the upper 64 bits to zero }
  2259. and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
  2260. {$endif x86_64}
  2261. then
  2262. begin
  2263. CurrentReg := taicpu(hp1).oper[1]^.reg;
  2264. DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
  2265. case taicpu(p).opsize of
  2266. S_W:
  2267. if taicpu(hp1).opsize = S_B then
  2268. taicpu(hp1).opsize := S_BL
  2269. else
  2270. InternalError(2020012911);
  2271. S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
  2272. case taicpu(hp1).opsize of
  2273. S_B:
  2274. taicpu(hp1).opsize := S_BL;
  2275. S_W:
  2276. taicpu(hp1).opsize := S_WL;
  2277. else
  2278. InternalError(2020012912);
  2279. end;
  2280. else
  2281. InternalError(2020012910);
  2282. end;
  2283. taicpu(hp1).opcode := A_MOVZX;
  2284. taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
  2285. end
  2286. else
  2287. begin
  2288. GetNextInstruction_p := GetNextInstruction(hp1, hp2);
  2289. DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
  2290. RemoveInstruction(hp1);
  2291. { The instruction after what was hp1 is now the immediate next instruction,
  2292. so we can continue to make optimisations if it's present }
  2293. if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
  2294. Exit;
  2295. hp1 := hp2;
  2296. end;
  2297. end;
  2298. end;
  2299. end;
  2300. { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
  2301. overwrites the original destination register. e.g.
  2302. movl ###,%reg2d
  2303. movslq ###,%reg2q (### doesn't have to be the same as the first one)
  2304. In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
  2305. }
  2306. if (taicpu(p).oper[1]^.typ = top_reg) and
  2307. MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
  2308. (taicpu(hp1).oper[1]^.typ = top_reg) and
  2309. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
  2310. begin
  2311. if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
  2312. begin
  2313. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  2314. case taicpu(p).oper[0]^.typ of
  2315. top_const:
  2316. { We have something like:
  2317. movb $x, %regb
  2318. movzbl %regb,%regd
  2319. Change to:
  2320. movl $x, %regd
  2321. }
  2322. begin
  2323. case taicpu(hp1).opsize of
  2324. S_BW:
  2325. begin
  2326. convert_mov_value(A_MOVSX, $FF);
  2327. setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
  2328. taicpu(p).opsize := S_W;
  2329. end;
  2330. S_BL:
  2331. begin
  2332. convert_mov_value(A_MOVSX, $FF);
  2333. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  2334. taicpu(p).opsize := S_L;
  2335. end;
  2336. S_WL:
  2337. begin
  2338. convert_mov_value(A_MOVSX, $FFFF);
  2339. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  2340. taicpu(p).opsize := S_L;
  2341. end;
  2342. {$ifdef x86_64}
  2343. S_BQ:
  2344. begin
  2345. convert_mov_value(A_MOVSX, $FF);
  2346. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  2347. taicpu(p).opsize := S_Q;
  2348. end;
  2349. S_WQ:
  2350. begin
  2351. convert_mov_value(A_MOVSX, $FFFF);
  2352. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  2353. taicpu(p).opsize := S_Q;
  2354. end;
  2355. S_LQ:
  2356. begin
  2357. convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
  2358. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  2359. taicpu(p).opsize := S_Q;
  2360. end;
  2361. {$endif x86_64}
  2362. else
  2363. { If hp1 was a MOV instruction, it should have been
  2364. optimised already }
  2365. InternalError(2020021001);
  2366. end;
  2367. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
  2368. RemoveInstruction(hp1);
  2369. Result := True;
  2370. Exit;
  2371. end;
  2372. top_ref:
  2373. { We have something like:
  2374. movb mem, %regb
  2375. movzbl %regb,%regd
  2376. Change to:
  2377. movzbl mem, %regd
  2378. }
  2379. if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
  2380. begin
  2381. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
  2382. taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
  2383. RemoveCurrentP(p, hp1);
  2384. Result:=True;
  2385. Exit;
  2386. end;
  2387. else
  2388. if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
  2389. { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
  2390. Exit;
  2391. end;
  2392. end
  2393. { The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
  2394. and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
  2395. optimised }
  2396. else
  2397. begin
  2398. DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
  2399. RemoveCurrentP(p, hp1);
  2400. Result := True;
  2401. Exit;
  2402. end;
  2403. end;
  2404. if (taicpu(hp1).opcode = A_AND) and
  2405. (taicpu(p).oper[1]^.typ = top_reg) and
  2406. MatchOpType(taicpu(hp1),top_const,top_reg) then
  2407. begin
  2408. if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
  2409. begin
  2410. case taicpu(p).opsize of
  2411. S_L:
  2412. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  2413. begin
  2414. { Optimize out:
  2415. mov x, %reg
  2416. and ffffffffh, %reg
  2417. }
  2418. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
  2419. RemoveInstruction(hp1);
  2420. Result:=true;
  2421. exit;
  2422. end;
  2423. S_Q: { TODO: Confirm if this is even possible }
  2424. if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
  2425. begin
  2426. { Optimize out:
  2427. mov x, %reg
  2428. and ffffffffffffffffh, %reg
  2429. }
  2430. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
  2431. RemoveInstruction(hp1);
  2432. Result:=true;
  2433. exit;
  2434. end;
  2435. else
  2436. ;
  2437. end;
  2438. if ((taicpu(p).oper[0]^.typ=top_reg) or
  2439. ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
  2440. GetNextInstruction(hp1,hp2) and
  2441. MatchInstruction(hp2,A_TEST,[taicpu(p).opsize]) and
  2442. MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) and
  2443. (MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) or
  2444. MatchOperand(taicpu(hp2).oper[0]^,-1)) and
  2445. GetNextInstruction(hp2,hp3) and
  2446. MatchInstruction(hp3,A_Jcc,A_Setcc,[]) and
  2447. (taicpu(hp3).condition in [C_E,C_NE]) then
  2448. begin
  2449. TransferUsedRegs(TmpUsedRegs);
  2450. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2451. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  2452. if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
  2453. begin
  2454. DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
  2455. taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
  2456. taicpu(hp1).opcode:=A_TEST;
  2457. RemoveInstruction(hp2);
  2458. RemoveCurrentP(p, hp1);
  2459. Result:=true;
  2460. exit;
  2461. end;
  2462. end;
  2463. end
  2464. else if IsMOVZXAcceptable and
  2465. (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
  2466. (taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
  2467. (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  2468. then
  2469. begin
  2470. InputVal := debug_operstr(taicpu(p).oper[0]^);
  2471. MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
  2472. case taicpu(p).opsize of
  2473. S_B:
  2474. if (taicpu(hp1).oper[0]^.val = $ff) then
  2475. begin
  2476. { Convert:
  2477. movb x, %regl movb x, %regl
  2478. andw ffh, %regw andl ffh, %regd
  2479. To:
  2480. movzbw x, %regd movzbl x, %regd
  2481. (Identical registers, just different sizes)
  2482. }
  2483. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
  2484. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
  2485. case taicpu(hp1).opsize of
  2486. S_W: NewSize := S_BW;
  2487. S_L: NewSize := S_BL;
  2488. {$ifdef x86_64}
  2489. S_Q: NewSize := S_BQ;
  2490. {$endif x86_64}
  2491. else
  2492. InternalError(2018011510);
  2493. end;
  2494. end
  2495. else
  2496. NewSize := S_NO;
  2497. S_W:
  2498. if (taicpu(hp1).oper[0]^.val = $ffff) then
  2499. begin
  2500. { Convert:
  2501. movw x, %regw
  2502. andl ffffh, %regd
  2503. To:
  2504. movzwl x, %regd
  2505. (Identical registers, just different sizes)
  2506. }
  2507. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
  2508. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
  2509. case taicpu(hp1).opsize of
  2510. S_L: NewSize := S_WL;
  2511. {$ifdef x86_64}
  2512. S_Q: NewSize := S_WQ;
  2513. {$endif x86_64}
  2514. else
  2515. InternalError(2018011511);
  2516. end;
  2517. end
  2518. else
  2519. NewSize := S_NO;
  2520. else
  2521. NewSize := S_NO;
  2522. end;
  2523. if NewSize <> S_NO then
  2524. begin
  2525. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
  2526. { The actual optimization }
  2527. taicpu(p).opcode := A_MOVZX;
  2528. taicpu(p).changeopsize(NewSize);
  2529. taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
  2530. { Safeguard if "and" is followed by a conditional command }
  2531. TransferUsedRegs(TmpUsedRegs);
  2532. UpdateUsedRegs(TmpUsedRegs,tai(p.next));
  2533. if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  2534. begin
  2535. { At this point, the "and" command is effectively equivalent to
  2536. "test %reg,%reg". This will be handled separately by the
  2537. Peephole Optimizer. [Kit] }
  2538. DebugMsg(SPeepholeOptimization + PreMessage +
  2539. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2540. end
  2541. else
  2542. begin
  2543. DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
  2544. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2545. RemoveInstruction(hp1);
  2546. end;
  2547. Result := True;
  2548. Exit;
  2549. end;
  2550. end;
  2551. end;
  2552. if (taicpu(hp1).opcode = A_OR) and
  2553. (taicpu(p).oper[1]^.typ = top_reg) and
  2554. MatchOperand(taicpu(p).oper[0]^, 0) and
  2555. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
  2556. begin
  2557. { mov 0, %reg
  2558. or ###,%reg
  2559. Change to (only if the flags are not used):
  2560. mov ###,%reg
  2561. }
  2562. TransferUsedRegs(TmpUsedRegs);
  2563. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2564. DoOptimisation := True;
  2565. { Even if the flags are used, we might be able to do the optimisation
  2566. if the conditions are predictable }
  2567. if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
  2568. begin
  2569. { Only perform if ### = %reg (the same register) or equal to 0,
  2570. so %reg is guaranteed to still have a value of zero }
  2571. if MatchOperand(taicpu(hp1).oper[0]^, 0) or
  2572. MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) then
  2573. begin
  2574. hp2 := hp1;
  2575. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  2576. while RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
  2577. GetNextInstruction(hp2, hp3) do
  2578. begin
  2579. { Don't continue modifying if the flags state is getting changed }
  2580. if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp3) then
  2581. Break;
  2582. UpdateUsedRegs(TmpUsedRegs, tai(hp3.Next));
  2583. if MatchInstruction(hp3, A_Jcc, A_SETcc, A_CMOVcc, []) then
  2584. begin
  2585. if condition_in(C_E, taicpu(hp3).condition) or (taicpu(hp3).condition in [C_NC, C_NS, C_NO]) then
  2586. begin
  2587. { Condition is always true }
  2588. case taicpu(hp3).opcode of
  2589. A_Jcc:
  2590. begin
  2591. DebugMsg(SPeepholeOptimization + 'Condition is always true (jump made unconditional)', hp3);
  2592. { Check for jump shortcuts before we destroy the condition }
  2593. DoJumpOptimizations(hp3, TempBool);
  2594. MakeUnconditional(taicpu(hp3));
  2595. Result := True;
  2596. end;
  2597. A_CMOVcc:
  2598. begin
  2599. DebugMsg(SPeepholeOptimization + 'Condition is always true (CMOVcc -> MOV)', hp3);
  2600. taicpu(hp3).opcode := A_MOV;
  2601. taicpu(hp3).condition := C_None;
  2602. Result := True;
  2603. end;
  2604. A_SETcc:
  2605. begin
  2606. DebugMsg(SPeepholeOptimization + 'Condition is always true (changed to MOV 1)', hp3);
  2607. { Convert "set(c) %reg" instruction to "movb 1,%reg" }
  2608. taicpu(hp3).opcode := A_MOV;
  2609. taicpu(hp3).ops := 2;
  2610. taicpu(hp3).condition := C_None;
  2611. taicpu(hp3).opsize := S_B;
  2612. taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
  2613. taicpu(hp3).loadconst(0, 1);
  2614. Result := True;
  2615. end;
  2616. else
  2617. InternalError(2021090701);
  2618. end;
  2619. end
  2620. else if (taicpu(hp3).condition in [C_A, C_B, C_C, C_G, C_L, C_NE, C_NZ, C_O, C_S]) then
  2621. begin
  2622. { Condition is always false }
  2623. case taicpu(hp3).opcode of
  2624. A_Jcc:
  2625. begin
  2626. DebugMsg(SPeepholeOptimization + 'Condition is always false (jump removed)', hp3);
  2627. TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol).decrefs;
  2628. RemoveInstruction(hp3);
  2629. Result := True;
  2630. { Since hp3 was deleted, hp2 must not be updated }
  2631. Continue;
  2632. end;
  2633. A_CMOVcc:
  2634. begin
  2635. DebugMsg(SPeepholeOptimization + 'Condition is always false (conditional load removed)', hp3);
  2636. RemoveInstruction(hp3);
  2637. Result := True;
  2638. { Since hp3 was deleted, hp2 must not be updated }
  2639. Continue;
  2640. end;
  2641. A_SETcc:
  2642. begin
  2643. DebugMsg(SPeepholeOptimization + 'Condition is always false (changed to MOV 0)', hp3);
  2644. { Convert "set(c) %reg" instruction to "movb 0,%reg" }
  2645. taicpu(hp3).opcode := A_MOV;
  2646. taicpu(hp3).ops := 2;
  2647. taicpu(hp3).condition := C_None;
  2648. taicpu(hp3).opsize := S_B;
  2649. taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
  2650. taicpu(hp3).loadconst(0, 0);
  2651. Result := True;
  2652. end;
  2653. else
  2654. InternalError(2021090702);
  2655. end;
  2656. end
  2657. else
  2658. { Uncertain what to do - don't optimise (although optimise other conditional statements if present) }
  2659. DoOptimisation := False;
  2660. end;
  2661. hp2 := hp3;
  2662. end;
  2663. { Flags are still in use - don't optimise }
  2664. if DoOptimisation and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
  2665. DoOptimisation := False;
  2666. end
  2667. else
  2668. DoOptimisation := False;
  2669. end;
  2670. if DoOptimisation then
  2671. begin
  2672. {$ifdef x86_64}
  2673. { OR only supports 32-bit sign-extended constants for 64-bit
  2674. instructions, so compensate for this if the constant is
  2675. encoded as a value greater than or equal to 2^31 }
  2676. if (taicpu(hp1).opsize = S_Q) and
  2677. (taicpu(hp1).oper[0]^.typ = top_const) and
  2678. (taicpu(hp1).oper[0]^.val >= $80000000) then
  2679. taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val or $FFFFFFFF00000000;
  2680. {$endif x86_64}
  2681. DebugMsg(SPeepholeOptimization + 'MOV 0 / OR -> MOV', p);
  2682. taicpu(hp1).opcode := A_MOV;
  2683. RemoveCurrentP(p, hp1);
  2684. Result := True;
  2685. Exit;
  2686. end;
  2687. end;
  2688. { Next instruction is also a MOV ? }
  2689. if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
  2690. begin
  2691. if (taicpu(p).oper[1]^.typ = top_reg) and
  2692. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2693. begin
  2694. CurrentReg := taicpu(p).oper[1]^.reg;
  2695. TransferUsedRegs(TmpUsedRegs);
  2696. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2697. { we have
  2698. mov x, %treg
  2699. mov %treg, y
  2700. }
  2701. if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
  2702. if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
  2703. { we've got
  2704. mov x, %treg
  2705. mov %treg, y
  2706. with %treg is not used after }
  2707. case taicpu(p).oper[0]^.typ Of
  2708. { top_reg is covered by DeepMOVOpt }
  2709. top_const:
  2710. begin
  2711. { change
  2712. mov const, %treg
  2713. mov %treg, y
  2714. to
  2715. mov const, y
  2716. }
  2717. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  2718. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2719. begin
  2720. if taicpu(hp1).oper[1]^.typ=top_reg then
  2721. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2722. taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
  2723. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
  2724. RemoveInstruction(hp1);
  2725. Result:=true;
  2726. Exit;
  2727. end;
  2728. end;
  2729. top_ref:
  2730. case taicpu(hp1).oper[1]^.typ of
  2731. top_reg:
  2732. begin
  2733. { change
  2734. mov mem, %treg
  2735. mov %treg, %reg
  2736. to
  2737. mov mem, %reg"
  2738. }
  2739. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2740. taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
  2741. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
  2742. RemoveInstruction(hp1);
  2743. Result:=true;
  2744. Exit;
  2745. end;
  2746. top_ref:
  2747. begin
  2748. {$ifdef x86_64}
  2749. { Look for the following to simplify:
  2750. mov x(mem1), %reg
  2751. mov %reg, y(mem2)
  2752. mov x+8(mem1), %reg
  2753. mov %reg, y+8(mem2)
  2754. Change to:
  2755. movdqu x(mem1), %xmmreg
  2756. movdqu %xmmreg, y(mem2)
  2757. }
  2758. SourceRef := taicpu(p).oper[0]^.ref^;
  2759. TargetRef := taicpu(hp1).oper[1]^.ref^;
  2760. if (taicpu(p).opsize = S_Q) and
  2761. GetNextInstruction(hp1, hp2) and
  2762. MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
  2763. MatchOpType(taicpu(hp2), top_ref, top_reg) then
  2764. begin
  2765. { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
  2766. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  2767. Inc(SourceRef.offset, 8);
  2768. if UseAVX then
  2769. begin
  2770. MovAligned := A_VMOVDQA;
  2771. MovUnaligned := A_VMOVDQU;
  2772. end
  2773. else
  2774. begin
  2775. MovAligned := A_MOVDQA;
  2776. MovUnaligned := A_MOVDQU;
  2777. end;
  2778. if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
  2779. begin
  2780. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  2781. Inc(TargetRef.offset, 8);
  2782. if GetNextInstruction(hp2, hp3) and
  2783. MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
  2784. MatchOpType(taicpu(hp3), top_reg, top_ref) and
  2785. (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
  2786. RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
  2787. not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
  2788. begin
  2789. CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
  2790. if CurrentReg <> NR_NO then
  2791. begin
  2792. { Remember that the offsets are 8 ahead }
  2793. if ((SourceRef.offset mod 16) = 8) and
  2794. (
  2795. { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
  2796. (SourceRef.base = current_procinfo.framepointer) or
  2797. ((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
  2798. ) then
  2799. taicpu(p).opcode := MovAligned
  2800. else
  2801. taicpu(p).opcode := MovUnaligned;
  2802. taicpu(p).opsize := S_XMM;
  2803. taicpu(p).oper[1]^.reg := CurrentReg;
  2804. if ((TargetRef.offset mod 16) = 8) and
  2805. (
  2806. { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
  2807. (TargetRef.base = current_procinfo.framepointer) or
  2808. ((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
  2809. ) then
  2810. taicpu(hp1).opcode := MovAligned
  2811. else
  2812. taicpu(hp1).opcode := MovUnaligned;
  2813. taicpu(hp1).opsize := S_XMM;
  2814. taicpu(hp1).oper[0]^.reg := CurrentReg;
  2815. DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)', p);
  2816. RemoveInstruction(hp2);
  2817. RemoveInstruction(hp3);
  2818. Result := True;
  2819. Exit;
  2820. end;
  2821. end;
  2822. end
  2823. else
  2824. begin
  2825. { See if the next references are 8 less rather than 8 greater }
  2826. Dec(SourceRef.offset, 16); { -8 the other way }
  2827. if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
  2828. begin
  2829. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  2830. Dec(TargetRef.offset, 8); { Only 8, not 16, as it wasn't incremented unlike SourceRef }
  2831. if GetNextInstruction(hp2, hp3) and
  2832. MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
  2833. MatchOpType(taicpu(hp3), top_reg, top_ref) and
  2834. (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
  2835. RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
  2836. not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
  2837. begin
  2838. CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
  2839. if CurrentReg <> NR_NO then
  2840. begin
  2841. { hp2 and hp3 are the starting offsets, so mod = 0 this time }
  2842. if ((SourceRef.offset mod 16) = 0) and
  2843. (
  2844. { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
  2845. (SourceRef.base = current_procinfo.framepointer) or
  2846. ((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
  2847. ) then
  2848. taicpu(hp2).opcode := MovAligned
  2849. else
  2850. taicpu(hp2).opcode := MovUnaligned;
  2851. taicpu(hp2).opsize := S_XMM;
  2852. taicpu(hp2).oper[1]^.reg := CurrentReg;
  2853. if ((TargetRef.offset mod 16) = 0) and
  2854. (
  2855. { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
  2856. (TargetRef.base = current_procinfo.framepointer) or
  2857. ((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
  2858. ) then
  2859. taicpu(hp3).opcode := MovAligned
  2860. else
  2861. taicpu(hp3).opcode := MovUnaligned;
  2862. taicpu(hp3).opsize := S_XMM;
  2863. taicpu(hp3).oper[0]^.reg := CurrentReg;
  2864. DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 2)', p);
  2865. RemoveInstruction(hp1);
  2866. RemoveCurrentP(p, hp2);
  2867. Result := True;
  2868. Exit;
  2869. end;
  2870. end;
  2871. end;
  2872. end;
  2873. end;
  2874. {$endif x86_64}
  2875. end;
  2876. else
  2877. { The write target should be a reg or a ref }
  2878. InternalError(2021091601);
  2879. end;
  2880. else
  2881. ;
  2882. end
  2883. else
  2884. { %treg is used afterwards, but all eventualities
  2885. other than the first MOV instruction being a constant
  2886. are covered by DeepMOVOpt, so only check for that }
  2887. if (taicpu(p).oper[0]^.typ = top_const) and
  2888. (
  2889. { For MOV operations, a size saving is only made if the register/const is byte-sized }
  2890. not (cs_opt_size in current_settings.optimizerswitches) or
  2891. (taicpu(hp1).opsize = S_B)
  2892. ) and
  2893. (
  2894. (taicpu(hp1).oper[1]^.typ = top_reg) or
  2895. ((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
  2896. ) then
  2897. begin
  2898. DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
  2899. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  2900. end;
  2901. end;
  2902. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2903. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2904. { mov reg1, mem1 or mov mem1, reg1
  2905. mov mem2, reg2 mov reg2, mem2}
  2906. begin
  2907. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2908. { mov reg1, mem1 or mov mem1, reg1
  2909. mov mem2, reg1 mov reg2, mem1}
  2910. begin
  2911. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2912. { Removes the second statement from
  2913. mov reg1, mem1/reg2
  2914. mov mem1/reg2, reg1 }
  2915. begin
  2916. if taicpu(p).oper[0]^.typ=top_reg then
  2917. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2918. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
  2919. RemoveInstruction(hp1);
  2920. Result:=true;
  2921. exit;
  2922. end
  2923. else
  2924. begin
  2925. TransferUsedRegs(TmpUsedRegs);
  2926. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2927. if (taicpu(p).oper[1]^.typ = top_ref) and
  2928. { mov reg1, mem1
  2929. mov mem2, reg1 }
  2930. (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
  2931. GetNextInstruction(hp1, hp2) and
  2932. MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
  2933. OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2934. OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
  2935. not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
  2936. { change to
  2937. mov reg1, mem1 mov reg1, mem1
  2938. mov mem2, reg1 cmp reg1, mem2
  2939. cmp mem1, reg1
  2940. }
  2941. begin
  2942. RemoveInstruction(hp2);
  2943. taicpu(hp1).opcode := A_CMP;
  2944. taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
  2945. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2946. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2947. DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
  2948. end;
  2949. end;
  2950. end
  2951. else if (taicpu(p).oper[1]^.typ=top_ref) and
  2952. OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2953. begin
  2954. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2955. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2956. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
  2957. end
  2958. else
  2959. begin
  2960. TransferUsedRegs(TmpUsedRegs);
  2961. if GetNextInstruction(hp1, hp2) and
  2962. MatchOpType(taicpu(p),top_ref,top_reg) and
  2963. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2964. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2965. MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
  2966. MatchOpType(taicpu(hp2),top_ref,top_reg) and
  2967. RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2968. if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
  2969. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
  2970. { mov mem1, %reg1
  2971. mov %reg1, mem2
  2972. mov mem2, reg2
  2973. to:
  2974. mov mem1, reg2
  2975. mov reg2, mem2}
  2976. begin
  2977. AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
  2978. DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
  2979. taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
  2980. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  2981. RemoveInstruction(hp2);
  2982. Result := True;
  2983. end
  2984. {$ifdef i386}
  2985. { this is enabled for i386 only, as the rules to create the reg sets below
  2986. are too complicated for x86-64, so this makes this code too error prone
  2987. on x86-64
  2988. }
  2989. else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
  2990. not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
  2991. not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
  2992. { mov mem1, reg1 mov mem1, reg1
  2993. mov reg1, mem2 mov reg1, mem2
  2994. mov mem2, reg2 mov mem2, reg1
  2995. to: to:
  2996. mov mem1, reg1 mov mem1, reg1
  2997. mov mem1, reg2 mov reg1, mem2
  2998. mov reg1, mem2
  2999. or (if mem1 depends on reg1
  3000. and/or if mem2 depends on reg2)
  3001. to:
  3002. mov mem1, reg1
  3003. mov reg1, mem2
  3004. mov reg1, reg2
  3005. }
  3006. begin
  3007. taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
  3008. taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
  3009. taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
  3010. taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
  3011. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  3012. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
  3013. (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  3014. AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
  3015. if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  3016. (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  3017. AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
  3018. end
  3019. else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
  3020. begin
  3021. taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
  3022. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  3023. end
  3024. else
  3025. begin
  3026. RemoveInstruction(hp2);
  3027. end
  3028. {$endif i386}
  3029. ;
  3030. end;
  3031. end
  3032. { movl [mem1],reg1
  3033. movl [mem1],reg2
  3034. to
  3035. movl [mem1],reg1
  3036. movl reg1,reg2
  3037. }
  3038. else if MatchOpType(taicpu(p),top_ref,top_reg) and
  3039. MatchOpType(taicpu(hp1),top_ref,top_reg) and
  3040. (taicpu(p).opsize = taicpu(hp1).opsize) and
  3041. RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
  3042. (taicpu(p).oper[0]^.ref^.volatility=[]) and
  3043. (taicpu(hp1).oper[0]^.ref^.volatility=[]) and
  3044. not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
  3045. not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
  3046. begin
  3047. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
  3048. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
  3049. end;
  3050. { movl const1,[mem1]
  3051. movl [mem1],reg1
  3052. to
  3053. movl const1,reg1
  3054. movl reg1,[mem1]
  3055. }
  3056. if MatchOpType(Taicpu(p),top_const,top_ref) and
  3057. MatchOpType(Taicpu(hp1),top_ref,top_reg) and
  3058. (taicpu(p).opsize = taicpu(hp1).opsize) and
  3059. RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
  3060. not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
  3061. begin
  3062. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  3063. taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
  3064. taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
  3065. taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
  3066. taicpu(hp1).fileinfo := taicpu(p).fileinfo;
  3067. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
  3068. Result:=true;
  3069. exit;
  3070. end;
  3071. { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
  3072. end;
  3073. { search further than the next instruction for a mov (as long as it's not a jump) }
  3074. if not is_calljmpuncondret(taicpu(hp1).opcode) and
  3075. { check as much as possible before the expensive GetNextInstructionUsingRegCond call }
  3076. (taicpu(p).oper[1]^.typ = top_reg) and
  3077. (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
  3078. not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
  3079. begin
  3080. { we work with hp2 here, so hp1 can be still used later on when
  3081. checking for GetNextInstruction_p }
  3082. hp3 := hp1;
  3083. { Initialise CrossJump (if it becomes True at any point, it will remain True) }
  3084. CrossJump := (taicpu(hp1).opcode = A_Jcc);
  3085. { Saves on a large number of dereferences }
  3086. ActiveReg := taicpu(p).oper[1]^.reg;
  3087. TransferUsedRegs(TmpUsedRegs);
  3088. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  3089. while GetNextInstructionUsingRegCond(hp3,hp2,ActiveReg,CrossJump) and
  3090. { GetNextInstructionUsingRegCond only searches one instruction ahead unless -O3 is specified }
  3091. (hp2.typ=ait_instruction) do
  3092. begin
  3093. case taicpu(hp2).opcode of
  3094. A_POP:
  3095. if MatchOperand(taicpu(hp2).oper[0]^,ActiveReg) then
  3096. begin
  3097. if not CrossJump and
  3098. not RegUsedBetween(ActiveReg, p, hp2) then
  3099. begin
  3100. { We can remove the original MOV since the register
  3101. wasn't used between it and its popping from the stack }
  3102. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3c done',p);
  3103. RemoveCurrentp(p, hp1);
  3104. Result := True;
  3105. Exit;
  3106. end;
  3107. { Can't go any further }
  3108. Break;
  3109. end;
  3110. A_MOV:
  3111. if MatchOperand(taicpu(hp2).oper[0]^,ActiveReg) and
  3112. ((taicpu(p).oper[0]^.typ=top_const) or
  3113. ((taicpu(p).oper[0]^.typ=top_reg) and
  3114. not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2))
  3115. )
  3116. ) then
  3117. begin
  3118. { we have
  3119. mov x, %treg
  3120. mov %treg, y
  3121. }
  3122. { We don't need to call UpdateUsedRegs for every instruction between
  3123. p and hp2 because the register we're concerned about will not
  3124. become deallocated (otherwise GetNextInstructionUsingReg would
  3125. have stopped at an earlier instruction). [Kit] }
  3126. TempRegUsed :=
  3127. CrossJump { Assume the register is in use if it crossed a conditional jump } or
  3128. RegReadByInstruction(ActiveReg, hp3) or
  3129. RegUsedAfterInstruction(ActiveReg, hp2, TmpUsedRegs);
  3130. case taicpu(p).oper[0]^.typ Of
  3131. top_reg:
  3132. begin
  3133. { change
  3134. mov %reg, %treg
  3135. mov %treg, y
  3136. to
  3137. mov %reg, y
  3138. }
  3139. CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
  3140. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  3141. if MatchOperand(taicpu(hp2).oper[1]^, CurrentReg) then
  3142. begin
  3143. { %reg = y - remove hp2 completely (doing it here instead of relying on
  3144. the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
  3145. if TempRegUsed then
  3146. begin
  3147. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2);
  3148. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  3149. { Set the start of the next GetNextInstructionUsingRegCond search
  3150. to start at the entry right before hp2 (which is about to be removed) }
  3151. hp3 := tai(hp2.Previous);
  3152. RemoveInstruction(hp2);
  3153. { See if there's more we can optimise }
  3154. Continue;
  3155. end
  3156. else
  3157. begin
  3158. RemoveInstruction(hp2);
  3159. { We can remove the original MOV too }
  3160. DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
  3161. RemoveCurrentP(p, hp1);
  3162. Result:=true;
  3163. Exit;
  3164. end;
  3165. end
  3166. else
  3167. begin
  3168. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  3169. taicpu(hp2).loadReg(0, CurrentReg);
  3170. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2);
  3171. { Check to see if the register also appears in the reference }
  3172. if (taicpu(hp2).oper[1]^.typ = top_ref) then
  3173. ReplaceRegisterInRef(taicpu(hp2).oper[1]^.ref^, ActiveReg, CurrentReg);
  3174. { Don't remove the first instruction if the temporary register is in use }
  3175. if not TempRegUsed and
  3176. { ReplaceRegisterInRef won't actually replace the register if it's a different size }
  3177. not RegInOp(ActiveReg, taicpu(hp2).oper[1]^) then
  3178. begin
  3179. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
  3180. RemoveCurrentP(p, hp1);
  3181. Result:=true;
  3182. Exit;
  3183. end;
  3184. { No need to set Result to True here. If there's another instruction later
  3185. on that can be optimised, it will be detected when the main Pass 1 loop
  3186. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
  3187. end;
  3188. end;
  3189. top_const:
  3190. if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
  3191. begin
  3192. { change
  3193. mov const, %treg
  3194. mov %treg, y
  3195. to
  3196. mov const, y
  3197. }
  3198. if (taicpu(hp2).oper[1]^.typ=top_reg) or
  3199. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  3200. begin
  3201. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  3202. taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
  3203. if TempRegUsed then
  3204. begin
  3205. { Don't remove the first instruction if the temporary register is in use }
  3206. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
  3207. { No need to set Result to True. If there's another instruction later on
  3208. that can be optimised, it will be detected when the main Pass 1 loop
  3209. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  3210. end
  3211. else
  3212. begin
  3213. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
  3214. RemoveCurrentP(p, hp1);
  3215. Result:=true;
  3216. Exit;
  3217. end;
  3218. end;
  3219. end;
  3220. else
  3221. Internalerror(2019103001);
  3222. end;
  3223. end
  3224. else
  3225. if MatchOperand(taicpu(hp2).oper[1]^, ActiveReg) then
  3226. begin
  3227. if not CrossJump and
  3228. not RegUsedBetween(ActiveReg, p, hp2) and
  3229. not RegReadByInstruction(ActiveReg, hp2) then
  3230. begin
  3231. { Register is not used before it is overwritten }
  3232. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3a done',p);
  3233. RemoveCurrentp(p, hp1);
  3234. Result := True;
  3235. Exit;
  3236. end;
  3237. if (taicpu(p).oper[0]^.typ = top_const) and
  3238. (taicpu(hp2).oper[0]^.typ = top_const) then
  3239. begin
  3240. if taicpu(p).oper[0]^.val = taicpu(hp2).oper[0]^.val then
  3241. begin
  3242. { Same value - register hasn't changed }
  3243. DebugMsg(SPeepholeOptimization + 'Mov2Nop 2 done', hp2);
  3244. RemoveInstruction(hp2);
  3245. Result := True;
  3246. { See if there's more we can optimise }
  3247. Continue;
  3248. end;
  3249. end;
  3250. end;
  3251. A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
  3252. if MatchOpType(taicpu(hp2), top_reg, top_reg) and
  3253. MatchOperand(taicpu(hp2).oper[0]^, ActiveReg) and
  3254. SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, ActiveReg) then
  3255. begin
  3256. {
  3257. Change from:
  3258. mov ###, %reg
  3259. ...
  3260. movs/z %reg,%reg (Same register, just different sizes)
  3261. To:
  3262. movs/z ###, %reg (Longer version)
  3263. ...
  3264. (remove)
  3265. }
  3266. DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
  3267. taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
  3268. { Keep the first instruction as mov if ### is a constant }
  3269. if taicpu(p).oper[0]^.typ = top_const then
  3270. taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
  3271. else
  3272. begin
  3273. taicpu(p).opcode := taicpu(hp2).opcode;
  3274. taicpu(p).opsize := taicpu(hp2).opsize;
  3275. end;
  3276. DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
  3277. AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
  3278. RemoveInstruction(hp2);
  3279. Result := True;
  3280. Exit;
  3281. end;
  3282. else
  3283. { Move down to the MatchOpType if-block below };
  3284. end;
  3285. { Also catches MOV/S/Z instructions that aren't modified }
  3286. if taicpu(p).oper[0]^.typ = top_reg then
  3287. begin
  3288. CurrentReg := taicpu(p).oper[0]^.reg;
  3289. if
  3290. not RegModifiedByInstruction(CurrentReg, hp3) and
  3291. not RegModifiedBetween(CurrentReg, hp3, hp2) and
  3292. DeepMOVOpt(taicpu(p), taicpu(hp2)) then
  3293. begin
  3294. Result := True;
  3295. { Just in case something didn't get modified (e.g. an
  3296. implicit register). Also, if it does read from this
  3297. register, then there's no longer an advantage to
  3298. changing the register on subsequent instructions.}
  3299. if not RegReadByInstruction(ActiveReg, hp2) then
  3300. begin
  3301. { If a conditional jump was crossed, do not delete
  3302. the original MOV no matter what }
  3303. if not CrossJump and
  3304. { RegEndOfLife returns True if the register is
  3305. deallocated before the next instruction or has
  3306. been loaded with a new value }
  3307. RegEndOfLife(ActiveReg, taicpu(hp2)) then
  3308. begin
  3309. { We can remove the original MOV }
  3310. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3b done',p);
  3311. RemoveCurrentp(p, hp1);
  3312. Exit;
  3313. end;
  3314. if not RegModifiedByInstruction(ActiveReg, hp2) then
  3315. begin
  3316. { See if there's more we can optimise }
  3317. hp3 := hp2;
  3318. Continue;
  3319. end;
  3320. end;
  3321. end;
  3322. end;
  3323. { Break out of the while loop under normal circumstances }
  3324. Break;
  3325. end;
  3326. end;
  3327. if (aoc_MovAnd2Mov_3 in OptsToCheck) and
  3328. (taicpu(p).oper[1]^.typ = top_reg) and
  3329. (taicpu(p).opsize = S_L) and
  3330. GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
  3331. (taicpu(hp2).opcode = A_AND) and
  3332. (MatchOpType(taicpu(hp2),top_const,top_reg) or
  3333. (MatchOpType(taicpu(hp2),top_reg,top_reg) and
  3334. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
  3335. ) then
  3336. begin
  3337. if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
  3338. begin
  3339. if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
  3340. ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
  3341. begin
  3342. { Optimize out:
  3343. mov x, %reg
  3344. and ffffffffh, %reg
  3345. }
  3346. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
  3347. RemoveInstruction(hp2);
  3348. Result:=true;
  3349. exit;
  3350. end;
  3351. end;
  3352. end;
  3353. { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
  3354. x >= RetOffset) as it doesn't do anything (it writes either to a
  3355. parameter or to the temporary storage room for the function
  3356. result)
  3357. }
  3358. if IsExitCode(hp1) and
  3359. (taicpu(p).oper[1]^.typ = top_ref) and
  3360. (taicpu(p).oper[1]^.ref^.index = NR_NO) and
  3361. (
  3362. (
  3363. (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
  3364. not (
  3365. assigned(current_procinfo.procdef.funcretsym) and
  3366. (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
  3367. )
  3368. ) or
  3369. { Also discard writes to the stack that are below the base pointer,
  3370. as this is temporary storage rather than a function result on the
  3371. stack, say. }
  3372. (
  3373. (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
  3374. (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
  3375. )
  3376. ) then
  3377. begin
  3378. RemoveCurrentp(p, hp1);
  3379. DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
  3380. RemoveLastDeallocForFuncRes(p);
  3381. Result:=true;
  3382. exit;
  3383. end;
  3384. if MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) then
  3385. begin
  3386. if MatchOpType(taicpu(p),top_reg,top_ref) and
  3387. (taicpu(hp1).oper[1]^.typ = top_ref) and
  3388. RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  3389. begin
  3390. { change
  3391. mov reg1, mem1
  3392. test/cmp x, mem1
  3393. to
  3394. mov reg1, mem1
  3395. test/cmp x, reg1
  3396. }
  3397. taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
  3398. DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
  3399. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  3400. Result := True;
  3401. Exit;
  3402. end;
  3403. if MatchOpType(taicpu(p),top_ref,top_reg) and
  3404. { The x86 assemblers have difficulty comparing values against absolute addresses }
  3405. (taicpu(p).oper[0]^.ref^.refaddr in [addr_no, addr_pic, addr_pic_no_got]) and
  3406. (taicpu(hp1).oper[0]^.typ <> top_ref) and
  3407. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
  3408. (
  3409. (
  3410. (taicpu(hp1).opcode = A_TEST)
  3411. ) or (
  3412. (taicpu(hp1).opcode = A_CMP) and
  3413. { A sanity check more than anything }
  3414. not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
  3415. )
  3416. ) then
  3417. begin
  3418. { change
  3419. mov mem, %reg
  3420. cmp/test x, %reg / test %reg,%reg
  3421. (reg deallocated)
  3422. to
  3423. cmp/test x, mem / cmp 0, mem
  3424. }
  3425. TransferUsedRegs(TmpUsedRegs);
  3426. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  3427. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
  3428. begin
  3429. { Convert test %reg,%reg or test $-1,%reg to cmp $0,mem }
  3430. if (taicpu(hp1).opcode = A_TEST) and
  3431. (
  3432. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
  3433. MatchOperand(taicpu(hp1).oper[0]^, -1)
  3434. ) then
  3435. begin
  3436. taicpu(hp1).opcode := A_CMP;
  3437. taicpu(hp1).loadconst(0, 0);
  3438. end;
  3439. taicpu(hp1).loadref(1, taicpu(p).oper[0]^.ref^);
  3440. DebugMsg(SPeepholeOptimization + 'MOV/CMP -> CMP (memory check)', p);
  3441. RemoveCurrentP(p, hp1);
  3442. Result := True;
  3443. Exit;
  3444. end;
  3445. end;
  3446. end;
  3447. if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
  3448. { If the flags register is in use, don't change the instruction to an
  3449. ADD otherwise this will scramble the flags. [Kit] }
  3450. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
  3451. begin
  3452. if MatchOpType(Taicpu(p),top_ref,top_reg) and
  3453. ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
  3454. (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
  3455. ) or
  3456. (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
  3457. (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
  3458. )
  3459. ) then
  3460. { mov reg1,ref
  3461. lea reg2,[reg1,reg2]
  3462. to
  3463. add reg2,ref}
  3464. begin
  3465. TransferUsedRegs(TmpUsedRegs);
  3466. { reg1 may not be used afterwards }
  3467. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
  3468. begin
  3469. Taicpu(hp1).opcode:=A_ADD;
  3470. Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
  3471. DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
  3472. RemoveCurrentp(p, hp1);
  3473. result:=true;
  3474. exit;
  3475. end;
  3476. end;
  3477. { If the LEA instruction can be converted into an arithmetic instruction,
  3478. it may be possible to then fold it in the next optimisation, otherwise
  3479. there's nothing more that can be optimised here. }
  3480. if not ConvertLEA(taicpu(hp1)) then
  3481. Exit;
  3482. end;
  3483. if (taicpu(p).oper[1]^.typ = top_reg) and
  3484. (hp1.typ = ait_instruction) and
  3485. GetNextInstruction(hp1, hp2) and
  3486. MatchInstruction(hp2,A_MOV,[]) and
  3487. (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
  3488. (topsize2memsize[taicpu(hp1).opsize]>=topsize2memsize[taicpu(hp2).opsize]) and
  3489. (
  3490. IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
  3491. {$ifdef x86_64}
  3492. or
  3493. (
  3494. (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
  3495. IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
  3496. )
  3497. {$endif x86_64}
  3498. ) then
  3499. begin
  3500. if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
  3501. (taicpu(hp2).oper[0]^.typ=top_reg) then
  3502. { change movsX/movzX reg/ref, reg2
  3503. add/sub/or/... reg3/$const, reg2
  3504. mov reg2 reg/ref
  3505. dealloc reg2
  3506. to
  3507. add/sub/or/... reg3/$const, reg/ref }
  3508. begin
  3509. TransferUsedRegs(TmpUsedRegs);
  3510. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  3511. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  3512. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  3513. begin
  3514. { by example:
  3515. movswl %si,%eax movswl %si,%eax p
  3516. decl %eax addl %edx,%eax hp1
  3517. movw %ax,%si movw %ax,%si hp2
  3518. ->
  3519. movswl %si,%eax movswl %si,%eax p
  3520. decw %eax addw %edx,%eax hp1
  3521. movw %ax,%si movw %ax,%si hp2
  3522. }
  3523. DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
  3524. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  3525. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  3526. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  3527. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  3528. {
  3529. ->
  3530. movswl %si,%eax movswl %si,%eax p
  3531. decw %si addw %dx,%si hp1
  3532. movw %ax,%si movw %ax,%si hp2
  3533. }
  3534. case taicpu(hp1).ops of
  3535. 1:
  3536. begin
  3537. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  3538. if taicpu(hp1).oper[0]^.typ=top_reg then
  3539. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  3540. end;
  3541. 2:
  3542. begin
  3543. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  3544. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  3545. (taicpu(hp1).opcode<>A_SHL) and
  3546. (taicpu(hp1).opcode<>A_SHR) and
  3547. (taicpu(hp1).opcode<>A_SAR) then
  3548. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  3549. end;
  3550. else
  3551. internalerror(2008042701);
  3552. end;
  3553. {
  3554. ->
  3555. decw %si addw %dx,%si p
  3556. }
  3557. RemoveInstruction(hp2);
  3558. RemoveCurrentP(p, hp1);
  3559. Result:=True;
  3560. Exit;
  3561. end;
  3562. end;
  3563. if MatchOpType(taicpu(hp2),top_reg,top_reg) and
  3564. not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
  3565. ((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
  3566. { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
  3567. ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
  3568. )
  3569. {$ifdef i386}
  3570. { byte registers of esi, edi, ebp, esp are not available on i386 }
  3571. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  3572. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  3573. {$endif i386}
  3574. then
  3575. { change movsX/movzX reg/ref, reg2
  3576. add/sub/or/... regX/$const, reg2
  3577. mov reg2, reg3
  3578. dealloc reg2
  3579. to
  3580. movsX/movzX reg/ref, reg3
  3581. add/sub/or/... reg3/$const, reg3
  3582. }
  3583. begin
  3584. TransferUsedRegs(TmpUsedRegs);
  3585. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  3586. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  3587. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  3588. begin
  3589. { by example:
  3590. movswl %si,%eax movswl %si,%eax p
  3591. decl %eax addl %edx,%eax hp1
  3592. movw %ax,%si movw %ax,%si hp2
  3593. ->
  3594. movswl %si,%eax movswl %si,%eax p
  3595. decw %eax addw %edx,%eax hp1
  3596. movw %ax,%si movw %ax,%si hp2
  3597. }
  3598. DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
  3599. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  3600. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  3601. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  3602. { limit size of constants as well to avoid assembler errors, but
  3603. check opsize to avoid overflow when left shifting the 1 }
  3604. if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
  3605. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
  3606. {$ifdef x86_64}
  3607. { Be careful of, for example:
  3608. movl %reg1,%reg2
  3609. addl %reg3,%reg2
  3610. movq %reg2,%reg4
  3611. This will cause problems if the upper 32-bits of %reg3 or %reg4 are non-zero
  3612. }
  3613. if (taicpu(hp1).opsize = S_L) and (taicpu(hp2).opsize = S_Q) then
  3614. begin
  3615. taicpu(hp2).changeopsize(S_L);
  3616. setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBD);
  3617. setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
  3618. end;
  3619. {$endif x86_64}
  3620. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  3621. taicpu(p).changeopsize(taicpu(hp2).opsize);
  3622. if taicpu(p).oper[0]^.typ=top_reg then
  3623. setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  3624. taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
  3625. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
  3626. {
  3627. ->
  3628. movswl %si,%eax movswl %si,%eax p
  3629. decw %si addw %dx,%si hp1
  3630. movw %ax,%si movw %ax,%si hp2
  3631. }
  3632. case taicpu(hp1).ops of
  3633. 1:
  3634. begin
  3635. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  3636. if taicpu(hp1).oper[0]^.typ=top_reg then
  3637. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  3638. end;
  3639. 2:
  3640. begin
  3641. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  3642. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  3643. (taicpu(hp1).opcode<>A_SHL) and
  3644. (taicpu(hp1).opcode<>A_SHR) and
  3645. (taicpu(hp1).opcode<>A_SAR) then
  3646. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  3647. end;
  3648. else
  3649. internalerror(2018111801);
  3650. end;
  3651. {
  3652. ->
  3653. decw %si addw %dx,%si p
  3654. }
  3655. RemoveInstruction(hp2);
  3656. end;
  3657. end;
  3658. end;
  3659. if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
  3660. GetNextInstruction(hp1, hp2) and
  3661. MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
  3662. MatchOperand(Taicpu(p).oper[0]^,0) and
  3663. (Taicpu(p).oper[1]^.typ = top_reg) and
  3664. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
  3665. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
  3666. { mov reg1,0
  3667. bts reg1,operand1 --> mov reg1,operand2
  3668. or reg1,operand2 bts reg1,operand1}
  3669. begin
  3670. Taicpu(hp2).opcode:=A_MOV;
  3671. DebugMsg(SPeepholeOptimization + 'MovBtsOr2MovBts done',hp1);
  3672. asml.remove(hp1);
  3673. insertllitem(hp2,hp2.next,hp1);
  3674. RemoveCurrentp(p, hp1);
  3675. Result:=true;
  3676. exit;
  3677. end;
  3678. {
  3679. mov ref,reg0
  3680. <op> reg0,reg1
  3681. dealloc reg0
  3682. to
  3683. <op> ref,reg1
  3684. }
  3685. if MatchOpType(taicpu(p),top_ref,top_reg) and
  3686. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  3687. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  3688. MatchInstruction(hp1,[A_AND,A_OR,A_XOR,A_ADD,A_SUB,A_CMP],[Taicpu(p).opsize]) and
  3689. not(MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^)) and
  3690. RegEndOfLife(taicpu(p).oper[1]^.reg,taicpu(hp1)) then
  3691. begin
  3692. taicpu(hp1).loadoper(0,taicpu(p).oper[0]^);
  3693. DebugMsg(SPeepholeOptimization + 'MovOp2Op done',hp1);
  3694. RemoveCurrentp(p, hp1);
  3695. Result:=true;
  3696. exit;
  3697. end;
  3698. {$ifdef x86_64}
  3699. { Convert:
  3700. movq x(ref),%reg64
  3701. shrq y,%reg64
  3702. To:
  3703. movq x+4(ref),%reg32
  3704. shrq y-32,%reg32 (Remove if y = 32)
  3705. }
  3706. if (taicpu(p).opsize = S_Q) and
  3707. (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
  3708. (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFB) and
  3709. MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
  3710. MatchOpType(taicpu(hp1), top_const, top_reg) and
  3711. (taicpu(hp1).oper[0]^.val >= 32) and
  3712. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  3713. begin
  3714. RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
  3715. PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
  3716. 'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
  3717. { Convert to 32-bit }
  3718. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  3719. taicpu(p).opsize := S_L;
  3720. Inc(taicpu(p).oper[0]^.ref^.offset, 4);
  3721. PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
  3722. if (taicpu(hp1).oper[0]^.val = 32) then
  3723. begin
  3724. DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
  3725. RemoveInstruction(hp1);
  3726. end
  3727. else
  3728. begin
  3729. { This will potentially open up more arithmetic operations since
  3730. the peephole optimizer now has a big hint that only the lower
  3731. 32 bits are currently in use (and opcodes are smaller in size) }
  3732. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
  3733. taicpu(hp1).opsize := S_L;
  3734. Dec(taicpu(hp1).oper[0]^.val, 32);
  3735. DebugMsg(SPeepholeOptimization + PreMessage +
  3736. '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
  3737. end;
  3738. Result := True;
  3739. Exit;
  3740. end;
  3741. {$endif x86_64}
  3742. end;
  3743. function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
  3744. var
  3745. hp1 : tai;
  3746. begin
  3747. Result:=false;
  3748. if taicpu(p).ops <> 2 then
  3749. exit;
  3750. if ((taicpu(p).oper[1]^.typ=top_reg) and GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg)) or
  3751. GetNextInstruction(p,hp1) then
  3752. begin
  3753. if MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
  3754. (taicpu(hp1).ops = 2) then
  3755. begin
  3756. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  3757. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  3758. { movXX reg1, mem1 or movXX mem1, reg1
  3759. movXX mem2, reg2 movXX reg2, mem2}
  3760. begin
  3761. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  3762. { movXX reg1, mem1 or movXX mem1, reg1
  3763. movXX mem2, reg1 movXX reg2, mem1}
  3764. begin
  3765. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  3766. begin
  3767. { Removes the second statement from
  3768. movXX reg1, mem1/reg2
  3769. movXX mem1/reg2, reg1
  3770. }
  3771. if taicpu(p).oper[0]^.typ=top_reg then
  3772. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  3773. { Removes the second statement from
  3774. movXX mem1/reg1, reg2
  3775. movXX reg2, mem1/reg1
  3776. }
  3777. if (taicpu(p).oper[1]^.typ=top_reg) and
  3778. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
  3779. begin
  3780. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
  3781. RemoveInstruction(hp1);
  3782. RemoveCurrentp(p); { p will now be equal to the instruction that follows what was hp1 }
  3783. Result:=true;
  3784. exit;
  3785. end
  3786. else if (taicpu(hp1).oper[1]^.typ<>top_ref) or (not(vol_write in taicpu(hp1).oper[1]^.ref^.volatility)) and
  3787. (taicpu(hp1).oper[0]^.typ<>top_ref) or (not(vol_read in taicpu(hp1).oper[0]^.ref^.volatility)) then
  3788. begin
  3789. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
  3790. RemoveInstruction(hp1);
  3791. Result:=true;
  3792. exit;
  3793. end;
  3794. end
  3795. end;
  3796. end;
  3797. end;
  3798. end;
  3799. end;
  3800. function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
  3801. var
  3802. hp1 : tai;
  3803. begin
  3804. result:=false;
  3805. { replace
  3806. <Op>X %mreg1,%mreg2 // Op in [ADD,MUL]
  3807. MovX %mreg2,%mreg1
  3808. dealloc %mreg2
  3809. by
  3810. <Op>X %mreg2,%mreg1
  3811. ?
  3812. }
  3813. if GetNextInstruction(p,hp1) and
  3814. { we mix single and double opperations here because we assume that the compiler
  3815. generates vmovapd only after double operations and vmovaps only after single operations }
  3816. MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
  3817. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  3818. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
  3819. (taicpu(p).oper[0]^.typ=top_reg) then
  3820. begin
  3821. TransferUsedRegs(TmpUsedRegs);
  3822. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  3823. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  3824. begin
  3825. taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
  3826. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  3827. DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
  3828. RemoveInstruction(hp1);
  3829. result:=true;
  3830. end;
  3831. end;
  3832. end;
    function TX86AsmOptimizer.OptPass1Test(var p: tai) : boolean;
      { Pass-1 peephole handler for TEST.  Two independent optimisations:
        1. Try to swap the following instruction with this comparison via
           TrySwapMovCmp (which rearranges the instruction list itself); if
           that succeeded, report a change and restart on this position.
        2. For "test %reg,%reg; j(c1) @lbl1" where @lbl1 leads to an
           identical register self-test followed by "j(c2) @lbl2" with c1 a
           subset of c2, redirect the first jump straight to @lbl2.
        Returns True when the instruction list was changed. }
      var
        hp1, p_label, p_dist, hp1_dist: tai;
        JumpLabel, JumpLabel_dist: TAsmLabel;
      begin
        Result := False;
        { TrySwapMovCmp mutates the list as a side effect, so its success
          alone means a change was made }
        if GetNextInstruction(p, hp1) and
          TrySwapMovCmp(p, hp1) then
          begin
            Result := True;
            Exit;
          end;
        { Search for:
          test %reg,%reg
          j(c1) @lbl1
          ...
          @lbl:
          test %reg,%reg (same register)
          j(c2) @lbl2
          If c2 is a subset of c1, change to:
          test %reg,%reg
          j(c1) @lbl2
          (@lbl1 may become a dead label as a result)
        }
        { NOTE(review): hp1 is referenced below even when GetNextInstruction
          above returned False -- presumably it sets hp1 to nil in that case
          and the register-type checks short-circuit first; verify. }
        if (taicpu(p).oper[1]^.typ = top_reg) and
          (taicpu(p).oper[0]^.typ = top_reg) and
          (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
          MatchInstruction(hp1, A_JCC, []) and
          IsJumpToLabel(taicpu(hp1)) then
          begin
            JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
            p_label := nil;
            if Assigned(JumpLabel) then
              p_label := getlabelwithsym(JumpLabel);
            if Assigned(p_label) and
              GetNextInstruction(p_label, p_dist) and
              MatchInstruction(p_dist, A_TEST, []) and
              { It's fine if the second test uses smaller sub-registers }
              (taicpu(p_dist).opsize <= taicpu(p).opsize) and
              MatchOpType(taicpu(p_dist), top_reg, top_reg) and
              SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
              SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and
              GetNextInstruction(p_dist, hp1_dist) and
              MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
              begin
                JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
                if JumpLabel = JumpLabel_dist then
                  { This is an infinite loop }
                  Exit;
                { Best optimisation when the first condition is a subset (or equal) of the second }
                if condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) then
                  begin
                    { Any registers used here will already be allocated }
                    { Keep label reference counts consistent: bump the new
                      target before dropping the old one, then retarget }
                    if Assigned(JumpLabel_dist) then
                      JumpLabel_dist.IncRefs;
                    if Assigned(JumpLabel) then
                      JumpLabel.DecRefs;
                    DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1);
                    taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^);
                    Result := True;
                    Exit;
                  end;
              end;
          end;
      end;
function TX86AsmOptimizer.OptPass1Add(var p : tai) : boolean;
{ Pass-1 peephole optimisations for "add const,%reg":
  - folds the constant into a following LEA that uses %reg as base/index
  - merges with a following "add/sub const2,%reg" into one instruction
  - otherwise tries to move the constant addition after a reg/ref add/sub
    to open up further optimisation opportunities.
  Returns True (and may advance p) when a change was made. }
var
hp1, hp2: tai;
ActiveReg: TRegister;
OldOffset: asizeint;
ThisConst: TCGInt;
{ True if ActiveReg is no longer live after hp1 }
function RegDeallocated: Boolean;
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
end;
begin
result:=false;
hp1 := nil;
{ replace
addX const,%reg1
leaX (%reg1,%reg1,Y),%reg2 // Base or index might not be equal to reg1
dealloc %reg1
by
leaX const+const*Y(%reg1,%reg1,Y),%reg2
}
if MatchOpType(taicpu(p),top_const,top_reg) then
begin
ActiveReg := taicpu(p).oper[1]^.reg;
{ Ensures the entire register was updated }
if (taicpu(p).opsize >= S_L) and
GetNextInstructionUsingReg(p,hp1, ActiveReg) and
MatchInstruction(hp1,A_LEA,[]) and
(SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
(
{ Cover the case where the register in the reference is also the destination register }
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
(
{ Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
RegDeallocated
)
) then
begin
OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
{ Fold the constant into the LEA displacement; when ActiveReg is the
  index it has to be scaled by the scale factor as well }
if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{ x86 displacements are signed 32-bit; restore the old offset on overflow }
if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
begin
{ Overflow; abort }
taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
end
else
begin
DebugMsg(SPeepholeOptimization + 'AddLea2Lea done',p);
if not (cs_opt_level3 in current_settings.optimizerswitches) then
{ hp1 is the immediate next instruction for sure - good for a quick speed boost }
RemoveCurrentP(p, hp1)
else
RemoveCurrentP(p);
result:=true;
Exit;
end;
end;
if (
{ Save calling GetNextInstructionUsingReg again }
Assigned(hp1) or
GetNextInstructionUsingReg(p,hp1, ActiveReg)
) and
MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
(taicpu(hp1).oper[1]^.reg = ActiveReg) then
begin
if taicpu(hp1).oper[0]^.typ = top_const then
begin
{ Merge add const1,%reg; add/sub const2,%reg to add const1+/-const2,%reg }
if taicpu(hp1).opcode = A_ADD then
ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
else
ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;
Result := True;
{ Handle any overflows }
case taicpu(p).opsize of
S_B:
taicpu(p).oper[0]^.val := ThisConst and $FF;
S_W:
taicpu(p).oper[0]^.val := ThisConst and $FFFF;
S_L:
taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
S_Q:
if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
{ Overflow; abort }
Result := False
else
taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
else
InternalError(2021102610);
end;
{ Result may get set to False again if the combined immediate overflows for S_Q sizes }
if Result then
begin
{ Prefer "sub $x" over "add $-x" for a negative constant, except
  for the most negative value of each size, which cannot be negated }
if (taicpu(p).oper[0]^.val < 0) and
(
((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
) then
begin
DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> SUB',p);
taicpu(p).opcode := A_SUB;
taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
end
else
DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> ADD',p);
RemoveInstruction(hp1);
end;
end
else
begin
{ Make doubly sure the flags aren't in use because the order of additions may affect them }
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
hp2 := p;
while not (cs_opt_level3 in current_settings.optimizerswitches) and
GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
begin
{ Move the constant addition to after the reg/ref addition to improve optimisation }
DebugMsg(SPeepholeOptimization + 'Add/sub swap 1a done',p);
Asml.Remove(p);
Asml.InsertAfter(p, hp1);
p := hp1;
Result := True;
end;
end;
end;
end;
end;
function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
{ Pass-1 peephole optimisations for LEA:
  - strips redundant segment prefixes
  - "lea (%reg1),%reg2" -> "mov %reg1,%reg2" or a pure no-op removal
  - converts simple LEAs to ADD/SUB/INC/DEC when the flags are free (ConvertLEA)
  - folds a LEA into a following MOV or into a following instruction's
    memory reference, and merges LEA/LEA and LEA/SHL pairs.
  Returns True (and may advance p) when a change was made. }
var
hp1: tai;
ref: Integer;
saveref: treference;
TempReg: TRegister;
Multiple: TCGInt;
begin
Result:=false;
{ removes seg register prefixes from LEA operations, as they
don't do anything}
taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
{ changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
(taicpu(p).oper[0]^.ref^.index = NR_NO) and
(
{ do not mess with leas accessing the stack pointer
unless it's a null operation }
(taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) or
(
(taicpu(p).oper[0]^.ref^.base = NR_STACK_POINTER_REG) and
(taicpu(p).oper[0]^.ref^.offset = 0)
)
) and
(not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
begin
if (taicpu(p).oper[0]^.ref^.offset = 0) then
begin
if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
begin
hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
taicpu(p).oper[1]^.reg);
InsertLLItem(p.previous,p.next, hp1);
DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
p.free;
p:=hp1;
end
else
begin
{ "lea (%reg),%reg" does nothing at all }
DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
RemoveCurrentP(p);
end;
Result:=true;
exit;
end
else if (
{ continue to use lea to adjust the stack pointer,
it is the recommended way, but only if not optimizing for size }
(taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
(cs_opt_size in current_settings.optimizerswitches)
) and
{ If the flags register is in use, don't change the instruction
to an ADD otherwise this will scramble the flags. [Kit] }
not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
ConvertLEA(taicpu(p)) then
begin
Result:=true;
exit;
end;
end;
if GetNextInstruction(p,hp1) and
(hp1.typ=ait_instruction) then
begin
{ "lea ...,%reg1; mov %reg1,%reg2" -> "lea ...,%reg2" when %reg1 dies }
if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
MatchOpType(Taicpu(hp1),top_reg,top_reg) and
(taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
begin
taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
RemoveInstruction(hp1);
result:=true;
exit;
end;
end;
{ changes
lea <ref1>, reg1
<op> ...,<ref. with reg1>,...
to
<op> ...,<ref1>,... }
if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
(taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
not(MatchInstruction(hp1,A_LEA,[])) then
begin
{ find a reference which uses reg1 }
if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
ref:=0
else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
ref:=1
else
ref:=-1;
if (ref<>-1) and
{ reg1 must be either the base or the index }
((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
begin
{ reg1 can be removed from the reference }
saveref:=taicpu(hp1).oper[ref]^.ref^;
if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
else
Internalerror(2019111201);
{ check if we can insert all data of the lea into the second instruction }
if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
(taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
)
{$endif x86_64}
then
begin
{ reg1 might not be used by the second instruction after it is removed from the reference }
if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
{ reg1 is not updated so it might not be used afterwards }
if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
if taicpu(p).oper[0]^.ref^.base<>NR_NO then
taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
if taicpu(p).oper[0]^.ref^.index<>NR_NO then
taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
if taicpu(p).oper[0]^.ref^.symbol<>nil then
taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
RemoveCurrentP(p, hp1);
result:=true;
exit;
end
end;
end;
{ recover }
taicpu(hp1).oper[ref]^.ref^:=saveref;
end;
end;
end;
{ for now, we do not mess with the stack pointer, though it might be useful to remove
unneeded lea sequences on the stack pointer, it needs to be tested in detail }
if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
begin
{ Check common LEA/LEA conditions }
if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
(taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
(taicpu(p).oper[0]^.ref^.relsymbol = nil) and
(taicpu(p).oper[0]^.ref^.segment = NR_NO) and
(taicpu(p).oper[0]^.ref^.symbol = nil) and
(taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
(taicpu(hp1).oper[0]^.ref^.segment = NR_NO) and
(taicpu(hp1).oper[0]^.ref^.symbol = nil) and
(
(taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
) and (
(taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
(taicpu(p).oper[0]^.ref^.index = NR_NO) or
not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
) then
begin
{ changes
lea (regX,scale), reg1
lea offset(reg1,reg1), reg1
to
lea offset(regX,scale*2), reg1
and
lea (regX,scale1), reg1
lea offset(reg1,scale2), reg1
to
lea offset(regX,scale1*scale2), reg1
... so long as the final scale does not exceed 8
(Similarly, allow the first instruction to be "lea (regX,regX),reg1")
}
if (taicpu(p).oper[0]^.ref^.offset = 0) and
(taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
(
(
(taicpu(p).oper[0]^.ref^.base = NR_NO)
) or (
(taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
(
(taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index, p, hp1))
)
)
) and (
(
{ lea (reg1,scale2), reg1 variant }
(taicpu(hp1).oper[0]^.ref^.base = NR_NO) and
(
(
(taicpu(p).oper[0]^.ref^.base = NR_NO) and
(taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
) or (
{ lea (regX,regX), reg1 variant }
(taicpu(p).oper[0]^.ref^.base <> NR_NO) and
(taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
)
)
) or (
{ lea (reg1,reg1), reg1 variant }
(taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
)
) then
begin
DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
{ Make everything homogeneous to make calculations easier }
if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
begin
if taicpu(p).oper[0]^.ref^.index <> NR_NO then
{ Convert lea (regX,regX),reg1 to lea (regX,2),reg1 }
taicpu(p).oper[0]^.ref^.scalefactor := 2
else
taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
taicpu(p).oper[0]^.ref^.base := NR_NO;
end;
if (taicpu(hp1).oper[0]^.ref^.base = NR_NO) then
begin
{ Just to prevent miscalculations }
if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
else
taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor;
end
else
begin
{ "lea (reg1,reg1),reg1" doubles the value, i.e. scale * 2 }
taicpu(hp1).oper[0]^.ref^.base := NR_NO;
taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor * 2;
end;
taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
RemoveCurrentP(p);
result:=true;
exit;
end
{ changes
lea offset1(regX), reg1
lea offset2(reg1), reg1
to
lea offset1+offset2(regX), reg1 }
else if
(
(taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
(taicpu(p).oper[0]^.ref^.index = NR_NO)
) or (
(taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
(
(
(taicpu(p).oper[0]^.ref^.index = NR_NO) or
(taicpu(p).oper[0]^.ref^.base = NR_NO)
) or (
(taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
(
(taicpu(p).oper[0]^.ref^.index = NR_NO) or
(
(taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
(
(taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
(taicpu(hp1).oper[0]^.ref^.base = NR_NO)
)
)
)
)
)
) then
begin
DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
begin
taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{ if the register is used as index and base, we have to increase for base as well
and adapt base }
if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
begin
taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
end;
end
else
begin
inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
end;
if taicpu(p).oper[0]^.ref^.index<>NR_NO then
begin
taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
end;
RemoveCurrentP(p);
result:=true;
exit;
end;
end;
{ Change:
leal/q $x(%reg1),%reg2
...
shll/q $y,%reg2
To:
leal/q $(x*2^y)(%reg1,2^y),%reg2 (if y <= 3)
}
if MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
(taicpu(hp1).oper[0]^.val <= 3) then
begin
Multiple := 1 shl taicpu(hp1).oper[0]^.val;
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
TempReg := taicpu(hp1).oper[1]^.reg; { Store locally to reduce the number of dereferences }
if
{ This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
(this works even if scalefactor is zero) }
((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
{ Ensure offset doesn't go out of bounds }
(abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
MatchOperand(taicpu(p).oper[1]^, TempReg) and
(
(
not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, TempReg) and
(
(taicpu(p).oper[0]^.ref^.index = NR_NO) or
(taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
(
{ Check for lea $x(%reg1,%reg1),%reg2 and treat as if it were lea $x(%reg1,2),%reg2 }
(taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
(taicpu(p).oper[0]^.ref^.scalefactor <= 1)
)
)
) or (
(
(taicpu(p).oper[0]^.ref^.base = NR_NO) or
(taicpu(p).oper[0]^.ref^.base = NR_INVALID)
) and
not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, TempReg)
)
) then
begin
repeat
with taicpu(p).oper[0]^.ref^ do
begin
{ Convert lea $x(%reg1,%reg1),%reg2 to lea $x(%reg1,2),%reg2 }
if index = base then
begin
if Multiple > 4 then
{ Optimisation will no longer work because resultant
scale factor will exceed 8 }
Break;
base := NR_NO;
scalefactor := 2;
DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
end
else if (base <> NR_NO) and (base <> NR_INVALID) then
begin
{ Scale factor only works on the index register }
index := base;
base := NR_NO;
end;
{ For safety }
if scalefactor <= 1 then
begin
DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
scalefactor := Multiple;
end
else
begin
DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
scalefactor := scalefactor * Multiple;
end;
offset := offset * Multiple;
end;
RemoveInstruction(hp1);
Result := True;
Exit;
{ This repeat..until loop exists for the benefit of Break }
until True;
end;
end;
end;
end;
  4436. function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
  4437. var
  4438. hp1 : tai;
  4439. begin
  4440. DoSubAddOpt := False;
  4441. if taicpu(p).oper[0]^.typ <> top_const then
  4442. { Should have been confirmed before calling }
  4443. InternalError(2021102601);
  4444. if GetLastInstruction(p, hp1) and
  4445. (hp1.typ = ait_instruction) and
  4446. (taicpu(hp1).opsize = taicpu(p).opsize) then
  4447. case taicpu(hp1).opcode Of
  4448. A_DEC:
  4449. if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  4450. begin
  4451. taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
  4452. RemoveInstruction(hp1);
  4453. end;
  4454. A_SUB:
  4455. if (taicpu(hp1).oper[0]^.typ = top_const) and
  4456. MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
  4457. begin
  4458. taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
  4459. RemoveInstruction(hp1);
  4460. end;
  4461. A_ADD:
  4462. begin
  4463. if (taicpu(hp1).oper[0]^.typ = top_const) and
  4464. MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
  4465. begin
  4466. taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
  4467. RemoveInstruction(hp1);
  4468. if (taicpu(p).oper[0]^.val = 0) then
  4469. begin
  4470. hp1 := tai(p.next);
  4471. RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
  4472. if not GetLastInstruction(hp1, p) then
  4473. p := hp1;
  4474. DoSubAddOpt := True;
  4475. end
  4476. end;
  4477. end;
  4478. else
  4479. ;
  4480. end;
  4481. end;
  4482. function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  4483. var
  4484. hp1, hp2: tai;
  4485. ActiveReg: TRegister;
  4486. OldOffset: asizeint;
  4487. ThisConst: TCGInt;
  4488. function RegDeallocated: Boolean;
  4489. begin
  4490. TransferUsedRegs(TmpUsedRegs);
  4491. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  4492. Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
  4493. end;
  4494. begin
  4495. Result:=false;
  4496. hp1 := nil;
  4497. { replace
  4498. subX const,%reg1
  4499. leaX (%reg1,%reg1,Y),%reg2 // Base or index might not be equal to reg1
  4500. dealloc %reg1
  4501. by
  4502. leaX -const-const*Y(%reg1,%reg1,Y),%reg2
  4503. }
  4504. if MatchOpType(taicpu(p),top_const,top_reg) then
  4505. begin
  4506. ActiveReg := taicpu(p).oper[1]^.reg;
  4507. { Ensures the entire register was updated }
  4508. if (taicpu(p).opsize >= S_L) and
  4509. GetNextInstructionUsingReg(p,hp1, ActiveReg) and
  4510. MatchInstruction(hp1,A_LEA,[]) and
  4511. (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
  4512. SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
  4513. (
  4514. { Cover the case where the register in the reference is also the destination register }
  4515. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
  4516. (
  4517. { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
  4518. not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
  4519. RegDeallocated
  4520. )
  4521. ) then
  4522. begin
  4523. OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
  4524. if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
  4525. Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
  4526. if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
  4527. Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
  4528. if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
  4529. begin
  4530. { Overflow; abort }
  4531. taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
  4532. end
  4533. else
  4534. begin
  4535. DebugMsg(SPeepholeOptimization + 'SubLea2Lea done',p);
  4536. if not (cs_opt_level3 in current_settings.optimizerswitches) then
  4537. { hp1 is the immediate next instruction for sure - good for a quick speed boost }
  4538. RemoveCurrentP(p, hp1)
  4539. else
  4540. RemoveCurrentP(p);
  4541. result:=true;
  4542. Exit;
  4543. end;
  4544. end;
  4545. if (
  4546. { Save calling GetNextInstructionUsingReg again }
  4547. Assigned(hp1) or
  4548. GetNextInstructionUsingReg(p,hp1, ActiveReg)
  4549. ) and
  4550. MatchInstruction(hp1,A_SUB,[taicpu(p).opsize]) and
  4551. (taicpu(hp1).oper[1]^.reg = ActiveReg) then
  4552. begin
  4553. if taicpu(hp1).oper[0]^.typ = top_const then
  4554. begin
  4555. { Merge add const1,%reg; add const2,%reg to add const1+const2,%reg }
  4556. ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
  4557. Result := True;
  4558. { Handle any overflows }
  4559. case taicpu(p).opsize of
  4560. S_B:
  4561. taicpu(p).oper[0]^.val := ThisConst and $FF;
  4562. S_W:
  4563. taicpu(p).oper[0]^.val := ThisConst and $FFFF;
  4564. S_L:
  4565. taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
  4566. {$ifdef x86_64}
  4567. S_Q:
  4568. if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
  4569. { Overflow; abort }
  4570. Result := False
  4571. else
  4572. taicpu(p).oper[0]^.val := ThisConst;
  4573. {$endif x86_64}
  4574. else
  4575. InternalError(2021102610);
  4576. end;
  4577. { Result may get set to False again if the combined immediate overflows for S_Q sizes }
  4578. if Result then
  4579. begin
  4580. if (taicpu(p).oper[0]^.val < 0) and
  4581. (
  4582. ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
  4583. ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
  4584. ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
  4585. ) then
  4586. begin
  4587. DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> ADD',p);
  4588. taicpu(p).opcode := A_SUB;
  4589. taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
  4590. end
  4591. else
  4592. DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> SUB',p);
  4593. RemoveInstruction(hp1);
  4594. end;
  4595. end
  4596. else
  4597. begin
  4598. { Make doubly sure the flags aren't in use because the order of subtractions may affect them }
  4599. TransferUsedRegs(TmpUsedRegs);
  4600. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  4601. hp2 := p;
  4602. while not (cs_opt_level3 in current_settings.optimizerswitches) and
  4603. GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
  4604. UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
  4605. if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
  4606. begin
  4607. { Move the constant subtraction to after the reg/ref addition to improve optimisation }
  4608. DebugMsg(SPeepholeOptimization + 'Add/sub swap 1b done',p);
  4609. Asml.Remove(p);
  4610. Asml.InsertAfter(p, hp1);
  4611. p := hp1;
  4612. Result := True;
  4613. Exit;
  4614. end;
  4615. end;
  4616. end;
  4617. { * change "subl $2, %esp; pushw x" to "pushl x"}
  4618. { * change "sub/add const1, reg" or "dec reg" followed by
  4619. "sub const2, reg" to one "sub ..., reg" }
  4620. {$ifdef i386}
  4621. if (taicpu(p).oper[0]^.val = 2) and
  4622. (ActiveReg = NR_ESP) and
  4623. { Don't do the sub/push optimization if the sub }
  4624. { comes from setting up the stack frame (JM) }
  4625. (not(GetLastInstruction(p,hp1)) or
  4626. not(MatchInstruction(hp1,A_MOV,[S_L]) and
  4627. MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
  4628. MatchOperand(taicpu(hp1).oper[0]^,NR_EBP))) then
  4629. begin
  4630. hp1 := tai(p.next);
  4631. while Assigned(hp1) and
  4632. (tai(hp1).typ in [ait_instruction]+SkipInstr) and
  4633. not RegReadByInstruction(NR_ESP,hp1) and
  4634. not RegModifiedByInstruction(NR_ESP,hp1) do
  4635. hp1 := tai(hp1.next);
  4636. if Assigned(hp1) and
  4637. MatchInstruction(hp1,A_PUSH,[S_W]) then
  4638. begin
  4639. taicpu(hp1).changeopsize(S_L);
  4640. if taicpu(hp1).oper[0]^.typ=top_reg then
  4641. setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
  4642. hp1 := tai(p.next);
  4643. RemoveCurrentp(p, hp1);
  4644. Result:=true;
  4645. exit;
  4646. end;
  4647. end;
  4648. {$endif i386}
  4649. if DoSubAddOpt(p) then
  4650. Result:=true;
  4651. end;
  4652. end;
  4653. function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
  4654. var
  4655. TmpBool1,TmpBool2 : Boolean;
  4656. tmpref : treference;
  4657. hp1,hp2: tai;
  4658. mask: tcgint;
  4659. begin
  4660. Result:=false;
  4661. { All these optimisations work on "shl/sal const,%reg" }
  4662. if not MatchOpType(taicpu(p),top_const,top_reg) then
  4663. Exit;
  4664. if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
  4665. (taicpu(p).oper[0]^.val <= 3) then
  4666. { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
  4667. begin
  4668. { should we check the next instruction? }
  4669. TmpBool1 := True;
  4670. { have we found an add/sub which could be
  4671. integrated in the lea? }
  4672. TmpBool2 := False;
  4673. reference_reset(tmpref,2,[]);
  4674. TmpRef.index := taicpu(p).oper[1]^.reg;
  4675. TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
  4676. while TmpBool1 and
  4677. GetNextInstruction(p, hp1) and
  4678. (tai(hp1).typ = ait_instruction) and
  4679. ((((taicpu(hp1).opcode = A_ADD) or
  4680. (taicpu(hp1).opcode = A_SUB)) and
  4681. (taicpu(hp1).oper[1]^.typ = Top_Reg) and
  4682. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
  4683. (((taicpu(hp1).opcode = A_INC) or
  4684. (taicpu(hp1).opcode = A_DEC)) and
  4685. (taicpu(hp1).oper[0]^.typ = Top_Reg) and
  4686. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
  4687. ((taicpu(hp1).opcode = A_LEA) and
  4688. (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
  4689. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
  4690. (not GetNextInstruction(hp1,hp2) or
  4691. not instrReadsFlags(hp2)) Do
  4692. begin
  4693. TmpBool1 := False;
  4694. if taicpu(hp1).opcode=A_LEA then
  4695. begin
  4696. if (TmpRef.base = NR_NO) and
  4697. (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
  4698. (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
  4699. (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
  4700. ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
  4701. (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
  4702. begin
  4703. TmpBool1 := True;
  4704. TmpBool2 := True;
  4705. inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
  4706. if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
  4707. tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
  4708. TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
  4709. RemoveInstruction(hp1);
  4710. end
  4711. end
  4712. else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
  4713. begin
  4714. TmpBool1 := True;
  4715. TmpBool2 := True;
  4716. case taicpu(hp1).opcode of
  4717. A_ADD:
  4718. inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
  4719. A_SUB:
  4720. dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
  4721. else
  4722. internalerror(2019050536);
  4723. end;
  4724. RemoveInstruction(hp1);
  4725. end
  4726. else
  4727. if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
  4728. (((taicpu(hp1).opcode = A_ADD) and
  4729. (TmpRef.base = NR_NO)) or
  4730. (taicpu(hp1).opcode = A_INC) or
  4731. (taicpu(hp1).opcode = A_DEC)) then
  4732. begin
  4733. TmpBool1 := True;
  4734. TmpBool2 := True;
  4735. case taicpu(hp1).opcode of
  4736. A_ADD:
  4737. TmpRef.base := taicpu(hp1).oper[0]^.reg;
  4738. A_INC:
  4739. inc(TmpRef.offset);
  4740. A_DEC:
  4741. dec(TmpRef.offset);
  4742. else
  4743. internalerror(2019050535);
  4744. end;
  4745. RemoveInstruction(hp1);
  4746. end;
  4747. end;
  4748. if TmpBool2
  4749. {$ifndef x86_64}
  4750. or
  4751. ((current_settings.optimizecputype < cpu_Pentium2) and
  4752. (taicpu(p).oper[0]^.val <= 3) and
  4753. not(cs_opt_size in current_settings.optimizerswitches))
  4754. {$endif x86_64}
  4755. then
  4756. begin
  4757. if not(TmpBool2) and
  4758. (taicpu(p).oper[0]^.val=1) then
  4759. begin
  4760. hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
  4761. taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
  4762. end
  4763. else
  4764. hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
  4765. taicpu(p).oper[1]^.reg);
  4766. DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
  4767. InsertLLItem(p.previous, p.next, hp1);
  4768. p.free;
  4769. p := hp1;
  4770. end;
  4771. end
  4772. {$ifndef x86_64}
  4773. else if (current_settings.optimizecputype < cpu_Pentium2) then
  4774. begin
  4775. { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
  4776. but faster on a 486, and Tairable in both U and V pipes on the Pentium
  4777. (unlike shl, which is only Tairable in the U pipe) }
  4778. if taicpu(p).oper[0]^.val=1 then
  4779. begin
  4780. hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
  4781. taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
  4782. InsertLLItem(p.previous, p.next, hp1);
  4783. p.free;
  4784. p := hp1;
  4785. end
  4786. { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
  4787. "shl $3, %reg" to "lea (,%reg,8), %reg }
  4788. else if (taicpu(p).opsize = S_L) and
  4789. (taicpu(p).oper[0]^.val<= 3) then
  4790. begin
  4791. reference_reset(tmpref,2,[]);
  4792. TmpRef.index := taicpu(p).oper[1]^.reg;
  4793. TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
  4794. hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
  4795. InsertLLItem(p.previous, p.next, hp1);
  4796. p.free;
  4797. p := hp1;
  4798. end;
  4799. end
  4800. {$endif x86_64}
  4801. else if
  4802. GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
  4803. (
  4804. (
  4805. MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
  4806. SetAndTest(hp1, hp2)
  4807. {$ifdef x86_64}
  4808. ) or
  4809. (
  4810. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  4811. GetNextInstruction(hp1, hp2) and
  4812. MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
  4813. MatchOpType(taicpu(hp2), top_reg, top_reg) and
  4814. (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
  4815. {$endif x86_64}
  4816. )
  4817. ) and
  4818. (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
  4819. begin
  4820. { Change:
  4821. shl x, %reg1
  4822. mov -(1<<x), %reg2
  4823. and %reg2, %reg1
  4824. Or:
  4825. shl x, %reg1
  4826. and -(1<<x), %reg1
  4827. To just:
  4828. shl x, %reg1
  4829. Since the and operation only zeroes bits that are already zero from the shl operation
  4830. }
  4831. case taicpu(p).oper[0]^.val of
  4832. 8:
  4833. mask:=$FFFFFFFFFFFFFF00;
  4834. 16:
  4835. mask:=$FFFFFFFFFFFF0000;
  4836. 32:
  4837. mask:=$FFFFFFFF00000000;
  4838. 63:
  4839. { Constant pre-calculated to prevent overflow errors with Int64 }
  4840. mask:=$8000000000000000;
  4841. else
  4842. begin
  4843. if taicpu(p).oper[0]^.val >= 64 then
  4844. { Shouldn't happen realistically, since the register
  4845. is guaranteed to be set to zero at this point }
  4846. mask := 0
  4847. else
  4848. mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
  4849. end;
  4850. end;
  4851. if taicpu(hp1).oper[0]^.val = mask then
  4852. begin
  4853. { Everything checks out, perform the optimisation, as long as
  4854. the FLAGS register isn't being used}
  4855. TransferUsedRegs(TmpUsedRegs);
  4856. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  4857. {$ifdef x86_64}
  4858. if (hp1 <> hp2) then
  4859. begin
  4860. { "shl/mov/and" version }
  4861. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  4862. { Don't do the optimisation if the FLAGS register is in use }
  4863. if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
  4864. begin
  4865. DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
  4866. { Don't remove the 'mov' instruction if its register is used elsewhere }
  4867. if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
  4868. begin
  4869. RemoveInstruction(hp1);
  4870. Result := True;
  4871. end;
  4872. { Only set Result to True if the 'mov' instruction was removed }
  4873. RemoveInstruction(hp2);
  4874. end;
  4875. end
  4876. else
  4877. {$endif x86_64}
  4878. begin
  4879. { "shl/and" version }
  4880. { Don't do the optimisation if the FLAGS register is in use }
  4881. if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  4882. begin
  4883. DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
  4884. RemoveInstruction(hp1);
  4885. Result := True;
  4886. end;
  4887. end;
  4888. Exit;
  4889. end
  4890. else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
  4891. begin
  4892. { Even if the mask doesn't allow for its removal, we might be
  4893. able to optimise the mask for the "shl/and" version, which
  4894. may permit other peephole optimisations }
  4895. {$ifdef DEBUG_AOPTCPU}
  4896. mask := taicpu(hp1).oper[0]^.val and mask;
  4897. if taicpu(hp1).oper[0]^.val <> mask then
  4898. begin
  4899. DebugMsg(
  4900. SPeepholeOptimization +
  4901. 'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
  4902. ' to $' + debug_tostr(mask) +
  4903. 'based on previous instruction (ShlAnd2ShlAnd)', hp1);
  4904. taicpu(hp1).oper[0]^.val := mask;
  4905. end;
  4906. {$else DEBUG_AOPTCPU}
  4907. { If debugging is off, just set the operand even if it's the same }
  4908. taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
  4909. {$endif DEBUG_AOPTCPU}
  4910. end;
  4911. end;
  4912. {
  4913. change
  4914. shl/sal const,reg
  4915. <op> ...(...,reg,1),...
  4916. into
  4917. <op> ...(...,reg,1 shl const),...
  4918. if const in 1..3
  4919. }
  4920. if MatchOpType(taicpu(p), top_const, top_reg) and
  4921. (taicpu(p).oper[0]^.val in [1..3]) and
  4922. GetNextInstruction(p, hp1) and
  4923. MatchInstruction(hp1,A_MOV,A_LEA,[]) and
  4924. MatchOpType(taicpu(hp1), top_ref, top_reg) and
  4925. (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[0]^.ref^.index) and
  4926. (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^.ref^.base) and
  4927. (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) then
  4928. begin
  4929. TransferUsedRegs(TmpUsedRegs);
  4930. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  4931. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
  4932. begin
  4933. taicpu(hp1).oper[0]^.ref^.scalefactor:=1 shl taicpu(p).oper[0]^.val;
  4934. DebugMsg(SPeepholeOptimization + 'ShlOp2Op', p);
  4935. RemoveCurrentP(p);
  4936. Result:=true;
  4937. end;
  4938. end;
  4939. end;
{ Attempts to merge a run of byte-sized memory writes into a single 32-bit
  write.  first_mov is "mov ?,x(mem)" and second_mov is expected to be
  "movb $0,x+1(mem)"; if the following one or two instructions also store
  zero to the next consecutive byte(s) (either two more byte stores, or one
  word store), the whole sequence is collapsed into a single "movl" (with a
  MOVZX inserted first when the original source was a register).
  Returns True if the instruction list was rewritten. }
function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
  var
    { Running reference; its offset is advanced one byte at a time to check
      that each subsequent store targets the next consecutive address }
    CurrentRef: TReference;
    { 32-bit (R_SUBD) super-register of first_mov's source register }
    FullReg: TRegister;
    hp1, hp2: tai;
  begin
    Result := False;
    { Only byte-sized stores are candidates for this merge }
    if (first_mov.opsize <> S_B) or (second_mov.opsize <> S_B) then
      Exit;
    { We assume you've checked if the operand is actually a reference by
      this point. If it isn't, you'll most likely get an access violation }
    CurrentRef := first_mov.oper[1]^.ref^;
    { Memory must be aligned }
    if (CurrentRef.offset mod 4) <> 0 then
      Exit;
    Inc(CurrentRef.offset);
    CurrentRef.alignment := 1; { Otherwise references_equal will return False }
    { second_mov must store zero to the byte immediately after first_mov's
      target, and must be followed by another zero-storing MOV to memory }
    if MatchOperand(second_mov.oper[0]^, 0) and
      references_equal(second_mov.oper[1]^.ref^, CurrentRef) and
      GetNextInstruction(second_mov, hp1) and
      (hp1.typ = ait_instruction) and
      (taicpu(hp1).opcode = A_MOV) and
      MatchOpType(taicpu(hp1), top_const, top_ref) and
      (taicpu(hp1).oper[0]^.val = 0) then
      begin
        Inc(CurrentRef.offset);
        CurrentRef.alignment := taicpu(hp1).oper[1]^.ref^.alignment; { Otherwise references_equal might return False }
        FullReg := newreg(R_INTREGISTER,getsupreg(first_mov.oper[0]^.reg), R_SUBD);
        if references_equal(taicpu(hp1).oper[1]^.ref^, CurrentRef) then
          begin
            case taicpu(hp1).opsize of
              S_B:
                { Four byte stores: need one more "movb $0" at offset+3 }
                if GetNextInstruction(hp1, hp2) and
                  MatchInstruction(taicpu(hp2), A_MOV, [S_B]) and
                  MatchOpType(taicpu(hp2), top_const, top_ref) and
                  (taicpu(hp2).oper[0]^.val = 0) then
                  begin
                    Inc(CurrentRef.offset);
                    CurrentRef.alignment := 1; { Otherwise references_equal will return False }
                    if references_equal(taicpu(hp2).oper[1]^.ref^, CurrentRef) and
                      (taicpu(hp2).opsize = S_B) then
                      begin
                        RemoveInstruction(hp1);
                        RemoveInstruction(hp2);
                        first_mov.opsize := S_L;
                        if first_mov.oper[0]^.typ = top_reg then
                          begin
                            DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVZX/MOVl', first_mov);
                            { Reuse second_mov as a MOVZX instruction }
                            second_mov.opcode := A_MOVZX;
                            second_mov.opsize := S_BL;
                            second_mov.loadreg(0, first_mov.oper[0]^.reg);
                            second_mov.loadreg(1, FullReg);
                            first_mov.oper[0]^.reg := FullReg;
                            { Move the MOVZX so it precedes the (now 32-bit) store }
                            asml.Remove(second_mov);
                            asml.InsertBefore(second_mov, first_mov);
                          end
                        else
                          { It's a value }
                          begin
                            DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVl', first_mov);
                            RemoveInstruction(second_mov);
                          end;
                        Result := True;
                        Exit;
                      end;
                  end;
              S_W:
                { Two byte stores followed by a zero word store covering
                  offsets +2 and +3 }
                begin
                  RemoveInstruction(hp1);
                  first_mov.opsize := S_L;
                  if first_mov.oper[0]^.typ = top_reg then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVZX/MOVl', first_mov);
                      { Reuse second_mov as a MOVZX instruction }
                      second_mov.opcode := A_MOVZX;
                      second_mov.opsize := S_BL;
                      second_mov.loadreg(0, first_mov.oper[0]^.reg);
                      second_mov.loadreg(1, FullReg);
                      first_mov.oper[0]^.reg := FullReg;
                      asml.Remove(second_mov);
                      asml.InsertBefore(second_mov, first_mov);
                    end
                  else
                    { It's a value }
                    begin
                      DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVl', first_mov);
                      RemoveInstruction(second_mov);
                    end;
                  Result := True;
                  Exit;
                end;
              else
                ;
            end;
          end;
      end;
  end;
{ Pass-1 optimization for FSTP/FISTP: folds a store/reload pair
  (fstp f; fld f  or  fistp f; fild f) on the same memory location.
  Either both instructions are removed outright (when the value is a
  function result about to be returned), or the pair is replaced by a
  non-popping fst/fist when rounding cannot change the value. }
function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
{ returns true if a "continue" should be done after this optimization }
var
  hp1, hp2: tai;
begin
  Result := false;
  { Match "fstp mem; fld mem" or "fistp mem; fild mem" with identical
    size and reference }
  if MatchOpType(taicpu(p),top_ref) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = ait_instruction) and
    (((taicpu(hp1).opcode = A_FLD) and
      (taicpu(p).opcode = A_FSTP)) or
     ((taicpu(p).opcode = A_FISTP) and
      (taicpu(hp1).opcode = A_FILD))) and
    MatchOpType(taicpu(hp1),top_ref) and
    (taicpu(hp1).opsize = taicpu(p).opsize) and
    RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
    begin
      { replacing fstp f;fld f by fst f is only valid for extended because of rounding or if fastmath is on }
      if ((taicpu(p).opsize=S_FX) or (cs_opt_fastmath in current_settings.optimizerswitches)) and
        GetNextInstruction(hp1, hp2) and
        (hp2.typ = ait_instruction) and
        IsExitCode(hp2) and
        { The store must go to a frame-pointer-based local... }
        (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
        { ...that is not (part of) the function result variable }
        not(assigned(current_procinfo.procdef.funcretsym) and
            (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
        (taicpu(p).oper[0]^.ref^.index = NR_NO) then
        begin
          { Value is dead at procedure exit: drop both store and reload }
          RemoveInstruction(hp1);
          RemoveCurrentP(p, hp2);
          RemoveLastDeallocForFuncRes(p);
          Result := true;
        end
      else
        { we can do this only in fast math mode as fstp is rounding ...
          ... still disabled as it breaks the compiler and/or rtl }
        if ({ (cs_opt_fastmath in current_settings.optimizerswitches) or }
          { ... or if another fstp equal to the first one follows }
          (GetNextInstruction(hp1,hp2) and
           (hp2.typ = ait_instruction) and
           (taicpu(p).opcode=taicpu(hp2).opcode) and
           (taicpu(p).opsize=taicpu(hp2).opsize))
          ) and
          { fst can't store an extended/comp value }
          (taicpu(p).opsize <> S_FX) and
          (taicpu(p).opsize <> S_IQ) then
          begin
            { Convert the popping store to a non-popping one and drop
              the reload }
            if (taicpu(p).opcode = A_FSTP) then
              taicpu(p).opcode := A_FST
            else
              taicpu(p).opcode := A_FIST;
            DebugMsg(SPeepholeOptimization + 'FstpFld2Fst',p);
            RemoveInstruction(hp1);
          end;
    end;
end;
{ Pass-1 optimization for FLD: folds an x87 load followed by a popping
  two-operand arithmetic instruction (faddp/fmulp/fsubp/...) into a single
  non-popping instruction, avoiding a push/pop of the FPU stack.
  Note: for the non-commutative operations the reversed opcode must be
  used (fsubp -> fsubr etc.), because the operand order flips. }
function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
var
  hp1, hp2: tai;
begin
  result:=false;
  if MatchOpType(taicpu(p),top_reg) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = Ait_Instruction) and
    MatchOpType(taicpu(hp1),top_reg,top_reg) and
    (taicpu(hp1).oper[0]^.reg = NR_ST) and
    (taicpu(hp1).oper[1]^.reg = NR_ST1) then
    { change to
        fld      reg               fxxx reg,st
        fxxxp    st, st1 (hp1)
      Remark: non commutative operations must be reversed!
    }
    begin
      case taicpu(hp1).opcode Of
        A_FMULP,A_FADDP,
        A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
          begin
            { Replace the popping opcode by its non-popping counterpart;
              sub/div swap direction because the operands are exchanged }
            case taicpu(hp1).opcode Of
              A_FADDP: taicpu(hp1).opcode := A_FADD;
              A_FMULP: taicpu(hp1).opcode := A_FMUL;
              A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
              A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
              A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
              A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
              else
                internalerror(2019050534);
            end;
            taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
            taicpu(hp1).oper[1]^.reg := NR_ST;
            RemoveCurrentP(p, hp1);
            Result:=true;
            exit;
          end;
        else
          ;
      end;
    end
  else
    { fld from memory, followed by "fxxxp st,st1" }
    if MatchOpType(taicpu(p),top_ref) and
      GetNextInstruction(p, hp2) and
      (hp2.typ = Ait_Instruction) and
      MatchOpType(taicpu(hp2),top_reg,top_reg) and
      (taicpu(p).opsize in [S_FS, S_FL]) and
      (taicpu(hp2).oper[0]^.reg = NR_ST) and
      (taicpu(hp2).oper[1]^.reg = NR_ST1) then
      { If the preceding instruction loaded/stored the same memory
        location, st0 already holds the value being loaded }
      if GetLastInstruction(p, hp1) and
        MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp1),top_ref) and
        RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
        if ((taicpu(hp2).opcode = A_FMULP) or
          (taicpu(hp2).opcode = A_FADDP)) then
          { change to
              fld/fst   mem1  (hp1)      fld/fst   mem1
              fld       mem1  (p)        fadd/
              faddp/                     fmul     st, st
              fmulp  st, st1 (hp2) }
          begin
            RemoveCurrentP(p, hp1);
            if (taicpu(hp2).opcode = A_FADDP) then
              taicpu(hp2).opcode := A_FADD
            else
              taicpu(hp2).opcode := A_FMUL;
            taicpu(hp2).oper[1]^.reg := NR_ST;
          end
        else
          { change to
              fld/fst  mem1 (hp1)   fld/fst  mem1
              fld      mem1 (p)     fld      st}
          begin
            taicpu(p).changeopsize(S_FL);
            taicpu(p).loadreg(0,NR_ST);
          end
      else
        begin
          case taicpu(hp2).opcode Of
            A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
              { change to
                  fld/fst mem1 (hp1)      fld/fst mem1
                  fld     mem2 (p)        fxxx    mem2
                  fxxxp   st, st1 (hp2) }
              begin
                { Fold the load into the arithmetic instruction; again
                  sub/div must use the reversed opcode }
                case taicpu(hp2).opcode Of
                  A_FADDP: taicpu(p).opcode := A_FADD;
                  A_FMULP: taicpu(p).opcode := A_FMUL;
                  A_FSUBP: taicpu(p).opcode := A_FSUBR;
                  A_FSUBRP: taicpu(p).opcode := A_FSUB;
                  A_FDIVP: taicpu(p).opcode := A_FDIVR;
                  A_FDIVRP: taicpu(p).opcode := A_FDIV;
                  else
                    internalerror(2019050533);
                end;
                RemoveInstruction(hp2);
              end
            else
              ;
          end
        end
end;
  5195. function IsCmpSubset(cond1, cond2: TAsmCond): Boolean; inline;
  5196. begin
  5197. Result := condition_in(cond1, cond2) or
  5198. { Not strictly subsets due to the actual flags checked, but because we're
  5199. comparing integers, E is a subset of AE and GE and their aliases }
  5200. ((cond1 in [C_E, C_Z]) and (cond2 in [C_AE, C_NB, C_NC, C_GE, C_NL]));
  5201. end;
{ Pass-1 optimization for CMP.  Performs several transformations:
  - redirects a conditional jump through a duplicated CMP/Jcc at its target
    label when the first condition is a subset of the second;
  - removes a second CMP (or equivalent TEST) that repeats the first;
  - converts "cmp $0,%reg" into "test %reg,%reg", simplifying or removing
    the unsigned conditional jumps/setcc that follow;
  - converts "cmp $1,r/m; jl" into "cmp $0,r/m; jle";
  - converts "cmp $8000...,%reg; je/jne" into "neg %reg; jo/jno" when the
    register is deallocated afterwards;
  - finally tries to swap a preceding MOV with the CMP (TrySwapMovCmp). }
function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
var
  v: TCGInt;
  hp1, hp2, p_dist, p_jump, hp1_dist, p_label, hp1_label: tai;
  FirstMatch: Boolean;
  JumpLabel, JumpLabel_dist, JumpLabel_far: TAsmLabel;
begin
  Result:=false;
  { All these optimisations need a next instruction }
  if not GetNextInstruction(p, hp1) then
    Exit;
  { Search for:
      cmp   ###,###
      j(c1) @lbl1
      ...
    @lbl:
      cmp   ###.### (same comparison as above)
      j(c2) @lbl2
    If c1 is a subset of c2, change to:
      cmp   ###,###
      j(c2) @lbl2
    (@lbl1 may become a dead label as a result)
  }
  { Also handle cases where there are multiple jumps in a row }
  p_jump := hp1;
  while Assigned(p_jump) and MatchInstruction(p_jump, A_JCC, []) do
    begin
      if IsJumpToLabel(taicpu(p_jump)) then
        begin
          JumpLabel := TAsmLabel(taicpu(p_jump).oper[0]^.ref^.symbol);
          p_label := nil;
          if Assigned(JumpLabel) then
            p_label := getlabelwithsym(JumpLabel);
          { Is the jump target immediately followed by the same comparison
            and another conditional jump? }
          if Assigned(p_label) and
            GetNextInstruction(p_label, p_dist) and
            MatchInstruction(p_dist, A_CMP, []) and
            MatchOperand(taicpu(p_dist).oper[0]^, taicpu(p).oper[0]^) and
            MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
            GetNextInstruction(p_dist, hp1_dist) and
            MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
            begin
              JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
              if JumpLabel = JumpLabel_dist then
                { This is an infinite loop }
                Exit;
              { Best optimisation when the first condition is a subset (or equal) of the second }
              if IsCmpSubset(taicpu(p_jump).condition, taicpu(hp1_dist).condition) then
                begin
                  { Any registers used here will already be allocated }
                  if Assigned(JumpLabel_dist) then
                    JumpLabel_dist.IncRefs;
                  if Assigned(JumpLabel) then
                    JumpLabel.DecRefs;
                  DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc -> CMP/Jcc, redirecting first jump', p_jump);
                  taicpu(p_jump).condition := taicpu(hp1_dist).condition;
                  taicpu(p_jump).loadref(0, taicpu(hp1_dist).oper[0]^.ref^);
                  Result := True;
                  { Don't exit yet.  Since p and p_jump haven't actually been
                    removed, we can check for more on this iteration }
                end
              else if IsCmpSubset(taicpu(hp1_dist).condition, inverse_cond(taicpu(p_jump).condition)) and
                GetNextInstruction(hp1_dist, hp1_label) and
                SkipAligns(hp1_label, hp1_label) and
                (hp1_label.typ = ait_label) then
                begin
                  JumpLabel_far := tai_label(hp1_label).labsym;
                  if (JumpLabel_far = JumpLabel_dist) or (JumpLabel_far = JumpLabel) then
                    { This is an infinite loop }
                    Exit;
                  if Assigned(JumpLabel_far) then
                    begin
                      { In this situation, if the first jump branches, the second one will
                        never branch, so change the destination label to after the second jump }
                      DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc/@Lbl -> CMP/Jcc, redirecting first jump to 2nd label', p_jump);
                      if Assigned(JumpLabel) then
                        JumpLabel.DecRefs;
                      JumpLabel_far.IncRefs;
                      taicpu(p_jump).oper[0]^.ref^.symbol := JumpLabel_far;
                      Result := True;
                      { Don't exit yet.  Since p and p_jump haven't actually been
                        removed, we can check for more on this iteration }
                      Continue;
                    end;
                end;
            end;
        end;
      { Search for:
          cmp   ###,###
          j(c1) @lbl1
          cmp   ###,### (same as first)
        Remove second cmp
      }
      if GetNextInstruction(p_jump, hp2) and
        (
          (
            MatchInstruction(hp2, A_CMP, []) and
            (
              (
                { Same constant, compatible (super)registers }
                MatchOpType(taicpu(p), top_const, top_reg) and
                (taicpu(hp2).oper[0]^.val = taicpu(p).oper[0]^.val) and
                SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp2).oper[1]^.reg)
              ) or (
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^)
              )
            )
          ) or (
            { Also match cmp $0,%reg; jcc @lbl; test %reg,%reg }
            MatchOperand(taicpu(p).oper[0]^, 0) and
            (taicpu(p).oper[1]^.typ = top_reg) and
            MatchInstruction(hp2, A_TEST, []) and
            MatchOpType(taicpu(hp2), top_reg, top_reg) and
            (taicpu(hp2).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) and
            SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp2).oper[1]^.reg)
          )
        ) then
        begin
          DebugMsg(SPeepholeOptimization + 'CMP/Jcc/CMP; removed superfluous CMP', hp2);
          RemoveInstruction(hp2);
          Result := True;
          { Continue the while loop in case "Jcc/CMP" follows the second CMP that was just removed }
        end;
      GetNextInstruction(p_jump, p_jump);
    end;
  if taicpu(p).oper[0]^.typ = top_const then
    begin
      if (taicpu(p).oper[0]^.val = 0) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
        begin
          hp2 := p;
          FirstMatch := True;
          { When dealing with "cmp $0,%reg", only ZF and SF contain
            anything meaningful once it's converted to "test %reg,%reg";
            additionally, some jumps will always (or never) branch, so
            evaluate every jump immediately following the
            comparison, optimising the conditions if possible.
            Similarly with SETcc... those that are always set to 0 or 1
            are changed to MOV instructions }
          while FirstMatch or { Saves calling GetNextInstruction unnecessarily }
            (
              GetNextInstruction(hp2, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[])
            ) do
            begin
              FirstMatch := False;
              case taicpu(hp1).condition of
                C_B, C_C, C_NAE, C_O:
                  { For B/NAE:
                      Will never branch since an unsigned integer can never be below zero
                    For C/O:
                      Result cannot overflow because 0 is being subtracted
                  }
                  begin
                    if taicpu(hp1).opcode = A_Jcc then
                      begin
                        DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                        TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                        RemoveInstruction(hp1);
                        { Since hp1 was deleted, hp2 must not be updated }
                        Continue;
                      end
                    else
                      begin
                        DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                        { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                        taicpu(hp1).opcode := A_MOV;
                        taicpu(hp1).ops := 2;
                        taicpu(hp1).condition := C_None;
                        taicpu(hp1).opsize := S_B;
                        taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                        taicpu(hp1).loadconst(0, 0);
                      end;
                  end;
                C_BE, C_NA:
                  begin
                    { Will only branch if equal to zero }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                    taicpu(hp1).condition := C_E;
                  end;
                C_A, C_NBE:
                  begin
                    { Will only branch if not equal to zero }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                    taicpu(hp1).condition := C_NE;
                  end;
                C_AE, C_NB, C_NC, C_NO:
                  begin
                    { Will always branch }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                    if taicpu(hp1).opcode = A_Jcc then
                      begin
                        MakeUnconditional(taicpu(hp1));
                        { Any jumps/set that follow will now be dead code }
                        RemoveDeadCodeAfterJump(taicpu(hp1));
                        Break;
                      end
                    else
                      begin
                        { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                        taicpu(hp1).opcode := A_MOV;
                        taicpu(hp1).ops := 2;
                        taicpu(hp1).condition := C_None;
                        taicpu(hp1).opsize := S_B;
                        taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                        taicpu(hp1).loadconst(0, 1);
                      end;
                  end;
                C_None:
                  InternalError(2020012201);
                C_P, C_PE, C_NP, C_PO:
                  { We can't handle parity checks and they should never be generated
                    after a general-purpose CMP (it's used in some floating-point
                    comparisons that don't use CMP) }
                  InternalError(2020012202);
                else
                  { Zero/Equality, Sign, their complements and all of the
                    signed comparisons do not need to be converted };
              end;
              hp2 := hp1;
            end;
          { Convert the instruction to a TEST }
          taicpu(p).opcode := A_TEST;
          taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
          Result := True;
          Exit;
        end
      else if (taicpu(p).oper[0]^.val = 1) and
        MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
        (taicpu(hp1).condition in [C_L, C_NGE]) then
        begin
          { Convert;       To:
              cmp $1,r/m     cmp $0,r/m
              jl  @lbl       jle @lbl
          }
          DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
          taicpu(p).oper[0]^.val := 0;
          taicpu(hp1).condition := C_LE;
          { If the instruction is now "cmp $0,%reg", convert it to a
            TEST (and effectively do the work of the "cmp $0,%reg" in
            the block above)
            If it's a reference, we can get away with not setting
            Result to True because we haven't evaluated the jump
            in this pass yet.
          }
          if (taicpu(p).oper[1]^.typ = top_reg) then
            begin
              taicpu(p).opcode := A_TEST;
              taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
              Result := True;
            end;
          Exit;
        end
      else if (taicpu(p).oper[1]^.typ = top_reg)
{$ifdef x86_64}
        and (taicpu(p).opsize <> S_Q) { S_Q will never happen: cmp with 64 bit constants is not possible }
{$endif x86_64}
        then
        begin
          { cmp  register,$8000                neg    register
            je   target                 -->    jo     target
            .... only if register is deallocated before jump.}
          case Taicpu(p).opsize of
            S_B: v:=$80;
            S_W: v:=$8000;
            S_L: v:=qword($80000000);
            else
              internalerror(2013112905);
          end;
          if (taicpu(p).oper[0]^.val=v) and
            MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
            (Taicpu(hp1).condition in [C_E,C_NE]) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs,tai(p.next));
              if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                  Taicpu(p).opcode:=A_NEG;
                  Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                  Taicpu(p).clearop(1);
                  Taicpu(p).ops:=1;
                  { NEG of the minimum signed value overflows exactly when
                    CMP against it would set ZF }
                  if Taicpu(hp1).condition=C_E then
                    Taicpu(hp1).condition:=C_O
                  else
                    Taicpu(hp1).condition:=C_NO;
                  Result:=true;
                  exit;
                end;
            end;
        end;
    end;
  if TrySwapMovCmp(p, hp1) then
    begin
      Result := True;
      Exit;
    end;
end;
{ Pass-1 optimization for PXOR: removes a redundant self-xor (register
  zeroing) that repeats an earlier one, or retargets the zeroing through a
  following register-to-register MOVAPS/MOVAPD when the source register is
  deallocated afterwards. }
function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
var
  hp1: tai;
begin
  {
    remove the second (v)pxor from
      pxor reg,reg
      ...
      pxor reg,reg
  }
  Result:=false;
  if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
    MatchOpType(taicpu(p),top_reg,top_reg) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
    MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
    MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
    begin
      DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
      RemoveInstruction(hp1);
      Result:=true;
      Exit;
    end
  {
    replace
      pxor reg1,reg1
      movapd/s reg1,reg2
      dealloc reg1
    by
      pxor reg2,reg2
  }
  else if GetNextInstruction(p,hp1) and
    { we mix single and double operations here because we assume that the compiler
      generates vmovapd only after double operations and vmovaps only after single operations }
    MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
    MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
    (taicpu(p).oper[0]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
        begin
          { Zero the MOV's destination directly instead }
          taicpu(p).loadoper(0,taicpu(hp1).oper[1]^);
          taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'PXorMovapd2PXor done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
{ Pass-1 optimization for VPXOR (3-operand form): removes a redundant
  self-xor that repeats an earlier one; otherwise falls through to the
  generic 3-operand vector optimization (OptPass1VOP). }
function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
var
  hp1: tai;
begin
  {
    remove the second (v)pxor from
      (v)pxor reg,reg
      ...
      (v)pxor reg,reg
  }
  Result:=false;
  { A three-way operand match means "vpxor reg,reg,reg", i.e. zeroing }
  if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
    MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
    MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
    MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
    begin
      DebugMsg(SPeepholeOptimization + 'VPXorVPXor2PXor done',hp1);
      RemoveInstruction(hp1);
      Result:=true;
      Exit;
    end
  else
    Result:=OptPass1VOP(p);
end;
{ Pass-1 optimization for 3-operand IMUL: folds a following MOV of the
  product into the IMUL's destination operand when the intermediate
  register dies after the MOV. }
function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
      IMul   const,%mreg1,%mreg2
      Mov    %mreg2,%mreg3
      dealloc %mreg2
    by
      Imul   const,%mreg1,%mreg3
  }
  if (taicpu(p).ops=3) and
    GetNextInstruction(p,hp1) and
    MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { Only safe when the intermediate register is not read afterwards }
      if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'ImulMov2Imul done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
{ Pass-1 optimization for the 3-operand shift instructions handled by this
  pass ("SHXX", e.g. the BMI2 SHLX/SHRX/SARX forms): folds a following MOV
  of the result into the shift's destination operand when the intermediate
  register dies after the MOV.  Same pattern as OptPass1Imul. }
function TX86AsmOptimizer.OptPass1SHXX(var p: tai): boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
      SHXX   %reg0,%reg1,%reg2
      Mov    %reg2,%reg3
      dealloc %reg2
    by
      SHXX   %reg0,%reg1,%reg3
  }
  if GetNextInstruction(p,hp1) and
    MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { Only safe when the intermediate register is not read afterwards }
      if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'SHXXMov2SHXX done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
function TX86AsmOptimizer.OptPass1_V_Cvtss2sd(var p: tai): boolean;
  var
    hp1: tai;
  begin
    Result:=false;
    { Get rid of a pointless round trip through double precision:

        (v)cvtss2sd reg0,<reg1,>reg2
        (v)cvtsd2ss reg2,<reg2,>reg0

      For the VEX (three-operand) form, all operands must be registers and
      each instruction's first source must share a super-register with its
      second source, while p's destination must feed hp1's source. }
    if GetNextInstruction(p,hp1) and
      (((taicpu(p).opcode=A_CVTSS2SD) and MatchInstruction(hp1,A_CVTSD2SS,[taicpu(p).opsize]) and
        MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)) or
       ((taicpu(p).opcode=A_VCVTSS2SD) and MatchInstruction(hp1,A_VCVTSD2SS,[taicpu(p).opsize]) and
        MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
        MatchOpType(taicpu(hp1),top_reg,top_reg,top_reg) and
        (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
        (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
        (getsupreg(taicpu(p).oper[2]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg))
       )
      ) then
      begin
        if getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg) then
          begin
            { The value ends up back in its original register: both
              instructions can simply be deleted }
            DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Nop done',p);
            RemoveCurrentP(p);
            RemoveInstruction(hp1);
          end
        else
          begin
            { Different final register: replace the pair with a plain
              register-to-register copy (p is rewritten in place) }
            DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Vmovaps done',p);
            taicpu(p).loadreg(1,taicpu(hp1).oper[2]^.reg);
            taicpu(p).ops:=2;
            taicpu(p).opcode:=A_VMOVAPS;
            RemoveInstruction(hp1);
          end;
        Result:=true;
        Exit;
      end;
  end;
function TX86AsmOptimizer.OptPass1Jcc(var p : tai) : boolean;
  var
    hp1, hp2, hp3, hp4, hp5: tai;
    ThisReg: TRegister;
  begin
    Result := False;
    if not GetNextInstruction(p,hp1) or (hp1.typ <> ait_instruction) then
      Exit;
    {
      convert
        j<c>  .L1
        mov   1,reg
        jmp   .L2
      .L1
        mov   0,reg
      .L2
      into
        mov   0,reg
        set<not(c)> reg

      take care of alignment and that the mov 0,reg is not converted into a xor as this
      would destroy the flag contents

      Use MOVZX if size is preferred, since while mov 0,reg is bigger, it can be
      executed at the same time as a previous comparison.
        set<not(c)> reg
        movzx reg, reg
    }
    if MatchInstruction(hp1,A_MOV,[]) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      (
        (
          (taicpu(hp1).oper[1]^.typ = top_reg)
          {$ifdef i386}
          { Under i386, ESI, EDI, EBP and ESP
            don't have an 8-bit representation }
          and not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
          {$endif i386}
        ) or (
          {$ifdef i386}
          (taicpu(hp1).oper[1]^.typ <> top_reg) and
          {$endif i386}
          (taicpu(hp1).opsize = S_B)
        )
      ) and
      { The unconditional jump over the second MOV }
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
      { The middle label must be the target of the initial conditional jump }
      GetNextInstruction(hp2,hp3) and
      SkipAligns(hp3, hp3) and
      (hp3.typ=ait_label) and
      (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
      GetNextInstruction(hp3,hp4) and
      MatchInstruction(hp4,A_MOV,[taicpu(hp1).opsize]) and
      (taicpu(hp4).oper[0]^.typ = top_const) and
      { The two MOVs must assign the complementary 0/1 pair }
      (
        ((taicpu(hp1).oper[0]^.val = 0) and (taicpu(hp4).oper[0]^.val = 1)) or
        ((taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0))
      ) and
      MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
      { The final label must be the target of the unconditional jump }
      GetNextInstruction(hp4,hp5) and
      SkipAligns(hp5, hp5) and
      (hp5.typ=ait_label) and
      (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) then
      begin
        { If the "1" is assigned first, the SETcc condition is the inverse
          of the jump condition }
        if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
          taicpu(p).condition := inverse_cond(taicpu(p).condition);
        tai_label(hp3).labsym.DecRefs;
        { If this isn't the only reference to the middle label, we can
          still make a saving - only that the first jump and everything
          that follows will remain. }
        if (tai_label(hp3).labsym.getrefs = 0) then
          begin
            if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
              DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c)',p)
            else
              DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c)',p);
            { remove jump, first label and second MOV (also catching any aligns) }
            repeat
              if not GetNextInstruction(hp2, hp3) then
                InternalError(2021040810);
              RemoveInstruction(hp2);
              hp2 := hp3;
            until hp2 = hp5;
            { Don't decrement reference count before the removal loop
              above, otherwise GetNextInstruction won't stop on the
              the label }
            tai_label(hp5).labsym.DecRefs;
          end
        else
          begin
            if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
              DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c) (partial)',p)
            else
              DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c) (partial)',p);
          end;
        { Rewrite the conditional jump itself into the SETcc }
        taicpu(p).opcode:=A_SETcc;
        taicpu(p).opsize:=S_B;
        taicpu(p).is_jmp:=False;
        if taicpu(hp1).opsize=S_B then
          begin
            { Byte-sized destination: SETcc can target it directly and the
              first MOV becomes redundant }
            taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
            if taicpu(hp1).oper[1]^.typ = top_reg then
              AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp2, UsedRegs);
            RemoveInstruction(hp1);
          end
        else
          begin
            { Will be a register because the size can't be S_B otherwise }
            ThisReg := newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBL);
            taicpu(p).loadreg(0, ThisReg);
            AllocRegBetween(ThisReg, p, hp2, UsedRegs);
            if (cs_opt_size in current_settings.optimizerswitches) and IsMOVZXAcceptable then
              begin
                { Optimising for size: widen via MOVZX from the 8-bit SETcc result }
                case taicpu(hp1).opsize of
                  S_W:
                    taicpu(hp1).opsize := S_BW;
                  S_L:
                    taicpu(hp1).opsize := S_BL;
                  {$ifdef x86_64}
                  S_Q:
                    begin
                      taicpu(hp1).opsize := S_BL;
                      { Change the destination register to 32-bit }
                      taicpu(hp1).loadreg(1, newreg(R_INTREGISTER,getsupreg(ThisReg), R_SUBD));
                    end;
                  {$endif x86_64}
                  else
                    InternalError(2021040820);
                end;
                taicpu(hp1).opcode := A_MOVZX;
                taicpu(hp1).loadreg(0, ThisReg);
              end
            else
              begin
                { Optimising for speed: keep "mov 0,reg" (it must not become
                  XOR, which would clobber the flags the SETcc reads) and
                  place it before the SETcc }
                AllocRegBetween(NR_FLAGS,p,hp1,UsedRegs);
                { hp1 is already a MOV instruction with the correct register }
                taicpu(hp1).loadconst(0, 0);
                { Inserting it right before p will guarantee that the flags are also tracked }
                asml.Remove(hp1);
                asml.InsertBefore(hp1, p);
              end;
          end;
        Result:=true;
        exit;
      end
  end;
  5817. function TX86AsmOptimizer.OptPass1VMOVDQ(var p: tai): Boolean;
  5818. var
  5819. hp1, hp2, hp3: tai;
  5820. SourceRef, TargetRef: TReference;
  5821. CurrentReg: TRegister;
  5822. begin
  5823. { VMOVDQU/CMOVDQA shouldn't have even been generated }
  5824. if not UseAVX then
  5825. InternalError(2021100501);
  5826. Result := False;
  5827. { Look for the following to simplify:
  5828. vmovdqa/u x(mem1), %xmmreg
  5829. vmovdqa/u %xmmreg, y(mem2)
  5830. vmovdqa/u x+16(mem1), %xmmreg
  5831. vmovdqa/u %xmmreg, y+16(mem2)
  5832. Change to:
  5833. vmovdqa/u x(mem1), %ymmreg
  5834. vmovdqa/u %ymmreg, y(mem2)
  5835. vpxor %ymmreg, %ymmreg, %ymmreg
  5836. ( The VPXOR instruction is to zero the upper half, thus removing the
  5837. need to call the potentially expensive VZEROUPPER instruction. Other
  5838. peephole optimisations can remove VPXOR if it's unnecessary )
  5839. }
  5840. TransferUsedRegs(TmpUsedRegs);
  5841. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  5842. { NOTE: In the optimisations below, if the references dictate that an
  5843. aligned move is possible (i.e. VMOVDQA), the existing instructions
  5844. should already be VMOVDQA because if (x mod 32) = 0, then (x mod 16) = 0 }
  5845. if (taicpu(p).opsize = S_XMM) and
  5846. MatchOpType(taicpu(p), top_ref, top_reg) and
  5847. GetNextInstruction(p, hp1) and
  5848. MatchInstruction(hp1, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
  5849. MatchOpType(taicpu(hp1), top_reg, top_ref) and
  5850. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
  5851. begin
  5852. SourceRef := taicpu(p).oper[0]^.ref^;
  5853. TargetRef := taicpu(hp1).oper[1]^.ref^;
  5854. if GetNextInstruction(hp1, hp2) and
  5855. MatchInstruction(hp2, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
  5856. MatchOpType(taicpu(hp2), top_ref, top_reg) then
  5857. begin
  5858. { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
  5859. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  5860. Inc(SourceRef.offset, 16);
  5861. { Reuse the register in the first block move }
  5862. CurrentReg := newreg(R_MMREGISTER, getsupreg(taicpu(p).oper[1]^.reg), R_SUBMMY);
  5863. if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
  5864. begin
  5865. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  5866. Inc(TargetRef.offset, 16);
  5867. if GetNextInstruction(hp2, hp3) and
  5868. MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
  5869. MatchOpType(taicpu(hp3), top_reg, top_ref) and
  5870. (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
  5871. RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
  5872. not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
  5873. begin
  5874. { Update the register tracking to the new size }
  5875. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  5876. { Remember that the offsets are 16 ahead }
  5877. { Switch to unaligned if the memory isn't on a 32-byte boundary }
  5878. if not (
  5879. ((SourceRef.offset mod 32) = 16) and
  5880. (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
  5881. ) then
  5882. taicpu(p).opcode := A_VMOVDQU;
  5883. taicpu(p).opsize := S_YMM;
  5884. taicpu(p).oper[1]^.reg := CurrentReg;
  5885. if not (
  5886. ((TargetRef.offset mod 32) = 16) and
  5887. (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
  5888. ) then
  5889. taicpu(hp1).opcode := A_VMOVDQU;
  5890. taicpu(hp1).opsize := S_YMM;
  5891. taicpu(hp1).oper[0]^.reg := CurrentReg;
  5892. DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p);
  5893. { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
  5894. if (pi_uses_ymm in current_procinfo.flags) then
  5895. RemoveInstruction(hp2)
  5896. else
  5897. begin
  5898. taicpu(hp2).opcode := A_VPXOR;
  5899. taicpu(hp2).opsize := S_YMM;
  5900. taicpu(hp2).loadreg(0, CurrentReg);
  5901. taicpu(hp2).loadreg(1, CurrentReg);
  5902. taicpu(hp2).loadreg(2, CurrentReg);
  5903. taicpu(hp2).ops := 3;
  5904. end;
  5905. RemoveInstruction(hp3);
  5906. Result := True;
  5907. Exit;
  5908. end;
  5909. end
  5910. else
  5911. begin
  5912. { See if the next references are 16 less rather than 16 greater }
  5913. Dec(SourceRef.offset, 32); { -16 the other way }
  5914. if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
  5915. begin
  5916. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  5917. Dec(TargetRef.offset, 16); { Only 16, not 32, as it wasn't incremented unlike SourceRef }
  5918. if GetNextInstruction(hp2, hp3) and
  5919. MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
  5920. MatchOpType(taicpu(hp3), top_reg, top_ref) and
  5921. (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
  5922. RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
  5923. not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
  5924. begin
  5925. { Update the register tracking to the new size }
  5926. AllocRegBetween(CurrentReg, hp2, hp3, UsedRegs);
  5927. { hp2 and hp3 are the starting offsets, so mod = 0 this time }
  5928. { Switch to unaligned if the memory isn't on a 32-byte boundary }
  5929. if not(
  5930. ((SourceRef.offset mod 32) = 0) and
  5931. (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
  5932. ) then
  5933. taicpu(hp2).opcode := A_VMOVDQU;
  5934. taicpu(hp2).opsize := S_YMM;
  5935. taicpu(hp2).oper[1]^.reg := CurrentReg;
  5936. if not (
  5937. ((TargetRef.offset mod 32) = 0) and
  5938. (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
  5939. ) then
  5940. taicpu(hp3).opcode := A_VMOVDQU;
  5941. taicpu(hp3).opsize := S_YMM;
  5942. taicpu(hp3).oper[0]^.reg := CurrentReg;
  5943. DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p);
  5944. { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
  5945. if (pi_uses_ymm in current_procinfo.flags) then
  5946. RemoveInstruction(hp1)
  5947. else
  5948. begin
  5949. taicpu(hp1).opcode := A_VPXOR;
  5950. taicpu(hp1).opsize := S_YMM;
  5951. taicpu(hp1).loadreg(0, CurrentReg);
  5952. taicpu(hp1).loadreg(1, CurrentReg);
  5953. taicpu(hp1).loadreg(2, CurrentReg);
  5954. taicpu(hp1).ops := 3;
  5955. Asml.Remove(hp1);
  5956. Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
  5957. end;
  5958. RemoveCurrentP(p, hp2);
  5959. Result := True;
  5960. Exit;
  5961. end;
  5962. end;
  5963. end;
  5964. end;
  5965. end;
  5966. end;
function TX86AsmOptimizer.CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
  var
    hp2, hp3, first_assignment: tai;
    IncCount, OperIdx: Integer;
    OrigLabel: TAsmLabel;
  begin
    Count := 0;
    Result := False;
    first_assignment := nil;
    if (LoopCount >= 20) then
      begin
        { Guard against infinite loops (this routine recurses via A_JMP below) }
        Exit;
      end;
    { Only handle a direct, unadorned jump to a known assembler label }
    if (taicpu(p).oper[0]^.typ <> top_ref) or
      (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) or
      (taicpu(p).oper[0]^.ref^.base <> NR_NO) or
      (taicpu(p).oper[0]^.ref^.index <> NR_NO) or
      not (taicpu(p).oper[0]^.ref^.symbol is TAsmLabel) then
      Exit;
    OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
    {
      change
             jmp .L1
             ...
         .L1:
             mov ##, ## ( multiple movs possible )
             jmp/ret
      into
             mov ##, ##
             jmp/ret
    }
    { hp1 = nil means "locate the jump target from the label" (the recursive
      calls below pass nil) }
    if not Assigned(hp1) then
      begin
        hp1 := GetLabelWithSym(OrigLabel);
        if not Assigned(hp1) or not SkipLabels(hp1, hp1) then
          Exit;
      end;
    hp2 := hp1;
    { First pass: scan the jump target and count the MOV-family instructions
      that could be duplicated; Result is set once a terminating JMP or RET
      is found }
    while Assigned(hp2) do
      begin
        if Assigned(hp2) and (hp2.typ in [ait_label, ait_align]) then
          SkipLabels(hp2,hp2);
        if not Assigned(hp2) or (hp2.typ <> ait_instruction) then
          Break;
        case taicpu(hp2).opcode of
          A_MOVSS:
            begin
              if taicpu(hp2).ops = 0 then
                { Wrong MOVSS - the zero-operand form, not the SSE scalar
                  move (presumably the string operation) - abandon }
                Break;
              Inc(Count);
              if Count >= 5 then
                { Too many to be worthwhile }
                Break;
              GetNextInstruction(hp2, hp2);
              Continue;
            end;
          { All the plain data-move opcodes that are safe to duplicate }
          A_MOV,
          A_MOVD,
          A_MOVQ,
          A_MOVSX,
          {$ifdef x86_64}
          A_MOVSXD,
          {$endif x86_64}
          A_MOVZX,
          A_MOVAPS,
          A_MOVUPS,
          A_MOVSD,
          A_MOVAPD,
          A_MOVUPD,
          A_MOVDQA,
          A_MOVDQU,
          A_VMOVSS,
          A_VMOVAPS,
          A_VMOVUPS,
          A_VMOVSD,
          A_VMOVAPD,
          A_VMOVUPD,
          A_VMOVDQA,
          A_VMOVDQU:
            begin
              Inc(Count);
              if Count >= 5 then
                { Too many to be worthwhile }
                Break;
              GetNextInstruction(hp2, hp2);
              Continue;
            end;
          A_JMP:
            begin
              { Guard against infinite loops }
              if taicpu(hp2).oper[0]^.ref^.symbol = OrigLabel then
                Exit;
              { Analyse this jump first in case it also duplicates assignments }
              if CheckJumpMovTransferOpt(hp2, nil, LoopCount + 1, IncCount) then
                begin
                  { Something did change! }
                  Result := True;
                  Inc(Count, IncCount);
                  if Count >= 5 then
                    begin
                      { Too many to be worthwhile }
                      Exit;
                    end;
                  if MatchInstruction(hp2, [A_JMP, A_RET], []) then
                    Break;
                end;
              Result := True;
              Break;
            end;
          A_RET:
            begin
              Result := True;
              Break;
            end;
          else
            Break;
        end;
      end;
    if Result then
      begin
        { A count of zero can happen when CheckJumpMovTransferOpt is called recursively }
        if Count = 0 then
          begin
            Result := False;
            Exit;
          end;
        hp3 := p;
        DebugMsg(SPeepholeOptimization + 'Duplicated ' + debug_tostr(Count) + ' assignment(s) and redirected jump', p);
        { Second pass: copy each counted instruction in front of p, then
          redirect p to the final JMP target or convert it into a RET }
        while True do
          begin
            if Assigned(hp1) and (hp1.typ in [ait_label, ait_align]) then
              SkipLabels(hp1,hp1);
            if (hp1.typ <> ait_instruction) then
              InternalError(2021040720);
            case taicpu(hp1).opcode of
              A_JMP:
                begin
                  { Change the original jump to the new destination }
                  OrigLabel.decrefs;
                  taicpu(hp1).oper[0]^.ref^.symbol.increfs;
                  taicpu(p).loadref(0, taicpu(hp1).oper[0]^.ref^);
                  { Set p to the first duplicated assignment so it can get optimised if needs be }
                  if not Assigned(first_assignment) then
                    InternalError(2021040810)
                  else
                    p := first_assignment;
                  Exit;
                end;
              A_RET:
                begin
                  { Now change the jump into a RET instruction }
                  ConvertJumpToRET(p, hp1);
                  { Set p to the first duplicated assignment so it can get optimised if needs be }
                  if not Assigned(first_assignment) then
                    InternalError(2021040811)
                  else
                    p := first_assignment;
                  Exit;
                end;
              else
                begin
                  { Duplicate the MOV instruction }
                  hp3:=tai(hp1.getcopy);
                  if first_assignment = nil then
                    first_assignment := hp3;
                  asml.InsertBefore(hp3, p);
                  { Make sure the compiler knows about any final registers written here }
                  for OperIdx := 0 to taicpu(hp3).ops - 1 do
                    with taicpu(hp3).oper[OperIdx]^ do
                      begin
                        case typ of
                          top_ref:
                            begin
                              { Stack/frame pointers (and RIP) are always
                                allocated, so they need no tracking }
                              if (ref^.base <> NR_NO) and
                                (getsupreg(ref^.base) <> RS_ESP) and
                                (getsupreg(ref^.base) <> RS_EBP)
                                {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64}
                                then
                                AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);
                              if (ref^.index <> NR_NO) and
                                (getsupreg(ref^.index) <> RS_ESP) and
                                (getsupreg(ref^.index) <> RS_EBP)
                                {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} and
                                (ref^.index <> ref^.base) then
                                AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                            end;
                          top_reg:
                            AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                          else
                            ;
                        end;
                      end;
                end;
            end;
            if not GetNextInstruction(hp1, hp1) then
              { Should have dropped out earlier }
              InternalError(2021040710);
          end;
      end;
  end;
function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
  var
    hp2: tai;
    X: Integer;
  const
    { Change-sets that indicate a write (or read-modify-write) to operand
      1..4 respectively }
    WriteOp: array[0..3] of set of TInsChange = (
      [Ch_Wop1, Ch_RWop1, Ch_Mop1],
      [Ch_Wop2, Ch_RWop2, Ch_Mop2],
      [Ch_Wop3, Ch_RWop3, Ch_Mop3],
      [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
    { Change-sets that indicate an implicit write to a specific register }
    RegWriteFlags: array[0..7] of set of TInsChange = (
      { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
      [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
      [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
      [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
      [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
      [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
      [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
      [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
      [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
  begin
    { If we have something like:
        cmp ###,%reg1
        mov 0,%reg2

      And no modified registers are shared, move the instruction to before
      the comparison as this means it can be optimised without worrying
      about the FLAGS register. (CMP/MOV is generated by
      "J(c)Mov1JmpMov0 -> Set(~c)", among other things).

      As long as the second instruction doesn't use the flags or one of the
      registers used by CMP or TEST (also check any references that use the
      registers), then it can be moved prior to the comparison.
    }
    Result := False;
    { hp1 must be a non-jump instruction that does not touch the flags }
    if (hp1.typ <> ait_instruction) or
      taicpu(hp1).is_jmp or
      RegInInstruction(NR_DEFAULTFLAGS, hp1) then
      Exit;
    { NOP is a pipeline fence, likely marking the beginning of the function
      epilogue, so drop out. Similarly, drop out if POP or RET are
      encountered }
    if MatchInstruction(hp1, A_NOP, A_POP, []) then
      Exit;
    if (taicpu(hp1).opcode = A_MOVSS) and
      (taicpu(hp1).ops = 0) then
      { Wrong MOVSS - the zero-operand form, not the SSE scalar move }
      Exit;
    { Check for writes to specific registers first }
    { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
    for X := 0 to 7 do
      if (RegWriteFlags[X] * InsProp[taicpu(hp1).opcode].Ch <> [])
        and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), p) then
        Exit;
    for X := 0 to taicpu(hp1).ops - 1 do
      begin
        { Check to see if this operand writes to something }
        if ((WriteOp[X] * InsProp[taicpu(hp1).opcode].Ch) <> []) and
          { And matches something in the CMP/TEST instruction }
          (
            MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[0]^) or
            MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[1]^) or
            (
              { If it's a register, make sure the register written to doesn't
                appear in the cmp instruction as part of a reference }
              (taicpu(hp1).oper[X]^.typ = top_reg) and
              RegInInstruction(taicpu(hp1).oper[X]^.reg, p)
            )
          ) then
          Exit;
      end;
    { The instruction can be safely moved }
    asml.Remove(hp1);
    { Try to insert after the last instructions where the FLAGS register is not yet in use }
    if not GetLastInstruction(p, hp2) then
      asml.InsertBefore(hp1, p)
    else
      asml.InsertAfter(hp1, hp2);
    DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
    { Extend register allocations over the new position of hp1 }
    for X := 0 to taicpu(hp1).ops - 1 do
      case taicpu(hp1).oper[X]^.typ of
        top_reg:
          AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
        top_ref:
          begin
            if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
              AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
            if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
              AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
          end;
        else
          ;
      end;
    if taicpu(hp1).opcode = A_LEA then
      { The flags will be overwritten by the CMP/TEST instruction }
      ConvertLEA(taicpu(hp1));
    Result := True;
  end;
  6265. function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;
  6266. function IsXCHGAcceptable: Boolean; inline;
  6267. begin
  6268. { Always accept if optimising for size }
  6269. Result := (cs_opt_size in current_settings.optimizerswitches) or
  6270. (
  6271. {$ifdef x86_64}
  6272. { XCHG takes 3 cycles on AMD Athlon64 }
  6273. (current_settings.optimizecputype >= cpu_core_i)
  6274. {$else x86_64}
  6275. { From the Pentium M onwards, XCHG only has a latency of 2 rather
  6276. than 3, so it becomes a saving compared to three MOVs with two of
  6277. them able to execute simultaneously. [Kit] }
  6278. (current_settings.optimizecputype >= cpu_PentiumM)
  6279. {$endif x86_64}
  6280. );
  6281. end;
  6282. var
  6283. NewRef: TReference;
  6284. hp1, hp2, hp3, hp4: Tai;
  6285. {$ifndef x86_64}
  6286. OperIdx: Integer;
  6287. {$endif x86_64}
  6288. NewInstr : Taicpu;
  6289. NewAligh : Tai_align;
  6290. DestLabel: TAsmLabel;
  6291. function TryMovArith2Lea(InputInstr: tai): Boolean;
  6292. var
  6293. NextInstr: tai;
  6294. begin
  6295. Result := False;
  6296. UpdateUsedRegs(TmpUsedRegs, tai(InputInstr.Next));
  6297. if not GetNextInstruction(InputInstr, NextInstr) or
  6298. (
  6299. { The FLAGS register isn't always tracked properly, so do not
  6300. perform this optimisation if a conditional statement follows }
  6301. not RegReadByInstruction(NR_DEFAULTFLAGS, NextInstr) and
  6302. not RegUsedAfterInstruction(NR_DEFAULTFLAGS, NextInstr, TmpUsedRegs)
  6303. ) then
  6304. begin
  6305. reference_reset(NewRef, 1, []);
  6306. NewRef.base := taicpu(p).oper[0]^.reg;
  6307. NewRef.scalefactor := 1;
  6308. if taicpu(InputInstr).opcode = A_ADD then
  6309. begin
  6310. DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
  6311. NewRef.offset := taicpu(InputInstr).oper[0]^.val;
  6312. end
  6313. else
  6314. begin
  6315. DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
  6316. NewRef.offset := -taicpu(InputInstr).oper[0]^.val;
  6317. end;
  6318. taicpu(p).opcode := A_LEA;
  6319. taicpu(p).loadref(0, NewRef);
  6320. RemoveInstruction(InputInstr);
  6321. Result := True;
  6322. end;
  6323. end;
  6324. begin
  6325. Result:=false;
  6326. { This optimisation adds an instruction, so only do it for speed }
  6327. if not (cs_opt_size in current_settings.optimizerswitches) and
  6328. MatchOpType(taicpu(p), top_const, top_reg) and
  6329. (taicpu(p).oper[0]^.val = 0) then
  6330. begin
  6331. { To avoid compiler warning }
  6332. DestLabel := nil;
  6333. if (p.typ <> ait_instruction) or (taicpu(p).oper[1]^.typ <> top_reg) then
  6334. InternalError(2021040750);
  6335. if not GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) then
  6336. Exit;
  6337. case hp1.typ of
  6338. ait_label:
  6339. begin
  6340. { Change:
  6341. mov $0,%reg mov $0,%reg
  6342. @Lbl1: @Lbl1:
  6343. test %reg,%reg / cmp $0,%reg test %reg,%reg / mov $0,%reg
  6344. je @Lbl2 jne @Lbl2
  6345. To: To:
  6346. mov $0,%reg mov $0,%reg
  6347. jmp @Lbl2 jmp @Lbl3
  6348. (align) (align)
  6349. @Lbl1: @Lbl1:
  6350. test %reg,%reg / cmp $0,%reg test %reg,%reg / cmp $0,%reg
  6351. je @Lbl2 je @Lbl2
  6352. @Lbl3: <-- Only if label exists
  6353. (Not if it's optimised for size)
  6354. }
  6355. if not GetNextInstruction(hp1, hp2) then
  6356. Exit;
  6357. if not (cs_opt_size in current_settings.optimizerswitches) and
  6358. (hp2.typ = ait_instruction) and
  6359. (
  6360. { Register sizes must exactly match }
  6361. (
  6362. (taicpu(hp2).opcode = A_CMP) and
  6363. MatchOperand(taicpu(hp2).oper[0]^, 0) and
  6364. MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
  6365. ) or (
  6366. (taicpu(hp2).opcode = A_TEST) and
  6367. MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
  6368. MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
  6369. )
  6370. ) and GetNextInstruction(hp2, hp3) and
  6371. (hp3.typ = ait_instruction) and
  6372. (taicpu(hp3).opcode = A_JCC) and
  6373. (taicpu(hp3).oper[0]^.typ=top_ref) and (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and (taicpu(hp3).oper[0]^.ref^.base=NR_NO) and
  6374. (taicpu(hp3).oper[0]^.ref^.index=NR_NO) and (taicpu(hp3).oper[0]^.ref^.symbol is tasmlabel) then
  6375. begin
  6376. { Check condition of jump }
  6377. { Always true? }
  6378. if condition_in(C_E, taicpu(hp3).condition) then
  6379. begin
  6380. { Copy label symbol and obtain matching label entry for the
  6381. conditional jump, as this will be our destination}
  6382. DestLabel := tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol);
  6383. DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Je -> Mov0JmpLblCmp0Je', p);
  6384. Result := True;
  6385. end
  6386. { Always false? }
  6387. else if condition_in(C_NE, taicpu(hp3).condition) and GetNextInstruction(hp3, hp2) then
  6388. begin
  6389. { This is only worth it if there's a jump to take }
  6390. case hp2.typ of
  6391. ait_instruction:
  6392. begin
  6393. if taicpu(hp2).opcode = A_JMP then
  6394. begin
  6395. DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
  6396. { An unconditional jump follows the conditional jump which will always be false,
  6397. so use this jump's destination for the new jump }
  6398. DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with JMP)', p);
  6399. Result := True;
  6400. end
  6401. else if taicpu(hp2).opcode = A_JCC then
  6402. begin
  6403. DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
  6404. if condition_in(C_E, taicpu(hp2).condition) then
  6405. begin
  6406. { A second conditional jump follows the conditional jump which will always be false,
  6407. while the second jump is always True, so use this jump's destination for the new jump }
  6408. DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with second Jcc)', p);
  6409. Result := True;
  6410. end;
  6411. { Don't risk it if the jump isn't always true (Result remains False) }
  6412. end;
  6413. end;
  6414. else
  6415. { If anything else don't optimise };
  6416. end;
  6417. end;
  6418. if Result then
  6419. begin
  6420. { Just so we have something to insert as a paremeter}
  6421. reference_reset(NewRef, 1, []);
  6422. NewInstr := taicpu.op_ref(A_JMP, S_NO, NewRef);
  6423. { Now actually load the correct parameter }
  6424. NewInstr.loadsymbol(0, DestLabel, 0);
  6425. { Get instruction before original label (may not be p under -O3) }
  6426. if not GetLastInstruction(hp1, hp2) then
  6427. { Shouldn't fail here }
  6428. InternalError(2021040701);
  6429. DestLabel.increfs;
  6430. AsmL.InsertAfter(NewInstr, hp2);
  6431. { Add new alignment field }
  6432. (* AsmL.InsertAfter(
  6433. cai_align.create_max(
  6434. current_settings.alignment.jumpalign,
  6435. current_settings.alignment.jumpalignskipmax
  6436. ),
  6437. NewInstr
  6438. ); *)
  6439. end;
  6440. Exit;
  6441. end;
  6442. end;
  6443. else
  6444. ;
  6445. end;
  6446. end;
  6447. if not GetNextInstruction(p, hp1) then
  6448. Exit;
  6449. if MatchInstruction(hp1, A_JMP, [S_NO]) then
  6450. begin
  6451. { Sometimes the MOVs that OptPass2JMP produces can be improved
  6452. further, but we can't just put this jump optimisation in pass 1
  6453. because it tends to perform worse when conditional jumps are
  6454. nearby (e.g. when converting CMOV instructions). [Kit] }
  6455. if OptPass2JMP(hp1) then
  6456. { call OptPass1MOV once to potentially merge any MOVs that were created }
  6457. Result := OptPass1MOV(p)
  6458. { OptPass2MOV will now exit but will be called again if OptPass1MOV
  6459. returned True and the instruction is still a MOV, thus checking
  6460. the optimisations below }
  6461. { If OptPass2JMP returned False, no optimisations were done to
  6462. the jump and there are no further optimisations that can be done
  6463. to the MOV instruction on this pass }
  6464. end
  6465. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  6466. (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
  6467. MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
  6468. (taicpu(hp1).oper[1]^.typ = top_reg) and
  6469. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  6470. begin
  6471. { Change:
  6472. movl/q %reg1,%reg2 movl/q %reg1,%reg2
  6473. addl/q $x,%reg2 subl/q $x,%reg2
  6474. To:
  6475. leal/q x(%reg1),%reg2 leal/q -x(%reg1),%reg2
  6476. }
  6477. if (taicpu(hp1).oper[0]^.typ = top_const) and
  6478. { be lazy, checking separately for sub would be slightly better }
  6479. (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
  6480. begin
  6481. TransferUsedRegs(TmpUsedRegs);
  6482. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  6483. if TryMovArith2Lea(hp1) then
  6484. begin
  6485. Result := True;
  6486. Exit;
  6487. end
  6488. end
  6489. else if not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
  6490. GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
  6491. { Same as above, but also adds or subtracts to %reg2 in between.
  6492. It's still valid as long as the flags aren't in use }
  6493. MatchInstruction(hp2,A_ADD,A_SUB,[taicpu(p).opsize]) and
  6494. MatchOpType(taicpu(hp2), top_const, top_reg) and
  6495. (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
  6496. { be lazy, checking separately for sub would be slightly better }
  6497. (abs(taicpu(hp2).oper[0]^.val)<=$7fffffff) then
  6498. begin
  6499. TransferUsedRegs(TmpUsedRegs);
  6500. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  6501. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  6502. if TryMovArith2Lea(hp2) then
  6503. begin
  6504. Result := True;
  6505. Exit;
  6506. end;
  6507. end;
  6508. end
  6509. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  6510. {$ifdef x86_64}
  6511. MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
  6512. {$else x86_64}
  6513. MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
  6514. {$endif x86_64}
  6515. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  6516. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
  6517. { mov reg1, reg2 mov reg1, reg2
  6518. movzx/sx reg2, reg3 to movzx/sx reg1, reg3}
  6519. begin
  6520. taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
  6521. DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
  6522. { Don't remove the MOV command without first checking that reg2 isn't used afterwards,
or unless supreg(reg3) = supreg(reg2). [Kit] }
  6524. TransferUsedRegs(TmpUsedRegs);
  6525. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  6526. if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
  6527. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
  6528. then
  6529. begin
  6530. RemoveCurrentP(p, hp1);
  6531. Result:=true;
  6532. end;
  6533. exit;
  6534. end
  6535. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  6536. IsXCHGAcceptable and
  6537. { XCHG doesn't support 8-byte registers }
  6538. (taicpu(p).opsize <> S_B) and
  6539. MatchInstruction(hp1, A_MOV, []) and
  6540. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  6541. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
  6542. GetNextInstruction(hp1, hp2) and
  6543. MatchInstruction(hp2, A_MOV, []) and
  6544. { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
  6545. MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
  6546. MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
  6547. begin
  6548. { mov %reg1,%reg2
  6549. mov %reg3,%reg1 -> xchg %reg3,%reg1
  6550. mov %reg2,%reg3
  6551. (%reg2 not used afterwards)
  6552. Note that xchg takes 3 cycles to execute, and generally mov's take
  6553. only one cycle apiece, but the first two mov's can be executed in
  6554. parallel, only taking 2 cycles overall. Older processors should
  6555. therefore only optimise for size. [Kit]
  6556. }
  6557. TransferUsedRegs(TmpUsedRegs);
  6558. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  6559. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  6560. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
  6561. begin
  6562. DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
  6563. AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
  6564. taicpu(hp1).opcode := A_XCHG;
  6565. RemoveCurrentP(p, hp1);
  6566. RemoveInstruction(hp2);
  6567. Result := True;
  6568. Exit;
  6569. end;
  6570. end
  6571. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  6572. MatchInstruction(hp1, A_SAR, []) then
  6573. begin
  6574. if MatchOperand(taicpu(hp1).oper[0]^, 31) then
  6575. begin
  6576. { the use of %edx also covers the opsize being S_L }
  6577. if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
  6578. begin
  6579. { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
  6580. if (taicpu(p).oper[0]^.reg = NR_EAX) and
  6581. (taicpu(p).oper[1]^.reg = NR_EDX) then
  6582. begin
  6583. { Change:
  6584. movl %eax,%edx
  6585. sarl $31,%edx
  6586. To:
  6587. cltd
  6588. }
  6589. DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
  6590. RemoveInstruction(hp1);
  6591. taicpu(p).opcode := A_CDQ;
  6592. taicpu(p).opsize := S_NO;
  6593. taicpu(p).clearop(1);
  6594. taicpu(p).clearop(0);
  6595. taicpu(p).ops:=0;
  6596. Result := True;
  6597. end
  6598. else if (cs_opt_size in current_settings.optimizerswitches) and
  6599. (taicpu(p).oper[0]^.reg = NR_EDX) and
  6600. (taicpu(p).oper[1]^.reg = NR_EAX) then
  6601. begin
  6602. { Change:
  6603. movl %edx,%eax
  6604. sarl $31,%edx
  6605. To:
  6606. movl %edx,%eax
  6607. cltd
  6608. Note that this creates a dependency between the two instructions,
  6609. so only perform if optimising for size.
  6610. }
  6611. DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
  6612. taicpu(hp1).opcode := A_CDQ;
  6613. taicpu(hp1).opsize := S_NO;
  6614. taicpu(hp1).clearop(1);
  6615. taicpu(hp1).clearop(0);
  6616. taicpu(hp1).ops:=0;
  6617. end;
  6618. {$ifndef x86_64}
  6619. end
  6620. { Don't bother if CMOV is supported, because a more optimal
  6621. sequence would have been generated for the Abs() intrinsic }
  6622. else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
  6623. { the use of %eax also covers the opsize being S_L }
  6624. MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
  6625. (taicpu(p).oper[0]^.reg = NR_EAX) and
  6626. (taicpu(p).oper[1]^.reg = NR_EDX) and
  6627. GetNextInstruction(hp1, hp2) and
  6628. MatchInstruction(hp2, A_XOR, [S_L]) and
  6629. MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
  6630. MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
  6631. GetNextInstruction(hp2, hp3) and
  6632. MatchInstruction(hp3, A_SUB, [S_L]) and
  6633. MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
  6634. MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
  6635. begin
  6636. { Change:
  6637. movl %eax,%edx
  6638. sarl $31,%eax
  6639. xorl %eax,%edx
  6640. subl %eax,%edx
  6641. (Instruction that uses %edx)
  6642. (%eax deallocated)
  6643. (%edx deallocated)
  6644. To:
  6645. cltd
  6646. xorl %edx,%eax <-- Note the registers have swapped
  6647. subl %edx,%eax
  6648. (Instruction that uses %eax) <-- %eax rather than %edx
  6649. }
  6650. TransferUsedRegs(TmpUsedRegs);
  6651. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  6652. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  6653. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  6654. if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
  6655. begin
  6656. if GetNextInstruction(hp3, hp4) and
  6657. not RegModifiedByInstruction(NR_EDX, hp4) and
  6658. not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
  6659. begin
  6660. DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
  6661. taicpu(p).opcode := A_CDQ;
  6662. taicpu(p).clearop(1);
  6663. taicpu(p).clearop(0);
  6664. taicpu(p).ops:=0;
  6665. RemoveInstruction(hp1);
  6666. taicpu(hp2).loadreg(0, NR_EDX);
  6667. taicpu(hp2).loadreg(1, NR_EAX);
  6668. taicpu(hp3).loadreg(0, NR_EDX);
  6669. taicpu(hp3).loadreg(1, NR_EAX);
  6670. AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
  6671. { Convert references in the following instruction (hp4) from %edx to %eax }
  6672. for OperIdx := 0 to taicpu(hp4).ops - 1 do
  6673. with taicpu(hp4).oper[OperIdx]^ do
  6674. case typ of
  6675. top_reg:
  6676. if getsupreg(reg) = RS_EDX then
  6677. reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  6678. top_ref:
  6679. begin
  6680. if getsupreg(reg) = RS_EDX then
  6681. ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  6682. if getsupreg(reg) = RS_EDX then
  6683. ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  6684. end;
  6685. else
  6686. ;
  6687. end;
  6688. end;
  6689. end;
  6690. {$else x86_64}
  6691. end;
  6692. end
  6693. else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
  6694. { the use of %rdx also covers the opsize being S_Q }
  6695. MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
  6696. begin
  6697. { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
  6698. if (taicpu(p).oper[0]^.reg = NR_RAX) and
  6699. (taicpu(p).oper[1]^.reg = NR_RDX) then
  6700. begin
  6701. { Change:
  6702. movq %rax,%rdx
  6703. sarq $63,%rdx
  6704. To:
  6705. cqto
  6706. }
  6707. DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
  6708. RemoveInstruction(hp1);
  6709. taicpu(p).opcode := A_CQO;
  6710. taicpu(p).opsize := S_NO;
  6711. taicpu(p).clearop(1);
  6712. taicpu(p).clearop(0);
  6713. taicpu(p).ops:=0;
  6714. Result := True;
  6715. end
  6716. else if (cs_opt_size in current_settings.optimizerswitches) and
  6717. (taicpu(p).oper[0]^.reg = NR_RDX) and
  6718. (taicpu(p).oper[1]^.reg = NR_RAX) then
  6719. begin
  6720. { Change:
  6721. movq %rdx,%rax
  6722. sarq $63,%rdx
  6723. To:
  6724. movq %rdx,%rax
  6725. cqto
  6726. Note that this creates a dependency between the two instructions,
  6727. so only perform if optimising for size.
  6728. }
  6729. DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
  6730. taicpu(hp1).opcode := A_CQO;
  6731. taicpu(hp1).opsize := S_NO;
  6732. taicpu(hp1).clearop(1);
  6733. taicpu(hp1).clearop(0);
  6734. taicpu(hp1).ops:=0;
  6735. {$endif x86_64}
  6736. end;
  6737. end;
  6738. end
  6739. else if MatchInstruction(hp1, A_MOV, []) and
  6740. (taicpu(hp1).oper[1]^.typ = top_reg) then
  6741. { Though "GetNextInstruction" could be factored out, along with
  6742. the instructions that depend on hp2, it is an expensive call that
  6743. should be delayed for as long as possible, hence we do cheaper
  6744. checks first that are likely to be False. [Kit] }
  6745. begin
  6746. if (
  6747. (
  6748. MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
  6749. (taicpu(hp1).oper[1]^.reg = NR_EAX) and
  6750. (
  6751. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  6752. MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
  6753. )
  6754. ) or
  6755. (
  6756. MatchOperand(taicpu(p).oper[1]^, NR_EAX) and
  6757. (taicpu(hp1).oper[1]^.reg = NR_EDX) and
  6758. (
  6759. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  6760. MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
  6761. )
  6762. )
  6763. ) and
  6764. GetNextInstruction(hp1, hp2) and
  6765. MatchInstruction(hp2, A_SAR, []) and
  6766. MatchOperand(taicpu(hp2).oper[0]^, 31) then
  6767. begin
  6768. if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
  6769. begin
  6770. { Change:
  6771. movl r/m,%edx movl r/m,%eax movl r/m,%edx movl r/m,%eax
  6772. movl %edx,%eax or movl %eax,%edx or movl r/m,%eax or movl r/m,%edx
  6773. sarl $31,%edx sarl $31,%edx sarl $31,%edx sarl $31,%edx
  6774. To:
  6775. movl r/m,%eax <- Note the change in register
  6776. cltd
  6777. }
  6778. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
  6779. AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
  6780. taicpu(p).loadreg(1, NR_EAX);
  6781. taicpu(hp1).opcode := A_CDQ;
  6782. taicpu(hp1).clearop(1);
  6783. taicpu(hp1).clearop(0);
  6784. taicpu(hp1).ops:=0;
  6785. RemoveInstruction(hp2);
  6786. (*
  6787. {$ifdef x86_64}
  6788. end
  6789. else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
  6790. { This code sequence does not get generated - however it might become useful
  6791. if and when 128-bit signed integer types make an appearance, so the code
  6792. is kept here for when it is eventually needed. [Kit] }
  6793. (
  6794. (
  6795. (taicpu(hp1).oper[1]^.reg = NR_RAX) and
  6796. (
  6797. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  6798. MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
  6799. )
  6800. ) or
  6801. (
  6802. (taicpu(hp1).oper[1]^.reg = NR_RDX) and
  6803. (
  6804. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  6805. MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
  6806. )
  6807. )
  6808. ) and
  6809. GetNextInstruction(hp1, hp2) and
  6810. MatchInstruction(hp2, A_SAR, [S_Q]) and
  6811. MatchOperand(taicpu(hp2).oper[0]^, 63) and
  6812. MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
  6813. begin
  6814. { Change:
  6815. movq r/m,%rdx movq r/m,%rax movq r/m,%rdx movq r/m,%rax
  6816. movq %rdx,%rax or movq %rax,%rdx or movq r/m,%rax or movq r/m,%rdx
  6817. sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx
  6818. To:
  6819. movq r/m,%rax <- Note the change in register
  6820. cqto
  6821. }
  6822. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
  6823. AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
  6824. taicpu(p).loadreg(1, NR_RAX);
  6825. taicpu(hp1).opcode := A_CQO;
  6826. taicpu(hp1).clearop(1);
  6827. taicpu(hp1).clearop(0);
  6828. taicpu(hp1).ops:=0;
  6829. RemoveInstruction(hp2);
  6830. {$endif x86_64}
  6831. *)
  6832. end;
  6833. end;
  6834. {$ifdef x86_64}
  6835. end
  6836. else if (taicpu(p).opsize = S_L) and
  6837. (taicpu(p).oper[1]^.typ = top_reg) and
  6838. (
  6839. MatchInstruction(hp1, A_MOV,[]) and
  6840. (taicpu(hp1).opsize = S_L) and
  6841. (taicpu(hp1).oper[1]^.typ = top_reg)
  6842. ) and (
  6843. GetNextInstruction(hp1, hp2) and
  6844. (tai(hp2).typ=ait_instruction) and
  6845. (taicpu(hp2).opsize = S_Q) and
  6846. (
  6847. (
  6848. MatchInstruction(hp2, A_ADD,[]) and
  6849. (taicpu(hp2).opsize = S_Q) and
  6850. (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  6851. (
  6852. (
  6853. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
  6854. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  6855. ) or (
  6856. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  6857. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  6858. )
  6859. )
  6860. ) or (
  6861. MatchInstruction(hp2, A_LEA,[]) and
  6862. (taicpu(hp2).oper[0]^.ref^.offset = 0) and
  6863. (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
  6864. (
  6865. (
  6866. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
  6867. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
  6868. ) or (
  6869. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  6870. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
  6871. )
  6872. ) and (
  6873. (
  6874. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  6875. ) or (
  6876. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  6877. )
  6878. )
  6879. )
  6880. )
  6881. ) and (
  6882. GetNextInstruction(hp2, hp3) and
  6883. MatchInstruction(hp3, A_SHR,[]) and
  6884. (taicpu(hp3).opsize = S_Q) and
  6885. (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  6886. (taicpu(hp3).oper[0]^.val = 1) and
  6887. (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
  6888. ) then
  6889. begin
  6890. { Change movl x, reg1d movl x, reg1d
  6891. movl y, reg2d movl y, reg2d
  6892. addq reg2q,reg1q or leaq (reg1q,reg2q),reg1q
  6893. shrq $1, reg1q shrq $1, reg1q
  6894. ( reg1d and reg2d can be switched around in the first two instructions )
  6895. To movl x, reg1d
  6896. addl y, reg1d
  6897. rcrl $1, reg1d
  6898. This corresponds to the common expression (x + y) shr 1, where
  6899. x and y are Cardinals (replacing "shr 1" with "div 2" produces
  6900. smaller code, but won't account for x + y causing an overflow). [Kit]
  6901. }
  6902. if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
  6903. { Change first MOV command to have the same register as the final output }
  6904. taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
  6905. else
  6906. taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
  6907. { Change second MOV command to an ADD command. This is easier than
  6908. converting the existing command because it means we don't have to
  6909. touch 'y', which might be a complicated reference, and also the
  6910. fact that the third command might either be ADD or LEA. [Kit] }
  6911. taicpu(hp1).opcode := A_ADD;
  6912. { Delete old ADD/LEA instruction }
  6913. RemoveInstruction(hp2);
  6914. { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
  6915. taicpu(hp3).opcode := A_RCR;
  6916. taicpu(hp3).changeopsize(S_L);
  6917. setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
  6918. {$endif x86_64}
  6919. end;
  6920. end;
  6921. function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean;
  6922. var
  6923. ThisReg: TRegister;
  6924. MinSize, MaxSize, TrySmaller, TargetSize: TOpSize;
  6925. TargetSubReg: TSubRegister;
  6926. hp1, hp2: tai;
  6927. RegInUse, RegChanged, p_removed: Boolean;
  6928. { Store list of found instructions so we don't have to call
  6929. GetNextInstructionUsingReg multiple times }
  6930. InstrList: array of taicpu;
  6931. InstrMax, Index: Integer;
  6932. UpperLimit, TrySmallerLimit: TCgInt;
  6933. PreMessage: string;
  6934. { Data flow analysis }
  6935. TestValMin, TestValMax: TCgInt;
  6936. SmallerOverflow: Boolean;
  6937. begin
  6938. Result := False;
  6939. p_removed := False;
  6940. { This is anything but quick! }
  6941. if not(cs_opt_level2 in current_settings.optimizerswitches) then
  6942. Exit;
  6943. SetLength(InstrList, 0);
  6944. InstrMax := -1;
  6945. ThisReg := taicpu(p).oper[1]^.reg;
  6946. case taicpu(p).opsize of
  6947. S_BW, S_BL:
  6948. begin
  6949. {$if defined(i386) or defined(i8086)}
  6950. { If the target size is 8-bit, make sure we can actually encode it }
  6951. if not (GetSupReg(ThisReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
  6952. Exit;
  6953. {$endif i386 or i8086}
  6954. UpperLimit := $FF;
  6955. MinSize := S_B;
  6956. if taicpu(p).opsize = S_BW then
  6957. MaxSize := S_W
  6958. else
  6959. MaxSize := S_L;
  6960. end;
  6961. S_WL:
  6962. begin
  6963. UpperLimit := $FFFF;
  6964. MinSize := S_W;
  6965. MaxSize := S_L;
  6966. end
  6967. else
  6968. InternalError(2020112301);
  6969. end;
  6970. TestValMin := 0;
  6971. TestValMax := UpperLimit;
  6972. TrySmallerLimit := UpperLimit;
  6973. TrySmaller := S_NO;
  6974. SmallerOverflow := False;
  6975. RegChanged := False;
  6976. hp1 := p;
  6977. while GetNextInstructionUsingReg(hp1, hp1, ThisReg) and
  6978. (hp1.typ = ait_instruction) and
  6979. (
  6980. { Under -O1 and -O2, GetNextInstructionUsingReg may return an
  6981. instruction that doesn't actually contain ThisReg }
  6982. (cs_opt_level3 in current_settings.optimizerswitches) or
  6983. RegInInstruction(ThisReg, hp1)
  6984. ) do
  6985. begin
  6986. case taicpu(hp1).opcode of
  6987. A_INC,A_DEC:
  6988. begin
  6989. { Has to be an exact match on the register }
  6990. if not MatchOperand(taicpu(hp1).oper[0]^, ThisReg) then
  6991. Break;
  6992. if taicpu(hp1).opcode = A_INC then
  6993. begin
  6994. Inc(TestValMin);
  6995. Inc(TestValMax);
  6996. end
  6997. else
  6998. begin
  6999. Dec(TestValMin);
  7000. Dec(TestValMax);
  7001. end;
  7002. end;
  7003. A_CMP:
  7004. begin
  7005. if (taicpu(hp1).oper[1]^.typ <> top_reg) or
  7006. { Has to be an exact match on the register }
  7007. (taicpu(hp1).oper[1]^.reg <> ThisReg) or
  7008. (taicpu(hp1).oper[0]^.typ <> top_const) or
  7009. { Make sure the comparison value is not smaller than the
  7010. smallest allowed signed value for the minimum size (e.g.
  7011. -128 for 8-bit) }
  7012. not (
  7013. ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
  7014. { Is it in the negative range? }
  7015. (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
  7016. ) then
  7017. Break;
  7018. TestValMin := TestValMin - taicpu(hp1).oper[0]^.val;
  7019. TestValMax := TestValMax - taicpu(hp1).oper[0]^.val;
  7020. if (TestValMin < TrySmallerLimit) or (TestValMax < TrySmallerLimit) or
  7021. (TestValMin > UpperLimit) or (TestValMax > UpperLimit) then
  7022. { Overflow }
  7023. Break;
  7024. { Check to see if the active register is used afterwards }
  7025. TransferUsedRegs(TmpUsedRegs);
  7026. IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
  7027. if not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
  7028. begin
  7029. case MinSize of
  7030. S_B:
  7031. TargetSubReg := R_SUBL;
  7032. S_W:
  7033. TargetSubReg := R_SUBW;
  7034. else
  7035. InternalError(2021051002);
  7036. end;
  7037. { Update the register to its new size }
  7038. setsubreg(ThisReg, TargetSubReg);
  7039. taicpu(hp1).oper[1]^.reg := ThisReg;
  7040. taicpu(hp1).opsize := MinSize;
  7041. { Convert the input MOVZX to a MOV }
  7042. if (taicpu(p).oper[0]^.typ = top_reg) and
  7043. SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
  7044. begin
  7045. { Or remove it completely! }
  7046. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1a', p);
  7047. RemoveCurrentP(p);
  7048. p_removed := True;
  7049. end
  7050. else
  7051. begin
  7052. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1a', p);
  7053. taicpu(p).opcode := A_MOV;
  7054. taicpu(p).oper[1]^.reg := ThisReg;
  7055. taicpu(p).opsize := MinSize;
  7056. end;
  7057. if (InstrMax >= 0) then
  7058. begin
  7059. for Index := 0 to InstrMax do
  7060. begin
  7061. { If p_removed is true, then the original MOV/Z was removed
  7062. and removing the AND instruction may not be safe if it
  7063. appears first }
  7064. if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
  7065. InternalError(2020112311);
  7066. if InstrList[Index].oper[0]^.typ = top_reg then
  7067. InstrList[Index].oper[0]^.reg := ThisReg;
  7068. InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
  7069. InstrList[Index].opsize := MinSize;
  7070. end;
  7071. end;
  7072. Result := True;
  7073. Exit;
  7074. end;
  7075. end;
  7076. { OR and XOR are not included because they can too easily fool
  7077. the data flow analysis (they can cause non-linear behaviour) }
  7078. A_ADD,A_SUB,A_AND,A_SHL,A_SHR:
  7079. begin
  7080. if
  7081. (taicpu(hp1).oper[1]^.typ <> top_reg) or
  7082. { Has to be an exact match on the register }
  7083. (taicpu(hp1).oper[1]^.reg <> ThisReg) or not
  7084. (
  7085. (
  7086. (taicpu(hp1).oper[0]^.typ = top_const) and
  7087. (
  7088. (
  7089. (taicpu(hp1).opcode = A_SHL) and
  7090. (
  7091. ((MinSize = S_B) and (taicpu(hp1).oper[0]^.val < 8)) or
  7092. ((MinSize = S_W) and (taicpu(hp1).oper[0]^.val < 16)) or
  7093. ((MinSize = S_L) and (taicpu(hp1).oper[0]^.val < 32))
  7094. )
  7095. ) or (
  7096. (taicpu(hp1).opcode <> A_SHL) and
  7097. (
  7098. ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
  7099. { Is it in the negative range? }
  7100. (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
  7101. )
  7102. )
  7103. )
  7104. ) or (
  7105. MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) and
  7106. ((taicpu(hp1).opcode = A_ADD) or (taicpu(hp1).opcode = A_AND) or (taicpu(hp1).opcode = A_SUB))
  7107. )
  7108. ) then
  7109. Break;
  7110. case taicpu(hp1).opcode of
  7111. A_ADD:
  7112. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  7113. begin
  7114. TestValMin := TestValMin * 2;
  7115. TestValMax := TestValMax * 2;
  7116. end
  7117. else
  7118. begin
  7119. TestValMin := TestValMin + taicpu(hp1).oper[0]^.val;
  7120. TestValMax := TestValMax + taicpu(hp1).oper[0]^.val;
  7121. end;
  7122. A_SUB:
  7123. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  7124. begin
  7125. TestValMin := 0;
  7126. TestValMax := 0;
  7127. end
  7128. else
  7129. begin
  7130. TestValMin := TestValMin - taicpu(hp1).oper[0]^.val;
  7131. TestValMax := TestValMax - taicpu(hp1).oper[0]^.val;
  7132. end;
  7133. A_AND:
  7134. if (taicpu(hp1).oper[0]^.typ = top_const) then
  7135. begin
  7136. { we might be able to go smaller if AND appears first }
  7137. if InstrMax = -1 then
  7138. case MinSize of
  7139. S_B:
  7140. ;
  7141. S_W:
  7142. if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
  7143. ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
  7144. begin
  7145. TrySmaller := S_B;
  7146. TrySmallerLimit := $FF;
  7147. end;
  7148. S_L:
  7149. if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
  7150. ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
  7151. begin
  7152. TrySmaller := S_B;
  7153. TrySmallerLimit := $FF;
  7154. end
  7155. else if ((taicpu(hp1).oper[0]^.val and $FFFF) = taicpu(hp1).oper[0]^.val) or
  7156. ((not(taicpu(hp1).oper[0]^.val) and $7FFF) = (not taicpu(hp1).oper[0]^.val)) then
  7157. begin
  7158. TrySmaller := S_W;
  7159. TrySmallerLimit := $FFFF;
  7160. end;
  7161. else
  7162. InternalError(2020112320);
  7163. end;
  7164. TestValMin := TestValMin and taicpu(hp1).oper[0]^.val;
  7165. TestValMax := TestValMax and taicpu(hp1).oper[0]^.val;
  7166. end;
  7167. A_SHL:
  7168. begin
  7169. TestValMin := TestValMin shl taicpu(hp1).oper[0]^.val;
  7170. TestValMax := TestValMax shl taicpu(hp1).oper[0]^.val;
  7171. end;
  7172. A_SHR:
  7173. begin
  7174. { we might be able to go smaller if SHR appears first }
  7175. if InstrMax = -1 then
  7176. case MinSize of
  7177. S_B:
  7178. ;
  7179. S_W:
  7180. if (taicpu(hp1).oper[0]^.val >= 8) then
  7181. begin
  7182. TrySmaller := S_B;
  7183. TrySmallerLimit := $FF;
  7184. end;
  7185. S_L:
  7186. if (taicpu(hp1).oper[0]^.val >= 24) then
  7187. begin
  7188. TrySmaller := S_B;
  7189. TrySmallerLimit := $FF;
  7190. end
  7191. else if (taicpu(hp1).oper[0]^.val >= 16) then
  7192. begin
  7193. TrySmaller := S_W;
  7194. TrySmallerLimit := $FFFF;
  7195. end;
  7196. else
  7197. InternalError(2020112321);
  7198. end;
  7199. TestValMin := TestValMin shr taicpu(hp1).oper[0]^.val;
  7200. TestValMax := TestValMax shr taicpu(hp1).oper[0]^.val;
  7201. end;
  7202. else
  7203. InternalError(2020112303);
  7204. end;
  7205. end;
  7206. (*
  7207. A_IMUL:
  7208. case taicpu(hp1).ops of
  7209. 2:
  7210. begin
  7211. if not MatchOpType(hp1, top_reg, top_reg) or
  7212. { Has to be an exact match on the register }
  7213. (taicpu(hp1).oper[0]^.reg <> ThisReg) or
  7214. (taicpu(hp1).oper[1]^.reg <> ThisReg) then
  7215. Break;
  7216. TestValMin := TestValMin * TestValMin;
  7217. TestValMax := TestValMax * TestValMax;
  7218. end;
  7219. 3:
  7220. begin
  7221. if not MatchOpType(hp1, top_const, top_reg, top_reg) or
  7222. { Has to be an exact match on the register }
  7223. (taicpu(hp1).oper[1]^.reg <> ThisReg) or
  7224. (taicpu(hp1).oper[2]^.reg <> ThisReg) or
  7225. ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
  7226. { Is it in the negative range? }
  7227. (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
  7228. Break;
  7229. TestValMin := TestValMin * taicpu(hp1).oper[0]^.val;
  7230. TestValMax := TestValMax * taicpu(hp1).oper[0]^.val;
  7231. end;
  7232. else
  7233. Break;
  7234. end;
  7235. A_IDIV:
  7236. case taicpu(hp1).ops of
  7237. 3:
  7238. begin
  7239. if not MatchOpType(hp1, top_const, top_reg, top_reg) or
  7240. { Has to be an exact match on the register }
  7241. (taicpu(hp1).oper[1]^.reg <> ThisReg) or
  7242. (taicpu(hp1).oper[2]^.reg <> ThisReg) or
  7243. ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
  7244. { Is it in the negative range? }
  7245. (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
  7246. Break;
  7247. TestValMin := TestValMin div taicpu(hp1).oper[0]^.val;
  7248. TestValMax := TestValMax div taicpu(hp1).oper[0]^.val;
  7249. end;
  7250. else
  7251. Break;
  7252. end;
  7253. *)
  7254. A_MOVZX:
  7255. begin
  7256. if not MatchOpType(taicpu(hp1), top_reg, top_reg) then
  7257. Break;
  7258. if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
  7259. begin
  7260. { Because hp1 was obtained via GetNextInstructionUsingReg
  7261. and ThisReg doesn't appear in the first operand, it
  7262. must appear in the second operand and hence gets
  7263. overwritten }
  7264. if (InstrMax = -1) and
  7265. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ThisReg) then
  7266. begin
  7267. { The two MOVZX instructions are adjacent, so remove the first one }
  7268. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 5', p);
  7269. RemoveCurrentP(p);
  7270. Result := True;
  7271. Exit;
  7272. end;
  7273. Break;
  7274. end;
  7275. { The objective here is to try to find a combination that
  7276. removes one of the MOV/Z instructions. }
  7277. case taicpu(hp1).opsize of
  7278. S_WL:
  7279. if (MinSize in [S_B, S_W]) then
  7280. begin
  7281. TargetSize := S_L;
  7282. TargetSubReg := R_SUBD;
  7283. end
  7284. else if ((TrySmaller in [S_B, S_W]) and not SmallerOverflow) then
  7285. begin
  7286. TargetSize := TrySmaller;
  7287. if TrySmaller = S_B then
  7288. TargetSubReg := R_SUBL
  7289. else
  7290. TargetSubReg := R_SUBW;
  7291. end
  7292. else
  7293. Break;
  7294. S_BW:
  7295. if (MinSize in [S_B, S_W]) then
  7296. begin
  7297. TargetSize := S_W;
  7298. TargetSubReg := R_SUBW;
  7299. end
  7300. else if ((TrySmaller = S_B) and not SmallerOverflow) then
  7301. begin
  7302. TargetSize := S_B;
  7303. TargetSubReg := R_SUBL;
  7304. end
  7305. else
  7306. Break;
  7307. S_BL:
  7308. if (MinSize in [S_B, S_W]) then
  7309. begin
  7310. TargetSize := S_L;
  7311. TargetSubReg := R_SUBD;
  7312. end
  7313. else if ((TrySmaller = S_B) and not SmallerOverflow) then
  7314. begin
  7315. TargetSize := S_B;
  7316. TargetSubReg := R_SUBL;
  7317. end
  7318. else
  7319. Break;
  7320. else
  7321. InternalError(2020112302);
  7322. end;
  7323. { Update the register to its new size }
  7324. setsubreg(ThisReg, TargetSubReg);
  7325. if TargetSize = MinSize then
  7326. begin
  7327. { Convert the input MOVZX to a MOV }
  7328. if (taicpu(p).oper[0]^.typ = top_reg) and
  7329. SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
  7330. begin
  7331. { Or remove it completely! }
  7332. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1', p);
  7333. RemoveCurrentP(p);
  7334. p_removed := True;
  7335. end
  7336. else
  7337. begin
  7338. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1', p);
  7339. taicpu(p).opcode := A_MOV;
  7340. taicpu(p).oper[1]^.reg := ThisReg;
  7341. taicpu(p).opsize := TargetSize;
  7342. end;
  7343. Result := True;
  7344. end
  7345. else if TargetSize <> MaxSize then
  7346. begin
  7347. case MaxSize of
  7348. S_L:
  7349. if TargetSize = S_W then
  7350. begin
  7351. DebugMsg(SPeepholeOptimization + 'movzbl2movzbw', p);
  7352. taicpu(p).opsize := S_BW;
  7353. taicpu(p).oper[1]^.reg := ThisReg;
  7354. Result := True;
  7355. end
  7356. else
  7357. InternalError(2020112341);
  7358. S_W:
  7359. if TargetSize = S_L then
  7360. begin
  7361. DebugMsg(SPeepholeOptimization + 'movzbw2movzbl', p);
  7362. taicpu(p).opsize := S_BL;
  7363. taicpu(p).oper[1]^.reg := ThisReg;
  7364. Result := True;
  7365. end
  7366. else
  7367. InternalError(2020112342);
  7368. else
  7369. ;
  7370. end;
  7371. end;
  7372. if (MaxSize = TargetSize) or
  7373. ((TargetSize = S_L) and (taicpu(hp1).opsize in [S_L, S_BL, S_WL])) or
  7374. ((TargetSize = S_W) and (taicpu(hp1).opsize in [S_W, S_BW])) then
  7375. begin
  7376. { Convert the output MOVZX to a MOV }
  7377. if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
  7378. begin
  7379. { Or remove it completely! }
  7380. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2', hp1);
  7381. { Be careful; if p = hp1 and p was also removed, p
  7382. will become a dangling pointer }
  7383. if p = hp1 then
  7384. RemoveCurrentp(p) { p = hp1 and will then become the next instruction }
  7385. else
  7386. RemoveInstruction(hp1);
  7387. end
  7388. else
  7389. begin
  7390. taicpu(hp1).opcode := A_MOV;
  7391. taicpu(hp1).oper[0]^.reg := ThisReg;
  7392. taicpu(hp1).opsize := TargetSize;
  7393. { Check to see if the active register is used afterwards;
  7394. if not, we can change it and make a saving. }
  7395. RegInUse := False;
  7396. TransferUsedRegs(TmpUsedRegs);
  7397. { The target register may be marked as in use to cross
  7398. a jump to a distant label, so exclude it }
  7399. ExcludeRegFromUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs);
  7400. hp2 := p;
  7401. repeat
  7402. UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
  7403. { Explicitly check for the excluded register (don't include the first
  7404. instruction as it may be reading from here }
  7405. if ((p <> hp2) and (RegInInstruction(taicpu(hp1).oper[1]^.reg, hp2))) or
  7406. RegInUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs) then
  7407. begin
  7408. RegInUse := True;
  7409. Break;
  7410. end;
  7411. if not GetNextInstruction(hp2, hp2) then
  7412. InternalError(2020112340);
  7413. until (hp2 = hp1);
  7414. if not RegInUse and not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
  7415. begin
  7416. DebugMsg(SPeepholeOptimization + 'Simplified register usage so ' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' = ' + debug_regname(taicpu(p).oper[1]^.reg), p);
  7417. ThisReg := taicpu(hp1).oper[1]^.reg;
  7418. RegChanged := True;
  7419. TransferUsedRegs(TmpUsedRegs);
  7420. AllocRegBetween(ThisReg, p, hp1, TmpUsedRegs);
  7421. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 3', hp1);
  7422. if p = hp1 then
  7423. RemoveCurrentp(p) { p = hp1 and will then become the next instruction }
  7424. else
  7425. RemoveInstruction(hp1);
  7426. { Instruction will become "mov %reg,%reg" }
  7427. if not p_removed and (taicpu(p).opcode = A_MOV) and
  7428. MatchOperand(taicpu(p).oper[0]^, ThisReg) then
  7429. begin
  7430. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 6', p);
  7431. RemoveCurrentP(p);
  7432. p_removed := True;
  7433. end
  7434. else
  7435. taicpu(p).oper[1]^.reg := ThisReg;
  7436. Result := True;
  7437. end
  7438. else
  7439. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 2', hp1);
  7440. end;
  7441. end
  7442. else
  7443. InternalError(2020112330);
  7444. { Now go through every instruction we found and change the
  7445. size. If TargetSize = MaxSize, then almost no changes are
  7446. needed and Result can remain False if it hasn't been set
  7447. yet.
  7448. If RegChanged is True, then the register requires changing
  7449. and so the point about TargetSize = MaxSize doesn't apply. }
  7450. if ((TargetSize <> MaxSize) or RegChanged) and (InstrMax >= 0) then
  7451. begin
  7452. for Index := 0 to InstrMax do
  7453. begin
  7454. { If p_removed is true, then the original MOV/Z was removed
  7455. and removing the AND instruction may not be safe if it
  7456. appears first }
  7457. if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
  7458. InternalError(2020112310);
  7459. if InstrList[Index].oper[0]^.typ = top_reg then
  7460. InstrList[Index].oper[0]^.reg := ThisReg;
  7461. InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
  7462. InstrList[Index].opsize := TargetSize;
  7463. end;
  7464. Result := True;
  7465. end;
  7466. Exit;
  7467. end;
  7468. else
  7469. { This includes ADC, SBB, IDIV and SAR }
  7470. Break;
  7471. end;
  7472. if (TestValMin < 0) or (TestValMax < 0) or
  7473. (TestValMin > UpperLimit) or (TestValMax > UpperLimit) then
  7474. { Overflow }
  7475. Break
  7476. else if not SmallerOverflow and (TrySmaller <> S_NO) and
  7477. ((TestValMin > TrySmallerLimit) or (TestValMax > TrySmallerLimit)) then
  7478. SmallerOverflow := True;
  7479. { Contains highest index (so instruction count - 1) }
  7480. Inc(InstrMax);
  7481. if InstrMax > High(InstrList) then
  7482. SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
  7483. InstrList[InstrMax] := taicpu(hp1);
  7484. end;
  7485. end;
function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    { Tries to merge a preceding "mov %reg1,%reg2" into an IMUL-by-constant
      that overwrites %reg2, producing the three-operand form
      "imul y,%reg1,%reg2" and deleting the MOV.  Only performed when %reg2
      is not live after the IMUL, or when the IMUL is already in the
      3-operand form with matching middle/destination registers. }
    Result:=false;
    if (taicpu(p).ops >= 2) and
       { first operand must be a constant, or a full symbolic address }
       ((taicpu(p).oper[0]^.typ = top_const) or
        ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
       (taicpu(p).oper[1]^.typ = top_reg) and
       { for the 3-operand form, middle and destination register must match }
       ((taicpu(p).ops = 2) or
        ((taicpu(p).oper[2]^.typ = top_reg) and
         (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
       { note: GetLastInstruction assigns hp1, so the order of these
         short-circuited conditions is significant }
       GetLastInstruction(p,hp1) and
       MatchInstruction(hp1,A_MOV,[]) and
       MatchOpType(taicpu(hp1),top_reg,top_reg) and
       { the MOV must write the very register the IMUL multiplies into }
       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
          ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
          { change
              mov reg1,reg2
              imul y,reg2 to imul y,reg1,reg2 }
          begin
            taicpu(p).ops := 3;
            taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
            taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
            DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;
  7519. procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
  7520. var
  7521. ThisLabel: TAsmLabel;
  7522. begin
  7523. ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
  7524. ThisLabel.decrefs;
  7525. taicpu(p).opcode := A_RET;
  7526. taicpu(p).is_jmp := false;
  7527. taicpu(p).ops := taicpu(ret_p).ops;
  7528. case taicpu(ret_p).ops of
  7529. 0:
  7530. taicpu(p).clearop(0);
  7531. 1:
  7532. taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
  7533. else
  7534. internalerror(2016041301);
  7535. end;
  7536. { If the original label is now dead, it might turn out that the label
  7537. immediately follows p. As a result, everything beyond it, which will
  7538. be just some final register configuration and a RET instruction, is
  7539. now dead code. [Kit] }
  7540. { NOTE: This is much faster than introducing a OptPass2RET routine and
  7541. running RemoveDeadCodeAfterJump for each RET instruction, because
  7542. this optimisation rarely happens and most RETs appear at the end of
  7543. routines where there is nothing that can be stripped. [Kit] }
  7544. if not ThisLabel.is_used then
  7545. RemoveDeadCodeAfterJump(p);
  7546. end;
function TX86AsmOptimizer.OptPass2SETcc(var p: tai): boolean;
  var
    hp1,hp2,next: tai; SetC, JumpC: TAsmCond;
    Unconditional, PotentialModified: Boolean;
    OperPtr: POper;
    NewRef: TReference;
    InstrList: array of taicpu;
    InstrMax, Index: Integer;
  const
{$ifdef DEBUG_AOPTCPU}
    SNoFlags: shortstring = ' so the flags aren''t modified';
{$else DEBUG_AOPTCPU}
    SNoFlags = '';
{$endif DEBUG_AOPTCPU}
  begin
    { Pass-2 peephole handler for SETcc.  Two patterns are handled:
      1) SETcc %reg / ... / TEST %reg,%reg / Jcc-or-SETcc: fold the TEST
         and (when the register dies) the first SETcc into a single
         conditional instruction with a combined condition.  At -O3,
         flag-modifying instructions in between are first rewritten as
         LEA so the flags survive from the SETcc to the consumer.
      2) SETcc %reg / MOV %reg,<dest>: either fold the MOV's destination
         into the SETcc (when the register dies) or turn the MOV into a
         second SETcc to avoid the partial-register pipeline stall. }
    Result:=false;
    if MatchOpType(taicpu(p),top_reg) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
      begin
        if MatchInstruction(hp1, A_TEST, [S_B]) and
          MatchOpType(taicpu(hp1),top_reg,top_reg) and
          (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
          (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_Jcc, A_SETcc, []) then
          { Change from:                To:
              set(C) %reg               j(~C) label
              test %reg,%reg/cmp $0,%reg
              je   label

              set(C) %reg               j(C) label
              test %reg,%reg/cmp $0,%reg
              jne  label

            (Also do something similar with sete/setne instead of je/jne)
          }
          begin
            { Before we do anything else, we need to check the instructions
              in between SETcc and TEST to make sure they don't modify the
              FLAGS register - if -O2 or under, there won't be any
              instructions between SET and TEST }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if (cs_opt_level3 in current_settings.optimizerswitches) then
              begin
                next := p;
                SetLength(InstrList, 0);
                InstrMax := -1;
                PotentialModified := False;
                { Make a note of every instruction that modifies the FLAGS
                  register }
                while GetNextInstruction(next, next) and (next <> hp1) do
                  begin
                    if next.typ <> ait_instruction then
                      { GetNextInstructionUsingReg should have returned False }
                      InternalError(2021051701);
                    if RegModifiedByInstruction(NR_DEFAULTFLAGS, next) then
                      begin
                        case taicpu(next).opcode of
                          A_SETcc,
                          A_CMOVcc,
                          A_Jcc:
                            begin
                              if PotentialModified then
                                { Not safe because the flags were modified earlier }
                                Exit
                              else
                                { Condition is the same as the initial SETcc, so this is safe
                                  (don't add to instruction list though) }
                                Continue;
                            end;
                          A_ADD:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ = top_ref) then
                                { Require a constant or a register }
                                Exit;
                              PotentialModified := True;
                            end;
                          A_SUB:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ <> top_const) or
                                (taicpu(next).oper[0]^.val = $80000000) then
                                { Can't subtract a register with LEA - also
                                  check that the value isn't -2^31, as this
                                  can't be negated }
                                Exit;
                              PotentialModified := True;
                            end;
                          A_SAL,
                          A_SHL:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ <> top_const) or
                                (taicpu(next).oper[0]^.val < 0) or
                                (taicpu(next).oper[0]^.val > 3) then
                                { Only shift counts 0..3 map onto LEA scale factors 1/2/4/8 }
                                Exit;
                              PotentialModified := True;
                            end;
                          A_IMUL:
                            begin
                              { NOTE(review): this guard Exits when oper[2]^.val IS one of
                                [2,3,4,5,8,9], yet the comment below describes those as
                                exactly the convertible factors, and the LEA conversion
                                further down reads the constant from oper[0]^ (oper[2]
                                holds the destination register of a 3-operand IMUL).
                                The condition looks inverted and/or applied to the wrong
                                operand - confirm against upstream before relying on
                                this path. }
                              if (taicpu(next).ops <> 3) or
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[2]^.val in [2,3,4,5,8,9]) then
                                { We can convert "imul x,%reg1,%reg2" (where x = 2, 4 or 8)
                                  to "lea (%reg1,x),%reg2". If x = 3, 5 or 9, we can
                                  change this to "lea (%reg1,%reg1,(x-1)),%reg2" }
                                Exit
                              else
                                PotentialModified := True;
                            end;
                          else
                            { Don't know how to change this, so abort }
                            Exit;
                        end;
                        { Contains highest index (so instruction count - 1) }
                        Inc(InstrMax);
                        if InstrMax > High(InstrList) then
                          SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
                        InstrList[InstrMax] := taicpu(next);
                      end;
                    UpdateUsedRegs(TmpUsedRegs, tai(next.next));
                  end;
                if not Assigned(next) or (next <> hp1) then
                  { It should be equal to hp1 }
                  InternalError(2021051702);
                { Cycle through each instruction and check to see if we can
                  change them to versions that don't modify the flags }
                if (InstrMax >= 0) then
                  begin
                    for Index := 0 to InstrMax do
                      case InstrList[Index].opcode of
                        A_ADD:
                          begin
                            DebugMsg(SPeepholeOptimization + 'ADD -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.base := InstrList[Index].oper[1]^.reg;
                            if InstrList[Index].oper[0]^.typ = top_reg then
                              begin
                                { add reg,reg -> lea (base,index,1) }
                                NewRef.index := InstrList[Index].oper[0]^.reg;
                                NewRef.scalefactor := 1;
                              end
                            else
                              { add const,reg -> lea const(base) }
                              NewRef.offset := InstrList[Index].oper[0]^.val;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_SUB:
                          begin
                            DebugMsg(SPeepholeOptimization + 'SUB -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.base := InstrList[Index].oper[1]^.reg;
                            { subtraction becomes addition of the negated constant }
                            NewRef.offset := -InstrList[Index].oper[0]^.val;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_SHL,
                        A_SAL:
                          begin
                            DebugMsg(SPeepholeOptimization + 'SHL -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.index := InstrList[Index].oper[1]^.reg;
                            { shift count 0..3 (checked above) -> scale 1/2/4/8 }
                            NewRef.scalefactor := 1 shl (InstrList[Index].oper[0]^.val);
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_IMUL:
                          begin
                            DebugMsg(SPeepholeOptimization + 'IMUL -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.index := InstrList[Index].oper[1]^.reg;
                            case InstrList[Index].oper[0]^.val of
                              2, 4, 8:
                                NewRef.scalefactor := InstrList[Index].oper[0]^.val;
                              else {3, 5 and 9}
                                begin
                                  { x*reg = (x-1)*reg + reg -> lea (reg,reg,x-1) }
                                  NewRef.scalefactor := InstrList[Index].oper[0]^.val - 1;
                                  NewRef.base := InstrList[Index].oper[1]^.reg;
                                end;
                            end;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        else
                          InternalError(2021051710);
                      end;
                  end;
                { Mark the FLAGS register as used across this whole block }
                AllocRegBetween(NR_DEFAULTFLAGS, p, hp1, UsedRegs);
              end;
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
            JumpC := taicpu(hp2).condition;
            Unconditional := False;
            if conditions_equal(JumpC, C_E) then
              { "je": the branch is taken when the SETcc wrote 0, i.e. the
                inverse of the SETcc condition }
              SetC := inverse_cond(taicpu(p).condition)
            else if conditions_equal(JumpC, C_NE) then
              SetC := taicpu(p).condition
            else
              { We've got something weird here (and inefficent) }
              begin
                DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                SetC := C_NONE;
                { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                if condition_in(C_AE, JumpC) then
                  Unconditional := True
                else
                  { Not sure what to do with this jump - drop out }
                  Exit;
              end;
            RemoveInstruction(hp1);
            if Unconditional then
              MakeUnconditional(taicpu(hp2))
            else
              begin
                if SetC = C_NONE then
                  InternalError(2018061402);
                taicpu(hp2).SetCondition(SetC);
              end;
            { as hp2 is a jump, we cannot use RegUsedAfterInstruction but we have to check if it is included in
              TmpUsedRegs }
            if not TmpUsedRegs[getregtype(taicpu(p).oper[0]^.reg)].IsUsed(taicpu(p).oper[0]^.reg) then
              begin
                { Register dies here - the initial SETcc can go as well }
                RemoveCurrentp(p, hp2);
                if taicpu(hp2).opcode = A_SETcc then
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc',p)
                else
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> Jcc',p);
              end
            else
              if taicpu(hp2).opcode = A_SETcc then
                DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc/SETcc',p)
              else
                DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> SETcc/Jcc',p);
            Result := True;
          end
        else if
          { Make sure the instructions are adjacent }
          (
            not (cs_opt_level3 in current_settings.optimizerswitches) or
            GetNextInstruction(p, hp1)
          ) and
          MatchInstruction(hp1, A_MOV, [S_B]) and
          { Writing to memory is allowed }
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) then
          begin
            {
              Watch out for sequences such as:

              set(c)b %regb
              movb    %regb,(ref)
              movb    $0,1(ref)
              movb    $0,2(ref)
              movb    $0,3(ref)

              Much more efficient to turn it into:
                movl    $0,%regl
                set(c)b %regb
                movl    %regl,(ref)

              Or:
                set(c)b %regb
                movzbl  %regb,%regl
                movl    %regl,(ref)
            }
            if (taicpu(hp1).oper[1]^.typ = top_ref) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_MOV, [S_B]) and
              (taicpu(hp2).oper[1]^.typ = top_ref) and
              CheckMemoryWrite(taicpu(hp1), taicpu(hp2)) then
              begin
                { Don't do anything else except set Result to True }
              end
            else
              begin
                if taicpu(p).oper[0]^.typ = top_reg then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  end;
                { If it's not a register, it's a memory address }
                if (taicpu(p).oper[0]^.typ <> top_reg) or RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
                  begin
                    { Even if the register is still in use, we can minimise the
                      pipeline stall by changing the MOV into another SETcc. }
                    taicpu(hp1).opcode := A_SETcc;
                    taicpu(hp1).condition := taicpu(p).condition;
                    if taicpu(hp1).oper[1]^.typ = top_ref then
                      begin
                        { Swapping the operand pointers like this is probably a
                          bit naughty, but it is far faster than using loadoper
                          to transfer the reference from oper[1] to oper[0] if
                          you take into account the extra procedure calls and
                          the memory allocation and deallocation required }
                        OperPtr := taicpu(hp1).oper[1];
                        taicpu(hp1).oper[1] := taicpu(hp1).oper[0];
                        taicpu(hp1).oper[0] := OperPtr;
                      end
                    else
                      taicpu(hp1).oper[0]^.reg := taicpu(hp1).oper[1]^.reg;
                    taicpu(hp1).clearop(1);
                    taicpu(hp1).ops := 1;
                    DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc/SETcc',p);
                  end
                else
                  begin
                    { Register dies after the MOV: fold the MOV's destination
                      directly into the SETcc and drop the MOV }
                    if taicpu(hp1).oper[1]^.typ = top_reg then
                      AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
                    taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p);
                  end
              end;
            Result := True;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
  var
    hp1: tai;
    Count: Integer;
    OrigLabel: TAsmLabel;
  begin
    { Pass-2 peephole handler for JMP.  For jumps to a plain local label it
      tries, in order, to:
        - collapse jumps of zero distance (CollapseZeroDistJump);
        - replace "jmp .L1" whose target instruction is a RET with a copy
          of that RET (ConvertJumpToRET);
        - duplicate a short run of move/assignment instructions from the
          destination in place of the jump (CheckJumpMovTransferOpt). }
    result := False;
    { Sometimes, the optimisations below can permit this }
    RemoveDeadCodeAfterJump(p);
    { Only handle a direct jump to a bare local label: full symbolic address
      with no base or index register }
    if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
      begin
        OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        { Also a side-effect of optimisations }
        if CollapseZeroDistJump(p, OrigLabel) then
          begin
            Result := True;
            Exit;
          end;
        hp1 := GetLabelWithSym(OrigLabel);
        { Unconditional jumps only; hp1 ends up as the first real instruction
          after the target label }
        if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
          begin
            case taicpu(hp1).opcode of
              A_RET:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         ret
                  into
                         ret
                }
                begin
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end;
              { Check any kind of direct assignment instruction }
              A_MOV,
              A_MOVD,
              A_MOVQ,
              A_MOVSX,
{$ifdef x86_64}
              A_MOVSXD,
{$endif x86_64}
              A_MOVZX,
              A_MOVAPS,
              A_MOVUPS,
              A_MOVSD,
              A_MOVAPD,
              A_MOVUPD,
              A_MOVDQA,
              A_MOVDQU,
              A_VMOVSS,
              A_VMOVAPS,
              A_VMOVUPS,
              A_VMOVSD,
              A_VMOVAPD,
              A_VMOVUPD,
              A_VMOVDQA,
              A_VMOVDQU:
                { Skipped when optimising purely for size (unless -O3 is
                  also enabled), since duplicating code grows the binary }
                if ((current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size]) and
                  CheckJumpMovTransferOpt(p, hp1, 0, Count) then
                  begin
                    Result := True;
                    Exit;
                  end;
              else
                ;
            end;
          end;
      end;
  end;
  7941. class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
  7942. begin
  7943. CanBeCMOV:=assigned(p) and
  7944. MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
  7945. { we can't use cmov ref,reg because
  7946. ref could be nil and cmov still throws an exception
  7947. if ref=nil but the mov isn't done (FK)
  7948. or ((taicpu(p).oper[0]^.typ = top_ref) and
  7949. (taicpu(p).oper[0]^.ref^.refaddr = addr_no))
  7950. }
  7951. (taicpu(p).oper[1]^.typ = top_reg) and
  7952. (
  7953. (taicpu(p).oper[0]^.typ = top_reg) or
  7954. { allow references, but only pure symbols or got rel. addressing with RIP as based,
  7955. it is not expected that this can cause a seg. violation }
  7956. (
  7957. (taicpu(p).oper[0]^.typ = top_ref) and
  7958. IsRefSafe(taicpu(p).oper[0]^.ref)
  7959. )
  7960. );
  7961. end;
  7962. function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  7963. var
  7964. hp1,hp2: tai;
  7965. {$ifndef i8086}
  7966. hp3,hp4,hpmov2, hp5: tai;
  7967. l : Longint;
  7968. condition : TAsmCond;
  7969. {$endif i8086}
  7970. carryadd_opcode : TAsmOp;
  7971. symbol: TAsmSymbol;
  7972. reg: tsuperregister;
  7973. increg, tmpreg: TRegister;
  7974. begin
  7975. result:=false;
  7976. if GetNextInstruction(p,hp1) and (hp1.typ=ait_instruction) then
  7977. begin
  7978. symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
  7979. if (
  7980. (
  7981. ((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
  7982. MatchOptype(Taicpu(hp1),top_const,top_reg) and
  7983. (Taicpu(hp1).oper[0]^.val=1)
  7984. ) or
  7985. ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
  7986. ) and
  7987. GetNextInstruction(hp1,hp2) and
  7988. SkipAligns(hp2, hp2) and
  7989. (hp2.typ = ait_label) and
  7990. (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
  7991. { jb @@1 cmc
  7992. inc/dec operand --> adc/sbb operand,0
  7993. @@1:
  7994. ... and ...
  7995. jnb @@1
  7996. inc/dec operand --> adc/sbb operand,0
  7997. @@1: }
  7998. begin
  7999. if Taicpu(p).condition in [C_NAE,C_B,C_C] then
  8000. begin
  8001. case taicpu(hp1).opcode of
  8002. A_INC,
  8003. A_ADD:
  8004. carryadd_opcode:=A_ADC;
  8005. A_DEC,
  8006. A_SUB:
  8007. carryadd_opcode:=A_SBB;
  8008. else
  8009. InternalError(2021011001);
  8010. end;
  8011. Taicpu(p).clearop(0);
  8012. Taicpu(p).ops:=0;
  8013. Taicpu(p).is_jmp:=false;
  8014. Taicpu(p).opcode:=A_CMC;
  8015. Taicpu(p).condition:=C_NONE;
  8016. DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
  8017. Taicpu(hp1).ops:=2;
  8018. if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
  8019. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
  8020. else
  8021. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
  8022. Taicpu(hp1).loadconst(0,0);
  8023. Taicpu(hp1).opcode:=carryadd_opcode;
  8024. result:=true;
  8025. exit;
  8026. end
  8027. else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
  8028. begin
  8029. case taicpu(hp1).opcode of
  8030. A_INC,
  8031. A_ADD:
  8032. carryadd_opcode:=A_ADC;
  8033. A_DEC,
  8034. A_SUB:
  8035. carryadd_opcode:=A_SBB;
  8036. else
  8037. InternalError(2021011002);
  8038. end;
  8039. Taicpu(hp1).ops:=2;
  8040. DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
  8041. if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
  8042. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
  8043. else
  8044. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
  8045. Taicpu(hp1).loadconst(0,0);
  8046. Taicpu(hp1).opcode:=carryadd_opcode;
  8047. RemoveCurrentP(p, hp1);
  8048. result:=true;
  8049. exit;
  8050. end
  8051. {
  8052. jcc @@1 setcc tmpreg
  8053. inc/dec/add/sub operand -> (movzx tmpreg)
  8054. @@1: add/sub tmpreg,operand
  8055. While this increases code size slightly, it makes the code much faster if the
  8056. jump is unpredictable
  8057. }
  8058. else if not(cs_opt_size in current_settings.optimizerswitches) then
  8059. begin
  8060. { search for an available register which is volatile }
  8061. for reg in tcpuregisterset do
  8062. begin
  8063. if
  8064. {$if defined(i386) or defined(i8086)}
  8065. { Only use registers whose lowest 8-bits can Be accessed }
  8066. (reg in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) and
  8067. {$endif i386 or i8086}
  8068. (reg in paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption)) and
  8069. not(reg in UsedRegs[R_INTREGISTER].GetUsedRegs)
  8070. { We don't need to check if tmpreg is in hp1 or not, because
  8071. it will be marked as in use at p (if not, this is
  8072. indictive of a compiler bug). }
  8073. then
  8074. begin
  8075. TAsmLabel(symbol).decrefs;
  8076. increg := newreg(R_INTREGISTER,reg,R_SUBL);
  8077. Taicpu(p).clearop(0);
  8078. Taicpu(p).ops:=1;
  8079. Taicpu(p).is_jmp:=false;
  8080. Taicpu(p).opcode:=A_SETcc;
  8081. DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
  8082. Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
  8083. Taicpu(p).loadreg(0,increg);
  8084. if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
  8085. begin
  8086. case getsubreg(Taicpu(hp1).oper[1]^.reg) of
  8087. R_SUBW:
  8088. begin
  8089. tmpreg := newreg(R_INTREGISTER,reg,R_SUBW);
  8090. hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,increg,tmpreg);
  8091. end;
  8092. R_SUBD:
  8093. begin
  8094. tmpreg := newreg(R_INTREGISTER,reg,R_SUBD);
  8095. hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
  8096. end;
  8097. {$ifdef x86_64}
  8098. R_SUBQ:
  8099. begin
  8100. { MOVZX doesn't have a 64-bit variant, because
  8101. the 32-bit version implicitly zeroes the
  8102. upper 32-bits of the destination register }
  8103. hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,
  8104. newreg(R_INTREGISTER,reg,R_SUBD));
  8105. tmpreg := newreg(R_INTREGISTER,reg,R_SUBQ);
  8106. end;
  8107. {$endif x86_64}
  8108. else
  8109. Internalerror(2020030601);
  8110. end;
  8111. taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
  8112. asml.InsertAfter(hp2,p);
  8113. end
  8114. else
  8115. tmpreg := increg;
  8116. if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
  8117. begin
  8118. Taicpu(hp1).ops:=2;
  8119. Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
  8120. end;
  8121. Taicpu(hp1).loadreg(0,tmpreg);
  8122. AllocRegBetween(tmpreg,p,hp1,UsedRegs);
  8123. Result := True;
  8124. { p is no longer a Jcc instruction, so exit }
  8125. Exit;
  8126. end;
  8127. end;
  8128. end;
  8129. end;
  8130. { Detect the following:
  8131. jmp<cond> @Lbl1
  8132. jmp @Lbl2
  8133. ...
  8134. @Lbl1:
  8135. ret
  8136. Change to:
  8137. jmp<inv_cond> @Lbl2
  8138. ret
  8139. }
  8140. if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
  8141. begin
  8142. hp2:=getlabelwithsym(TAsmLabel(symbol));
  8143. if Assigned(hp2) and SkipLabels(hp2,hp2) and
  8144. MatchInstruction(hp2,A_RET,[S_NO]) then
  8145. begin
  8146. taicpu(p).condition := inverse_cond(taicpu(p).condition);
  8147. { Change label address to that of the unconditional jump }
  8148. taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
  8149. TAsmLabel(symbol).DecRefs;
  8150. taicpu(hp1).opcode := A_RET;
  8151. taicpu(hp1).is_jmp := false;
  8152. taicpu(hp1).ops := taicpu(hp2).ops;
  8153. DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
  8154. case taicpu(hp2).ops of
  8155. 0:
  8156. taicpu(hp1).clearop(0);
  8157. 1:
  8158. taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
  8159. else
  8160. internalerror(2016041302);
  8161. end;
  8162. end;
  8163. {$ifndef i8086}
  8164. end
  8165. {
  8166. convert
  8167. j<c> .L1
  8168. mov 1,reg
  8169. jmp .L2
  8170. .L1
  8171. mov 0,reg
  8172. .L2
  8173. into
  8174. mov 0,reg
  8175. set<not(c)> reg
  8176. take care of alignment and that the mov 0,reg is not converted into a xor as this
  8177. would destroy the flag contents
  8178. }
  8179. else if MatchInstruction(hp1,A_MOV,[]) and
  8180. MatchOpType(taicpu(hp1),top_const,top_reg) and
  8181. {$ifdef i386}
  8182. (
  8183. { Under i386, ESI, EDI, EBP and ESP
  8184. don't have an 8-bit representation }
  8185. not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
  8186. ) and
  8187. {$endif i386}
  8188. (taicpu(hp1).oper[0]^.val=1) and
  8189. GetNextInstruction(hp1,hp2) and
  8190. MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
  8191. GetNextInstruction(hp2,hp3) and
  8192. { skip align }
  8193. ((hp3.typ<>ait_align) or GetNextInstruction(hp3,hp3)) and
  8194. (hp3.typ=ait_label) and
  8195. (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
  8196. (tai_label(hp3).labsym.getrefs=1) and
  8197. GetNextInstruction(hp3,hp4) and
  8198. MatchInstruction(hp4,A_MOV,[]) and
  8199. MatchOpType(taicpu(hp4),top_const,top_reg) and
  8200. (taicpu(hp4).oper[0]^.val=0) and
  8201. MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
  8202. GetNextInstruction(hp4,hp5) and
  8203. (hp5.typ=ait_label) and
  8204. (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) and
  8205. (tai_label(hp5).labsym.getrefs=1) then
  8206. begin
  8207. AllocRegBetween(NR_FLAGS,p,hp4,UsedRegs);
  8208. DebugMsg(SPeepholeOptimization+'JccMovJmpMov2MovSetcc',p);
  8209. { remove last label }
  8210. RemoveInstruction(hp5);
  8211. { remove second label }
  8212. RemoveInstruction(hp3);
  8213. { if align is present remove it }
  8214. if GetNextInstruction(hp2,hp3) and (hp3.typ=ait_align) then
  8215. RemoveInstruction(hp3);
  8216. { remove jmp }
  8217. RemoveInstruction(hp2);
  8218. if taicpu(hp1).opsize=S_B then
  8219. RemoveInstruction(hp1)
  8220. else
  8221. taicpu(hp1).loadconst(0,0);
  8222. taicpu(hp4).opcode:=A_SETcc;
  8223. taicpu(hp4).opsize:=S_B;
  8224. taicpu(hp4).condition:=inverse_cond(taicpu(p).condition);
  8225. taicpu(hp4).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(hp4).oper[1]^.reg),R_SUBL));
  8226. taicpu(hp4).opercnt:=1;
  8227. taicpu(hp4).ops:=1;
  8228. taicpu(hp4).freeop(1);
  8229. RemoveCurrentP(p);
  8230. Result:=true;
  8231. exit;
  8232. end
  8233. else if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
  8234. begin
  8235. { check for
  8236. jCC xxx
  8237. <several movs>
  8238. xxx:
  8239. }
  8240. l:=0;
  8241. while assigned(hp1) and
  8242. CanBeCMOV(hp1) and
  8243. { stop on labels }
  8244. not(hp1.typ=ait_label) do
  8245. begin
  8246. inc(l);
  8247. GetNextInstruction(hp1,hp1);
  8248. end;
  8249. if assigned(hp1) then
  8250. begin
  8251. if FindLabel(tasmlabel(symbol),hp1) then
  8252. begin
  8253. if (l<=4) and (l>0) then
  8254. begin
  8255. condition:=inverse_cond(taicpu(p).condition);
  8256. GetNextInstruction(p,hp1);
  8257. repeat
  8258. if not Assigned(hp1) then
  8259. InternalError(2018062900);
  8260. taicpu(hp1).opcode:=A_CMOVcc;
  8261. taicpu(hp1).condition:=condition;
  8262. UpdateUsedRegs(hp1);
  8263. GetNextInstruction(hp1,hp1);
  8264. until not(CanBeCMOV(hp1));
  8265. { Remember what hp1 is in case there's multiple aligns to get rid of }
  8266. hp2 := hp1;
  8267. repeat
  8268. if not Assigned(hp2) then
  8269. InternalError(2018062910);
  8270. case hp2.typ of
  8271. ait_label:
  8272. { What we expected - break out of the loop (it won't be a dead label at the top of
  8273. a cluster because that was optimised at an earlier stage) }
  8274. Break;
  8275. ait_align:
  8276. { Go to the next entry until a label is found (may be multiple aligns before it) }
  8277. begin
  8278. hp2 := tai(hp2.Next);
  8279. Continue;
  8280. end;
  8281. else
  8282. begin
  8283. { Might be a comment or temporary allocation entry }
  8284. if not (hp2.typ in SkipInstr) then
  8285. InternalError(2018062911);
  8286. hp2 := tai(hp2.Next);
  8287. Continue;
  8288. end;
  8289. end;
  8290. until False;
  8291. { Now we can safely decrement the reference count }
  8292. tasmlabel(symbol).decrefs;
  8293. DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
  8294. { Remove the original jump }
  8295. RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
  8296. GetNextInstruction(hp2, p); { Instruction after the label }
  8297. { Remove the label if this is its final reference }
  8298. if (tasmlabel(symbol).getrefs=0) then
  8299. StripLabelFast(hp1);
  8300. if Assigned(p) then
  8301. begin
  8302. UpdateUsedRegs(p);
  8303. result:=true;
  8304. end;
  8305. exit;
  8306. end;
  8307. end
  8308. else
  8309. begin
  8310. { check further for
  8311. jCC xxx
  8312. <several movs 1>
  8313. jmp yyy
  8314. xxx:
  8315. <several movs 2>
  8316. yyy:
  8317. }
  8318. { hp2 points to jmp yyy }
  8319. hp2:=hp1;
  8320. { skip hp1 to xxx (or an align right before it) }
  8321. GetNextInstruction(hp1, hp1);
  8322. if assigned(hp2) and
  8323. assigned(hp1) and
  8324. (l<=3) and
  8325. (hp2.typ=ait_instruction) and
  8326. (taicpu(hp2).is_jmp) and
  8327. (taicpu(hp2).condition=C_None) and
  8328. { real label and jump, no further references to the
  8329. label are allowed }
  8330. (tasmlabel(symbol).getrefs=1) and
  8331. FindLabel(tasmlabel(symbol),hp1) then
  8332. begin
  8333. l:=0;
  8334. { skip hp1 to <several moves 2> }
  8335. if (hp1.typ = ait_align) then
  8336. GetNextInstruction(hp1, hp1);
  8337. GetNextInstruction(hp1, hpmov2);
  8338. hp1 := hpmov2;
  8339. while assigned(hp1) and
  8340. CanBeCMOV(hp1) do
  8341. begin
  8342. inc(l);
  8343. GetNextInstruction(hp1, hp1);
  8344. end;
  8345. { hp1 points to yyy (or an align right before it) }
  8346. hp3 := hp1;
  8347. if assigned(hp1) and
  8348. FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
  8349. begin
  8350. condition:=inverse_cond(taicpu(p).condition);
  8351. GetNextInstruction(p,hp1);
  8352. repeat
  8353. taicpu(hp1).opcode:=A_CMOVcc;
  8354. taicpu(hp1).condition:=condition;
  8355. UpdateUsedRegs(hp1);
  8356. GetNextInstruction(hp1,hp1);
  8357. until not(assigned(hp1)) or
  8358. not(CanBeCMOV(hp1));
  8359. condition:=inverse_cond(condition);
  8360. hp1 := hpmov2;
  8361. { hp1 is now at <several movs 2> }
  8362. while Assigned(hp1) and CanBeCMOV(hp1) do
  8363. begin
  8364. taicpu(hp1).opcode:=A_CMOVcc;
  8365. taicpu(hp1).condition:=condition;
  8366. UpdateUsedRegs(hp1);
  8367. GetNextInstruction(hp1,hp1);
  8368. end;
  8369. hp1 := p;
  8370. { Get first instruction after label }
  8371. GetNextInstruction(hp3, p);
  8372. if assigned(p) and (hp3.typ = ait_align) then
  8373. GetNextInstruction(p, p);
  8374. { Don't dereference yet, as doing so will cause
  8375. GetNextInstruction to skip the label and
  8376. optional align marker. [Kit] }
  8377. GetNextInstruction(hp2, hp4);
  8378. DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
  8379. { remove jCC }
  8380. RemoveInstruction(hp1);
  8381. { Now we can safely decrement it }
  8382. tasmlabel(symbol).decrefs;
  8383. { Remove label xxx (it will have a ref of zero due to the initial check }
  8384. StripLabelFast(hp4);
  8385. { remove jmp }
  8386. symbol := taicpu(hp2).oper[0]^.ref^.symbol;
  8387. RemoveInstruction(hp2);
  8388. { As before, now we can safely decrement it }
  8389. tasmlabel(symbol).decrefs;
  8390. { Remove label yyy (and the optional alignment) if its reference falls to zero }
  8391. if tasmlabel(symbol).getrefs = 0 then
  8392. StripLabelFast(hp3);
  8393. if Assigned(p) then
  8394. begin
  8395. UpdateUsedRegs(p);
  8396. result:=true;
  8397. end;
  8398. exit;
  8399. end;
  8400. end;
  8401. end;
  8402. end;
  8403. {$endif i8086}
  8404. end;
  8405. end;
  8406. end;
  8407. function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  8408. var
  8409. hp1,hp2: tai;
  8410. reg_and_hp1_is_instr: Boolean;
  8411. begin
  8412. result:=false;
  8413. reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
  8414. GetNextInstruction(p,hp1) and
  8415. (hp1.typ = ait_instruction);
  8416. if reg_and_hp1_is_instr and
  8417. (
  8418. (taicpu(hp1).opcode <> A_LEA) or
  8419. { If the LEA instruction can be converted into an arithmetic instruction,
  8420. it may be possible to then fold it. }
  8421. (
  8422. { If the flags register is in use, don't change the instruction
  8423. to an ADD otherwise this will scramble the flags. [Kit] }
  8424. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
  8425. ConvertLEA(taicpu(hp1))
  8426. )
  8427. ) and
  8428. IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
  8429. GetNextInstruction(hp1,hp2) and
  8430. MatchInstruction(hp2,A_MOV,[]) and
  8431. (taicpu(hp2).oper[0]^.typ = top_reg) and
  8432. OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
  8433. ((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B) or
  8434. (taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W)) and
  8435. {$ifdef i386}
  8436. { not all registers have byte size sub registers on i386 }
  8437. ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
  8438. {$endif i386}
  8439. (((taicpu(hp1).ops=2) and
  8440. (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
  8441. ((taicpu(hp1).ops=1) and
  8442. (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
  8443. not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
  8444. begin
  8445. { change movsX/movzX reg/ref, reg2
  8446. add/sub/or/... reg3/$const, reg2
  8447. mov reg2 reg/ref
  8448. to add/sub/or/... reg3/$const, reg/ref }
  8449. { by example:
  8450. movswl %si,%eax movswl %si,%eax p
  8451. decl %eax addl %edx,%eax hp1
  8452. movw %ax,%si movw %ax,%si hp2
  8453. ->
  8454. movswl %si,%eax movswl %si,%eax p
  8455. decw %eax addw %edx,%eax hp1
  8456. movw %ax,%si movw %ax,%si hp2
  8457. }
  8458. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  8459. {
  8460. ->
  8461. movswl %si,%eax movswl %si,%eax p
  8462. decw %si addw %dx,%si hp1
  8463. movw %ax,%si movw %ax,%si hp2
  8464. }
  8465. case taicpu(hp1).ops of
  8466. 1:
  8467. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  8468. 2:
  8469. begin
  8470. taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
  8471. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  8472. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  8473. end;
  8474. else
  8475. internalerror(2008042702);
  8476. end;
  8477. {
  8478. ->
  8479. decw %si addw %dx,%si p
  8480. }
  8481. DebugMsg(SPeepholeOptimization + 'var3',p);
  8482. RemoveCurrentP(p, hp1);
  8483. RemoveInstruction(hp2);
  8484. end
  8485. else if reg_and_hp1_is_instr and
  8486. (taicpu(hp1).opcode = A_MOV) and
  8487. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  8488. (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
  8489. {$ifdef x86_64}
  8490. { check for implicit extension to 64 bit }
  8491. or
  8492. ((taicpu(p).opsize in [S_BL,S_WL]) and
  8493. (taicpu(hp1).opsize=S_Q) and
  8494. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
  8495. )
  8496. {$endif x86_64}
  8497. )
  8498. then
  8499. begin
  8500. { change
  8501. movx %reg1,%reg2
  8502. mov %reg2,%reg3
  8503. dealloc %reg2
  8504. into
  8505. movx %reg,%reg3
  8506. }
  8507. TransferUsedRegs(TmpUsedRegs);
  8508. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  8509. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  8510. begin
  8511. DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
  8512. {$ifdef x86_64}
  8513. if (taicpu(p).opsize in [S_BL,S_WL]) and
  8514. (taicpu(hp1).opsize=S_Q) then
  8515. taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
  8516. else
  8517. {$endif x86_64}
  8518. taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
  8519. RemoveInstruction(hp1);
  8520. end;
  8521. end
  8522. else if reg_and_hp1_is_instr and
  8523. ((taicpu(hp1).opcode=A_MOV) or
  8524. (taicpu(hp1).opcode=A_ADD) or
  8525. (taicpu(hp1).opcode=A_SUB) or
  8526. (taicpu(hp1).opcode=A_CMP) or
  8527. (taicpu(hp1).opcode=A_OR) or
  8528. (taicpu(hp1).opcode=A_XOR) or
  8529. (taicpu(hp1).opcode=A_AND)
  8530. ) and
  8531. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  8532. (((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
  8533. (taicpu(hp1).opsize=S_B)) or
  8534. ((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
  8535. (taicpu(hp1).opsize=S_W))
  8536. {$ifdef x86_64}
  8537. or ((taicpu(p).opsize=S_LQ) and
  8538. (taicpu(hp1).opsize=S_L))
  8539. {$endif x86_64}
  8540. ) and
  8541. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
  8542. begin
  8543. { change
  8544. movx %reg1,%reg2
  8545. mov %reg2,%reg3
  8546. dealloc %reg2
  8547. into
  8548. mov %reg1,%reg3
  8549. if the second mov accesses only the bits stored in reg1
  8550. }
  8551. TransferUsedRegs(TmpUsedRegs);
  8552. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  8553. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  8554. begin
  8555. DebugMsg(SPeepholeOptimization + 'MovxOp2Op',p);
  8556. if taicpu(p).oper[0]^.typ=top_reg then
  8557. begin
  8558. case taicpu(hp1).opsize of
  8559. S_B:
  8560. taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
  8561. S_W:
  8562. taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
  8563. S_L:
  8564. taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
  8565. else
  8566. Internalerror(2020102301);
  8567. end;
  8568. AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
  8569. end
  8570. else
  8571. taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
  8572. RemoveCurrentP(p);
  8573. result:=true;
  8574. exit;
  8575. end;
  8576. end
  8577. else if reg_and_hp1_is_instr and
  8578. (taicpu(p).oper[0]^.typ = top_reg) and
  8579. (
  8580. (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
  8581. ) and
  8582. (taicpu(hp1).oper[0]^.typ = top_const) and
  8583. SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
  8584. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
  8585. { Minimum shift value allowed is the bit difference between the sizes }
  8586. (taicpu(hp1).oper[0]^.val >=
  8587. { Multiply by 8 because tcgsize2size returns bytes, not bits }
  8588. 8 * (
  8589. tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
  8590. tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
  8591. )
  8592. ) then
  8593. begin
  8594. { For:
  8595. movsx/movzx %reg1,%reg1 (same register, just different sizes)
  8596. shl/sal ##, %reg1
  8597. Remove the movsx/movzx instruction if the shift overwrites the
  8598. extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax
  8599. }
  8600. DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
  8601. RemoveCurrentP(p, hp1);
  8602. Result := True;
  8603. Exit;
  8604. end
  8605. else if reg_and_hp1_is_instr and
  8606. (taicpu(p).oper[0]^.typ = top_reg) and
  8607. (
  8608. ((taicpu(hp1).opcode = A_SHR) and (taicpu(p).opcode = A_MOVZX)) or
  8609. ((taicpu(hp1).opcode = A_SAR) and (taicpu(p).opcode <> A_MOVZX))
  8610. ) and
  8611. (taicpu(hp1).oper[0]^.typ = top_const) and
  8612. SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
  8613. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
  8614. { Minimum shift value allowed is the bit size of the smallest register - 1 }
  8615. (taicpu(hp1).oper[0]^.val <
  8616. { Multiply by 8 because tcgsize2size returns bytes, not bits }
  8617. 8 * (
  8618. tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
  8619. )
  8620. ) then
  8621. begin
  8622. { For:
  8623. movsx %reg1,%reg1 movzx %reg1,%reg1 (same register, just different sizes)
  8624. sar ##, %reg1 shr ##, %reg1
  8625. Move the shift to before the movx instruction if the shift value
  8626. is not too large.
  8627. }
  8628. asml.Remove(hp1);
  8629. asml.InsertBefore(hp1, p);
  8630. taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;
  8631. case taicpu(p).opsize of
  8632. s_BW, S_BL{$ifdef x86_64}, S_BQ{$endif}:
  8633. taicpu(hp1).opsize := S_B;
  8634. S_WL{$ifdef x86_64}, S_WQ{$endif}:
  8635. taicpu(hp1).opsize := S_W;
  8636. {$ifdef x86_64}
  8637. S_LQ:
  8638. taicpu(hp1).opsize := S_L;
  8639. {$endif}
  8640. else
  8641. InternalError(2020112401);
  8642. end;
  8643. if (taicpu(hp1).opcode = A_SHR) then
  8644. DebugMsg(SPeepholeOptimization + 'MovzShr2ShrMovz', hp1)
  8645. else
  8646. DebugMsg(SPeepholeOptimization + 'MovsSar2SarMovs', hp1);
  8647. Result := True;
  8648. end
  8649. else if taicpu(p).opcode=A_MOVZX then
  8650. begin
  8651. { removes superfluous And's after movzx's }
  8652. if reg_and_hp1_is_instr and
  8653. (taicpu(hp1).opcode = A_AND) and
  8654. MatchOpType(taicpu(hp1),top_const,top_reg) and
  8655. ((taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)
  8656. {$ifdef x86_64}
  8657. { check for implicit extension to 64 bit }
  8658. or
  8659. ((taicpu(p).opsize in [S_BL,S_WL]) and
  8660. (taicpu(hp1).opsize=S_Q) and
  8661. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg)
  8662. )
  8663. {$endif x86_64}
  8664. )
  8665. then
  8666. begin
  8667. case taicpu(p).opsize Of
  8668. S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
  8669. if (taicpu(hp1).oper[0]^.val = $ff) then
  8670. begin
  8671. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz1',p);
  8672. RemoveInstruction(hp1);
  8673. Result:=true;
  8674. exit;
  8675. end;
  8676. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  8677. if (taicpu(hp1).oper[0]^.val = $ffff) then
  8678. begin
  8679. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz2',p);
  8680. RemoveInstruction(hp1);
  8681. Result:=true;
  8682. exit;
  8683. end;
  8684. {$ifdef x86_64}
  8685. S_LQ:
  8686. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  8687. begin
  8688. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz3',p);
  8689. RemoveInstruction(hp1);
  8690. Result:=true;
  8691. exit;
  8692. end;
  8693. {$endif x86_64}
  8694. else
  8695. ;
  8696. end;
  8697. { we cannot get rid of the and, but can we get rid of the movz ?}
  8698. if SuperRegistersEqual(taicpu(p).oper[0]^.reg,taicpu(p).oper[1]^.reg) then
  8699. begin
  8700. case taicpu(p).opsize Of
  8701. S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
  8702. if (taicpu(hp1).oper[0]^.val and $ff)=taicpu(hp1).oper[0]^.val then
  8703. begin
  8704. DebugMsg(SPeepholeOptimization + 'MovzAnd2And1',p);
  8705. RemoveCurrentP(p,hp1);
  8706. Result:=true;
  8707. exit;
  8708. end;
  8709. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  8710. if (taicpu(hp1).oper[0]^.val and $ffff)=taicpu(hp1).oper[0]^.val then
  8711. begin
  8712. DebugMsg(SPeepholeOptimization + 'MovzAnd2And2',p);
  8713. RemoveCurrentP(p,hp1);
  8714. Result:=true;
  8715. exit;
  8716. end;
  8717. {$ifdef x86_64}
  8718. S_LQ:
  8719. if (taicpu(hp1).oper[0]^.val and $ffffffff)=taicpu(hp1).oper[0]^.val then
  8720. begin
  8721. DebugMsg(SPeepholeOptimization + 'MovzAnd2And3',p);
  8722. RemoveCurrentP(p,hp1);
  8723. Result:=true;
  8724. exit;
  8725. end;
  8726. {$endif x86_64}
  8727. else
  8728. ;
  8729. end;
  8730. end;
  8731. end;
  8732. { changes some movzx constructs to faster synonyms (all examples
  8733. are given with eax/ax, but are also valid for other registers)}
  8734. if MatchOpType(taicpu(p),top_reg,top_reg) then
  8735. begin
  8736. case taicpu(p).opsize of
  8737. { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
  8738. (the machine code is equivalent to movzbl %al,%eax), but the
  8739. code generator still generates that assembler instruction and
  8740. it is silently converted. This should probably be checked.
  8741. [Kit] }
  8742. S_BW:
  8743. begin
  8744. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
  8745. (
  8746. not IsMOVZXAcceptable
  8747. { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
  8748. or (
  8749. (cs_opt_size in current_settings.optimizerswitches) and
  8750. (taicpu(p).oper[1]^.reg = NR_AX)
  8751. )
  8752. ) then
  8753. {Change "movzbw %al, %ax" to "andw $0x0ffh, %ax"}
  8754. begin
  8755. DebugMsg(SPeepholeOptimization + 'var7',p);
  8756. taicpu(p).opcode := A_AND;
  8757. taicpu(p).changeopsize(S_W);
  8758. taicpu(p).loadConst(0,$ff);
  8759. Result := True;
  8760. end
  8761. else if not IsMOVZXAcceptable and
  8762. GetNextInstruction(p, hp1) and
  8763. (tai(hp1).typ = ait_instruction) and
  8764. (taicpu(hp1).opcode = A_AND) and
  8765. MatchOpType(taicpu(hp1),top_const,top_reg) and
  8766. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  8767. { Change "movzbw %reg1, %reg2; andw $const, %reg2"
  8768. to "movw %reg1, reg2; andw $(const1 and $ff), %reg2"}
  8769. begin
  8770. DebugMsg(SPeepholeOptimization + 'var8',p);
  8771. taicpu(p).opcode := A_MOV;
  8772. taicpu(p).changeopsize(S_W);
  8773. setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
  8774. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  8775. Result := True;
  8776. end;
  8777. end;
  8778. {$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax }
  8779. S_BL:
  8780. begin
  8781. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
  8782. (
  8783. not IsMOVZXAcceptable
  8784. { and $0xff,%eax has a smaller encoding but risks a partial write penalty }
  8785. or (
  8786. (cs_opt_size in current_settings.optimizerswitches) and
  8787. (taicpu(p).oper[1]^.reg = NR_EAX)
  8788. )
  8789. ) then
  8790. { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
  8791. begin
  8792. DebugMsg(SPeepholeOptimization + 'var9',p);
  8793. taicpu(p).opcode := A_AND;
  8794. taicpu(p).changeopsize(S_L);
  8795. taicpu(p).loadConst(0,$ff);
  8796. Result := True;
  8797. end
  8798. else if not IsMOVZXAcceptable and
  8799. GetNextInstruction(p, hp1) and
  8800. (tai(hp1).typ = ait_instruction) and
  8801. (taicpu(hp1).opcode = A_AND) and
  8802. MatchOpType(taicpu(hp1),top_const,top_reg) and
  8803. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  8804. { Change "movzbl %reg1, %reg2; andl $const, %reg2"
  8805. to "movl %reg1, reg2; andl $(const1 and $ff), %reg2"}
  8806. begin
  8807. DebugMsg(SPeepholeOptimization + 'var10',p);
  8808. taicpu(p).opcode := A_MOV;
  8809. taicpu(p).changeopsize(S_L);
  8810. { do not use R_SUBWHOLE
  8811. as movl %rdx,%eax
  8812. is invalid in assembler PM }
  8813. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  8814. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  8815. Result := True;
  8816. end;
  8817. end;
  8818. {$endif i8086}
  8819. S_WL:
  8820. if not IsMOVZXAcceptable then
  8821. begin
  8822. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
  8823. { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
  8824. begin
  8825. DebugMsg(SPeepholeOptimization + 'var11',p);
  8826. taicpu(p).opcode := A_AND;
  8827. taicpu(p).changeopsize(S_L);
  8828. taicpu(p).loadConst(0,$ffff);
  8829. Result := True;
  8830. end
  8831. else if GetNextInstruction(p, hp1) and
  8832. (tai(hp1).typ = ait_instruction) and
  8833. (taicpu(hp1).opcode = A_AND) and
  8834. (taicpu(hp1).oper[0]^.typ = top_const) and
  8835. (taicpu(hp1).oper[1]^.typ = top_reg) and
  8836. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  8837. { Change "movzwl %reg1, %reg2; andl $const, %reg2"
  8838. to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2"}
  8839. begin
  8840. DebugMsg(SPeepholeOptimization + 'var12',p);
  8841. taicpu(p).opcode := A_MOV;
  8842. taicpu(p).changeopsize(S_L);
  8843. { do not use R_SUBWHOLE
  8844. as movl %rdx,%eax
  8845. is invalid in assembler PM }
  8846. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  8847. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
  8848. Result := True;
  8849. end;
  8850. end;
  8851. else
  8852. InternalError(2017050705);
  8853. end;
  8854. end
  8855. else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
  8856. begin
  8857. if GetNextInstruction(p, hp1) and
  8858. (tai(hp1).typ = ait_instruction) and
  8859. (taicpu(hp1).opcode = A_AND) and
  8860. MatchOpType(taicpu(hp1),top_const,top_reg) and
  8861. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  8862. begin
  8863. //taicpu(p).opcode := A_MOV;
  8864. case taicpu(p).opsize Of
  8865. S_BL:
  8866. begin
  8867. DebugMsg(SPeepholeOptimization + 'var13',p);
  8868. taicpu(hp1).changeopsize(S_L);
  8869. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  8870. end;
  8871. S_WL:
  8872. begin
  8873. DebugMsg(SPeepholeOptimization + 'var14',p);
  8874. taicpu(hp1).changeopsize(S_L);
  8875. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
  8876. end;
  8877. S_BW:
  8878. begin
  8879. DebugMsg(SPeepholeOptimization + 'var15',p);
  8880. taicpu(hp1).changeopsize(S_W);
  8881. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  8882. end;
  8883. else
  8884. Internalerror(2017050704)
  8885. end;
  8886. Result := True;
  8887. end;
  8888. end;
  8889. end;
  8890. end;
  8891. function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
  { Pass-1 peephole optimisations for AND instructions whose destination is
    a register.  Forward-combines p with the following instruction:
      and+and -> single and; and+cmp(power of 2) -> and+test with inverted
      condition; and+movzx / and+movsx -> drop the extension when the mask
      already confines the value; and+shl / and+shr -> drop the and when the
      shift discards the unmasked bits; and+jcc -> test+jcc when the
      register dies at the jump.
    Afterwards handles lone ANDs (full-width masks rewritten as
    "and reg,reg") and a backward check that removes an "and reg,reg"
    rendered redundant by the size of the preceding instruction.
    Returns True when the instruction stream changed. }
  8892. var
  8893. hp1, hp2 : tai;
  8894. MaskLength : Cardinal;
  8895. MaskedBits : TCgInt;
  8896. begin
  8897. Result:=false;
  8898. { There are no optimisations for reference targets }
  8899. if (taicpu(p).oper[1]^.typ <> top_reg) then
  8900. Exit;
  { Forward scan: the case arms below either transform and Exit, or use
    'Continue' after removing hp1 to look for further combinations with the
    new next instruction; the unconditional 'Break' at the bottom ends the
    loop once no more matches are found }
  8901. while GetNextInstruction(p, hp1) and
  8902. (hp1.typ = ait_instruction) do
  8903. begin
  8904. if (taicpu(p).oper[0]^.typ = top_const) then
  8905. begin
  8906. case taicpu(hp1).opcode of
  8907. A_AND:
  8908. if MatchOpType(taicpu(hp1),top_const,top_reg) and
  8909. (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  8910. { the second register must contain the first one, so compare their subreg types }
  8911. (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
  8912. (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
  8913. { change
  8914. and const1, reg
  8915. and const2, reg
  8916. to
  8917. and (const1 and const2), reg
  8918. }
  8919. begin
  8920. taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
  8921. DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
  8922. RemoveCurrentP(p, hp1);
  8923. Result:=true;
  8924. exit;
  8925. end;
  8926. A_CMP:
  8927. if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
  8928. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
  8929. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
  8930. { Just check that the condition on the next instruction is compatible }
  8931. GetNextInstruction(hp1, hp2) and
  8932. (hp2.typ = ait_instruction) and
  8933. (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
  8934. then
  8935. { change
  8936. and 2^n, reg
  8937. cmp 2^n, reg
  8938. j(c) / set(c) / cmov(c) (c is equal or not equal)
  8939. to
  8940. and 2^n, reg
  8941. test reg, reg
  8942. j(~c) / set(~c) / cmov(~c)
  8943. }
  8944. begin
  8945. { Keep TEST instruction in, rather than remove it, because
  8946. it may trigger other optimisations such as MovAndTest2Test }
  8947. taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
  8948. taicpu(hp1).opcode := A_TEST;
  8949. DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
  8950. taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
  8951. Result := True;
  8952. Exit;
  8953. end;
  8954. A_MOVZX:
  8955. if MatchOpType(taicpu(hp1),top_reg,top_reg) and
  8956. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
  8957. (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
  8958. (
  8959. (
  8960. (taicpu(p).opsize=S_W) and
  8961. (taicpu(hp1).opsize=S_BW)
  8962. ) or
  8963. (
  8964. (taicpu(p).opsize=S_L) and
  8965. (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
  8966. )
  8967. {$ifdef x86_64}
  8968. or
  8969. (
  8970. (taicpu(p).opsize=S_Q) and
  8971. (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
  8972. )
  8973. {$endif x86_64}
  8974. ) then
  8975. begin
  { the movzx is redundant when the AND mask already fits in the
    width being zero-extended from }
  8976. if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
  8977. ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
  8978. ) or
  8979. (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
  8980. ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
  8981. then
  8982. begin
  8983. { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
  8984. 32-bit register to a 64-bit register, or even a version called MOVZXD, so
  8985. code that tests for the presence of AND 0xffffffff followed by MOVZX is
  8986. wasted, and is indictive of a compiler bug if it were triggered. [Kit]
  8987. NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
  8988. }
  8989. DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
  8990. RemoveInstruction(hp1);
  8991. { See if there are other optimisations possible }
  8992. Continue;
  8993. end;
  8994. end;
  8995. A_SHL:
  8996. if MatchOpType(taicpu(hp1),top_const,top_reg) and
  8997. (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
  8998. begin
  { Range checking is deliberately switched off for the MaskLength
    computation: the subtraction below may wrap for values that are
    not a contiguous low-bit mask (the following test then simply
    fails).  NOTE(review): behaviour of BsrQWord for val=0 is
    assumed to make the test fail safely - verify. }
  8999. {$ifopt R+}
  9000. {$define RANGE_WAS_ON}
  9001. {$R-}
  9002. {$endif}
  9003. { get length of potential and mask }
  9004. MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
  9005. { really a mask? }
  9006. {$ifdef RANGE_WAS_ON}
  9007. {$R+}
  9008. {$endif}
  9009. if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
  9010. { unmasked part shifted out? }
  9011. ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
  9012. begin
  9013. DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
  9014. RemoveCurrentP(p, hp1);
  9015. Result:=true;
  9016. exit;
  9017. end;
  9018. end;
  9019. A_SHR:
  9020. if MatchOpType(taicpu(hp1),top_const,top_reg) and
  9021. (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
  9022. (taicpu(hp1).oper[0]^.val <= 63) then
  9023. begin
  9024. { Does SHR combined with the AND cover all the bits?
  9025. e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
  9026. MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
  9027. if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
  9028. ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
  9029. ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
  9030. begin
  9031. DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
  9032. RemoveCurrentP(p, hp1);
  9033. Result := True;
  9034. Exit;
  9035. end;
  9036. end;
  9037. A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
  9038. if (taicpu(hp1).oper[0]^.typ = top_reg) and
  9039. SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
  9040. begin
  { the AND mask keeps the sign bit of the narrower width clear, so
    sign-extension cannot change the value }
  9041. if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
  9042. (
  9043. (
  9044. (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
  9045. ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
  9046. ) or (
  9047. (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
  9048. ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
  9049. {$ifdef x86_64}
  9050. ) or (
  9051. (taicpu(hp1).opsize = S_LQ) and
  9052. ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
  9053. {$endif x86_64}
  9054. )
  9055. ) then
  9056. begin
  9057. if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
  9058. begin
  9059. DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
  9060. RemoveInstruction(hp1);
  9061. { See if there are other optimisations possible }
  9062. Continue;
  9063. end;
  9064. { The super-registers are the same though.
  9065. Note that this change by itself doesn't improve
  9066. code speed, but it opens up other optimisations. }
  9067. {$ifdef x86_64}
  9068. { Convert 64-bit register to 32-bit }
  9069. case taicpu(hp1).opsize of
  9070. S_BQ:
  9071. begin
  9072. taicpu(hp1).opsize := S_BL;
  9073. taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
  9074. end;
  9075. S_WQ:
  9076. begin
  9077. taicpu(hp1).opsize := S_WL;
  9078. taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
  9079. end
  9080. else
  9081. ;
  9082. end;
  9083. {$endif x86_64}
  9084. DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
  9085. taicpu(hp1).opcode := A_MOVZX;
  9086. { See if there are other optimisations possible }
  9087. Continue;
  9088. end;
  9089. end;
  9090. else
  9091. ;
  9092. end;
  9093. end;
  9094. if (taicpu(hp1).is_jmp) and
  9095. (taicpu(hp1).opcode<>A_JMP) and
  9096. not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
  9097. begin
  9098. { change
  9099. and x, reg
  9100. jxx
  9101. to
  9102. test x, reg
  9103. jxx
  9104. if reg is deallocated before the
  9105. jump, but only if it's a conditional jump (PFV)
  9106. }
  9107. taicpu(p).opcode := A_TEST;
  9108. Exit;
  9109. end;
  9110. Break;
  9111. end;
  9112. { Lone AND tests }
  9113. if (taicpu(p).oper[0]^.typ = top_const) then
  9114. begin
  9115. {
  9116. - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
  9117. - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
  9118. - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
  9119. }
  9120. if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
  9121. ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
  9122. ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
  9123. begin
  9124. taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
  9125. if taicpu(p).opsize = S_L then
  9126. begin
  9127. Include(OptsToCheck,aoc_MovAnd2Mov_3);
  9128. Result := True;
  9129. end;
  9130. end;
  9131. end;
  9132. { Backward check to determine necessity of and %reg,%reg }
  9133. if (taicpu(p).oper[0]^.typ = top_reg) and
  9134. (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
  9135. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
  9136. GetLastInstruction(p, hp2) and
  9137. RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp2) and
  9138. { Check size of adjacent instruction to determine if the AND is
  9139. effectively a null operation }
  9140. (
  9141. (taicpu(p).opsize = taicpu(hp2).opsize) or
  9142. { Note: Don't include S_Q }
  9143. ((taicpu(p).opsize = S_L) and (taicpu(hp2).opsize in [S_BL, S_WL])) or
  9144. ((taicpu(p).opsize = S_W) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_L])) or
  9145. ((taicpu(p).opsize = S_B) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_W, S_L]))
  9146. ) then
  9147. begin
  9148. DebugMsg(SPeepholeOptimization + 'And2Nop', p);
  9149. { If GetNextInstruction returned False, hp1 will be nil }
  9150. RemoveCurrentP(p, hp1);
  9151. Result := True;
  9152. Exit;
  9153. end;
  9154. end;
  9155. function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
  9156. var
  9157. hp1: tai; NewRef: TReference;
  9158. { This entire nested function is used in an if-statement below, but we
  9159. want to avoid all the used reg transfers and GetNextInstruction calls
  9160. until we really have to check }
  9161. function MemRegisterNotUsedLater: Boolean; inline;
  9162. var
  9163. hp2: tai;
  9164. begin
  9165. TransferUsedRegs(TmpUsedRegs);
  9166. hp2 := p;
  9167. repeat
  9168. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  9169. until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
  9170. Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
  9171. end;
  9172. begin
  9173. Result := False;
  9174. if not GetNextInstruction(p, hp1) or (hp1.typ <> ait_instruction) then
  9175. Exit;
  9176. if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) then
  9177. begin
  9178. { Change:
  9179. add %reg2,%reg1
  9180. mov/s/z #(%reg1),%reg1 (%reg1 superregisters must be the same)
  9181. To:
  9182. mov/s/z #(%reg1,%reg2),%reg1
  9183. }
  9184. if MatchOpType(taicpu(p), top_reg, top_reg) and
  9185. MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
  9186. MatchOpType(taicpu(hp1), top_ref, top_reg) and
  9187. (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
  9188. (
  9189. (
  9190. (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
  9191. (taicpu(hp1).oper[0]^.ref^.index = NR_NO) and
  9192. { r/esp cannot be an index }
  9193. (taicpu(p).oper[0]^.reg<>NR_STACK_POINTER_REG)
  9194. ) or (
  9195. (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
  9196. (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
  9197. )
  9198. ) and (
  9199. Reg1WriteOverwritesReg2Entirely(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) or
  9200. (
  9201. { If the super registers ARE equal, then this MOV/S/Z does a partial write }
  9202. not SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
  9203. MemRegisterNotUsedLater
  9204. )
  9205. ) then
  9206. begin
  9207. taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
  9208. taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
  9209. DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
  9210. RemoveCurrentp(p, hp1);
  9211. Result := True;
  9212. Exit;
  9213. end;
  9214. { Change:
  9215. addl/q $x,%reg1
  9216. movl/q %reg1,%reg2
  9217. To:
  9218. leal/q $x(%reg1),%reg2
  9219. addl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
  9220. Breaks the dependency chain.
  9221. }
  9222. if MatchOpType(taicpu(p),top_const,top_reg) and
  9223. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  9224. (taicpu(hp1).oper[1]^.typ = top_reg) and
  9225. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
  9226. (
  9227. { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
  9228. not (cs_opt_size in current_settings.optimizerswitches) or
  9229. (
  9230. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
  9231. RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
  9232. )
  9233. ) then
  9234. begin
  9235. { Change the MOV instruction to a LEA instruction, and update the
  9236. first operand }
  9237. reference_reset(NewRef, 1, []);
  9238. NewRef.base := taicpu(p).oper[1]^.reg;
  9239. NewRef.scalefactor := 1;
  9240. NewRef.offset := taicpu(p).oper[0]^.val;
  9241. taicpu(hp1).opcode := A_LEA;
  9242. taicpu(hp1).loadref(0, NewRef);
  9243. TransferUsedRegs(TmpUsedRegs);
  9244. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  9245. if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
  9246. RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
  9247. begin
  9248. { Move what is now the LEA instruction to before the SUB instruction }
  9249. Asml.Remove(hp1);
  9250. Asml.InsertBefore(hp1, p);
  9251. AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
  9252. DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
  9253. p := hp1;
  9254. end
  9255. else
  9256. begin
  9257. { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
  9258. RemoveCurrentP(p, hp1);
  9259. DebugMsg(SPeepholeOptimization + 'AddMov2Lea', p);
  9260. end;
  9261. Result := True;
  9262. end;
  9263. end;
  9264. end;
  9265. function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  9266. begin
  9267. Result:=false;
  9268. if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
  9269. (taicpu(p).oper[0]^.ref^.offset = 0) then
  9270. begin
  9271. if (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
  9272. (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  9273. (taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
  9274. begin
  9275. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.base);
  9276. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.index);
  9277. taicpu(p).opcode:=A_ADD;
  9278. DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
  9279. result:=true;
  9280. end
  9281. else if (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) then
  9282. begin
  9283. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
  9284. begin
  9285. if (taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
  9286. begin
  9287. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
  9288. taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.base);
  9289. taicpu(p).opcode:=A_ADD;
  9290. DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
  9291. result:=true;
  9292. end;
  9293. end
  9294. else
  9295. if (taicpu(p).oper[0]^.ref^.scalefactor in [2, 4, 8]) then
  9296. begin
  9297. { BsrByte is, in essence, the base-2 logarithm of the scale factor }
  9298. taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
  9299. taicpu(p).loadconst(0, BsrByte(taicpu(p).oper[0]^.ref^.scalefactor));
  9300. taicpu(p).opcode:=A_SHL;
  9301. DebugMsg(SPeepholeOptimization + 'Lea2Shl done',p);
  9302. result:=true;
  9303. end;
  9304. end;
  9305. end;
  9306. end;
  9307. function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
  9308. var
  9309. hp1: tai; NewRef: TReference;
  9310. begin
  9311. { Change:
  9312. subl/q $x,%reg1
  9313. movl/q %reg1,%reg2
  9314. To:
  9315. leal/q $-x(%reg1),%reg2
  9316. subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
  9317. Breaks the dependency chain and potentially permits the removal of
  9318. a CMP instruction if one follows.
  9319. }
  9320. Result := False;
  9321. if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
  9322. MatchOpType(taicpu(p),top_const,top_reg) and
  9323. GetNextInstruction(p, hp1) and
  9324. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  9325. (taicpu(hp1).oper[1]^.typ = top_reg) and
  9326. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
  9327. (
  9328. { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
  9329. not (cs_opt_size in current_settings.optimizerswitches) or
  9330. (
  9331. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
  9332. RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
  9333. )
  9334. ) then
  9335. begin
  9336. { Change the MOV instruction to a LEA instruction, and update the
  9337. first operand }
  9338. reference_reset(NewRef, 1, []);
  9339. NewRef.base := taicpu(p).oper[1]^.reg;
  9340. NewRef.scalefactor := 1;
  9341. NewRef.offset := -taicpu(p).oper[0]^.val;
  9342. taicpu(hp1).opcode := A_LEA;
  9343. taicpu(hp1).loadref(0, NewRef);
  9344. TransferUsedRegs(TmpUsedRegs);
  9345. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  9346. if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
  9347. RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
  9348. begin
  9349. { Move what is now the LEA instruction to before the SUB instruction }
  9350. Asml.Remove(hp1);
  9351. Asml.InsertBefore(hp1, p);
  9352. AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
  9353. DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
  9354. p := hp1;
  9355. end
  9356. else
  9357. begin
  9358. { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
  9359. RemoveCurrentP(p, hp1);
  9360. DebugMsg(SPeepholeOptimization + 'SubMov2Lea', p);
  9361. end;
  9362. Result := True;
  9363. end;
  9364. end;
  9365. function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
  9366. begin
  9367. { we can skip all instructions not messing with the stack pointer }
  9368. while assigned(hp1) and {MatchInstruction(hp1,[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
  9369. A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
  9370. A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
  9371. A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
  9372. ({(taicpu(hp1).ops=0) or }
  9373. ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
  9374. (MatchOpType(taicpu(hp1),top_ref,top_reg))
  9375. ) and }
  9376. not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
  9377. )
  9378. ) do
  9379. GetNextInstruction(hp1,hp1);
  9380. Result:=assigned(hp1);
  9381. end;
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4, hp5: tai;
  begin
    Result:=false;
    { hp5 records an optional VZEROUPPER that must be preserved in front
      of the final JMP }
    hp5:=nil;
    { replace
        leal(q) x(<stackpointer>),<stackpointer>
        call procname
        leal(q) -x(<stackpointer>),<stackpointer>
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      { p must be "lea offset(%esp/%rsp),%esp/%rsp" with a bare offset }
      MatchOpType(taicpu(p),top_ref,top_reg) and
      (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and
      { the -8 or -24 are not required, but bail out early if possible,
        higher values are unlikely }
      ((taicpu(p).oper[0]^.ref^.offset=-8) or
       (taicpu(p).oper[0]^.ref^.offset=-24)) and
      (taicpu(p).oper[0]^.ref^.symbol=nil) and
      (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      { instructions that do not touch the stack pointer may lie between
        the LEA and the CALL }
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      { the second LEA must exactly undo the first adjustment }
      MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_ref,top_reg) and
      (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
      (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
      (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      { allow an optional VZEROUPPER before the RET; remember it in hp5 }
      (MatchInstruction(hp3,A_RET,[S_NO]) or
       (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
        SetAndTest(hp3,hp5) and
        GetNextInstruction(hp3,hp3) and
        MatchInstruction(hp3,A_RET,[S_NO])
       )
      ) and
      { the RET must not pop any bytes itself }
      (taicpu(hp3).ops=0) then
      begin
        { turn the CALL into a tail-jump }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
        { hp4 is the original successor of p, so scanning resumes there }
        RemoveCurrentP(p, hp4);
        RemoveInstruction(hp2);
        RemoveInstruction(hp3);
        { move the remembered VZEROUPPER in front of the JMP }
        if Assigned(hp5) then
          begin
            AsmL.Remove(hp5);
            ASmL.InsertBefore(hp5,hp1)
          end;
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
{$ifdef x86_64}
  var
    hp1, hp2, hp3, hp4, hp5: tai;
{$endif x86_64}
  begin
    Result:=false;
{$ifdef x86_64}
    { hp5 records an optional VZEROUPPER that must be preserved in front
      of the final JMP }
    hp5:=nil;
    { replace
        push %rax
        call procname
        pop %rcx
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces
      It depends on the fact, that the sequence push rax/pop rcx is used for stack alignment as rcx is volatile
      for all supported calling conventions
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_reg) and
      (taicpu(p).oper[0]^.reg=NR_RAX) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      { instructions that do not touch the stack pointer may lie between
        the PUSH and the CALL }
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      { the POP must undo the alignment PUSH into the volatile %rcx }
      MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_reg) and
      (taicpu(hp2).oper[0]^.reg=NR_RCX) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      { allow an optional VZEROUPPER before the RET; remember it in hp5 }
      (MatchInstruction(hp3,A_RET,[S_NO]) or
       (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
        SetAndTest(hp3,hp5) and
        GetNextInstruction(hp3,hp3) and
        MatchInstruction(hp3,A_RET,[S_NO])
       )
      ) and
      { the RET must not pop any bytes itself }
      (taicpu(hp3).ops=0) then
      begin
        { turn the CALL into a tail-jump }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
        { hp4 is the original successor of p, so scanning resumes there }
        RemoveCurrentP(p, hp4);
        RemoveInstruction(hp2);
        RemoveInstruction(hp3);
        { move the remembered VZEROUPPER in front of the JMP }
        if Assigned(hp5) then
          begin
            AsmL.Remove(hp5);
            ASmL.InsertBefore(hp5,hp1)
          end;
        Result:=true;
      end;
{$endif x86_64}
  end;
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
  var
    Value, RegName: string;
  begin
    Result:=false;
    { Only "mov $const,%reg" is of interest here }
    if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
      begin
        { CAUTION: the {$ifdef x86_64} sections below deliberately cut
          across the case arms: when x86_64 is undefined, the "0:" arm
          is closed by the "end;" that follows the {$endif}, and the
          "$1..$FFFFFFFF:" label does not exist at all.  Take care when
          editing. }
        case taicpu(p).oper[0]^.val of
          0:
            { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
            if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $0,%reg" into "xor %reg,%reg" }
                taicpu(p).opcode := A_XOR;
                taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                Result := True;
{$ifdef x86_64}
              end
            { flags are live, but "movq $0,%reg64" can still be shrunk to
              "movl $0,%reg32" since a 32-bit write zeroes the upper half }
            else if (taicpu(p).opsize = S_Q) then
              begin
                RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                { The actual optimization }
                setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                taicpu(p).changeopsize(S_L);
                DebugMsg(SPeepholeOptimization + 'movq $0,' + RegName + ' -> movl $0,' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                Result := True;
              end;
          $1..$FFFFFFFF:
            begin
              { Code size reduction by J. Gareth "Kit" Moreton }
              { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
              case taicpu(p).opsize of
                S_Q:
                  begin
                    RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                    Value := debug_tostr(taicpu(p).oper[0]^.val);
                    { The actual optimization }
                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                    taicpu(p).changeopsize(S_L);
                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                    Result := True;
                  end;
                else
                  { Do nothing };
              end;
{$endif x86_64}
            end;
          -1:
            { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
            if (cs_opt_size in current_settings.optimizerswitches) and
              (taicpu(p).opsize <> S_B) and
              not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $-1,%reg" into "or $-1,%reg" }
                { NOTES:
                  - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                  - This operation creates a false dependency on the register, so only do it when optimising for size
                  - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                }
                taicpu(p).opcode := A_OR;
                Result := True;
              end;
          else
            { Do nothing };
        end;
      end;
  end;
  9580. function TX86AsmOptimizer.PostPeepholeOptAnd(var p : tai) : boolean;
  9581. var
  9582. hp1: tai;
  9583. begin
  9584. { Detect:
  9585. andw x, %ax (0 <= x < $8000)
  9586. ...
  9587. movzwl %ax,%eax
  9588. Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
  9589. }
  9590. Result := False; if MatchOpType(taicpu(p), top_const, top_reg) and
  9591. (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
  9592. ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val) and
  9593. GetNextInstructionUsingReg(p, hp1, NR_EAX) and
  9594. MatchInstruction(hp1, A_MOVZX, [S_WL]) and
  9595. MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
  9596. MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
  9597. begin
  9598. DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via AndMovz2AndCwtl)', hp1);
  9599. taicpu(hp1).opcode := A_CWDE;
  9600. taicpu(hp1).clearop(0);
  9601. taicpu(hp1).clearop(1);
  9602. taicpu(hp1).ops := 0;
  9603. { A change was made, but not with p, so move forward 1 }
  9604. p := tai(p.Next);
  9605. Result := True;
  9606. end;
  9607. end;
  9608. function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  9609. begin
  9610. Result := False;
  9611. if not MatchOpType(taicpu(p), top_reg, top_reg) then
  9612. Exit;
  9613. { Convert:
  9614. movswl %ax,%eax -> cwtl
  9615. movslq %eax,%rax -> cdqe
  9616. NOTE: Don't convert movswl %al,%ax to cbw, because cbw and cwde
  9617. refer to the same opcode and depends only on the assembler's
  9618. current operand-size attribute. [Kit]
  9619. }
  9620. with taicpu(p) do
  9621. case opsize of
  9622. S_WL:
  9623. if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
  9624. begin
  9625. DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
  9626. opcode := A_CWDE;
  9627. clearop(0);
  9628. clearop(1);
  9629. ops := 0;
  9630. Result := True;
  9631. end;
  9632. {$ifdef x86_64}
  9633. S_LQ:
  9634. if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
  9635. begin
  9636. DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
  9637. opcode := A_CDQE;
  9638. clearop(0);
  9639. clearop(1);
  9640. ops := 0;
  9641. Result := True;
  9642. end;
  9643. {$endif x86_64}
  9644. else
  9645. ;
  9646. end;
  9647. end;
  9648. function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
  9649. var
  9650. hp1: tai;
  9651. begin
  9652. { Detect:
  9653. shr x, %ax (x > 0)
  9654. ...
  9655. movzwl %ax,%eax
  9656. Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
  9657. }
  9658. Result := False;
  9659. if MatchOpType(taicpu(p), top_const, top_reg) and
  9660. (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
  9661. (taicpu(p).oper[0]^.val > 0) and
  9662. GetNextInstructionUsingReg(p, hp1, NR_EAX) and
  9663. MatchInstruction(hp1, A_MOVZX, [S_WL]) and
  9664. MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
  9665. MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
  9666. begin
  9667. DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
  9668. taicpu(hp1).opcode := A_CWDE;
  9669. taicpu(hp1).clearop(0);
  9670. taicpu(hp1).clearop(1);
  9671. taicpu(hp1).ops := 0;
  9672. { A change was made, but not with p, so move forward 1 }
  9673. p := tai(p.Next);
  9674. Result := True;
  9675. end;
  9676. end;
function TX86AsmOptimizer.PostPeepholeOptADDSUB(var p : tai) : boolean;
  var
    hp1, hp2: tai;
  begin
    { Detect:
        add/sub %reg2,(dest)
        add/sub x,     (dest)
      (dest can be a register or a reference)

      Swap the instructions to minimise a pipeline stall. This reverses the
      "Add swap" and "Sub swap" optimisations done in pass 1 if no new
      optimisations could be made.
    }
    Result := False;
    { The first operand must be a register that does not also occur in
      the destination operand, otherwise swapping would change the
      computed result }
    if (taicpu(p).oper[0]^.typ = top_reg) and
      not RegInOp(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^) and
      (
        (
          (taicpu(p).oper[1]^.typ = top_reg) and
          { We can try searching further ahead if we're writing to a register }
          GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg)
        ) or
        (
          (taicpu(p).oper[1]^.typ = top_ref) and
          GetNextInstruction(p, hp1)
        )
      ) and
      { the partner instruction must add/sub a constant to the same
        destination at the same size }
      MatchInstruction(hp1, A_ADD, A_SUB, [taicpu(p).opsize]) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) then
      begin
        { Make doubly sure the flags aren't in use because the order of additions may affect them }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        hp2 := p;
        { Advance the tracked register state up to hp1.
          NOTE(review): this walk is skipped when -O3 is active, even
          though GetNextInstructionUsingReg may then have found hp1
          several instructions ahead - confirm this is intentional. }
        while not (cs_opt_level3 in current_settings.optimizerswitches) and
          GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
          UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
        if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
          begin
            { move the constant-operand instruction in front of p }
            asml.remove(hp1);
            asml.InsertBefore(hp1, p);
            DebugMsg(SPeepholeOptimization + 'Add/Sub swap 2 done', hp1);
            Result := True;
          end;
      end;
  end;
  9723. function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  9724. begin
  9725. Result:=false;
  9726. { change "cmp $0, %reg" to "test %reg, %reg" }
  9727. if MatchOpType(taicpu(p),top_const,top_reg) and
  9728. (taicpu(p).oper[0]^.val = 0) then
  9729. begin
  9730. taicpu(p).opcode := A_TEST;
  9731. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  9732. Result:=true;
  9733. end;
  9734. end;
function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
  var
    IsTestConstX : Boolean;
    hp1,hp2 : tai;
  begin
    Result:=false;
    { removes the line marked with (x) from the sequence
        and/or/xor/add/sub/... $x, %y
        test/or %y, %y  |  test $-1, %y    (x)
        j(n)z _Label
      as the first instruction already adjusts the ZF
      %y operand may also be a reference }
    IsTestConstX:=(taicpu(p).opcode=A_TEST) and
      MatchOperand(taicpu(p).oper[0]^,-1);
    { p must be a self-test ("test/or %y,%y" or "test $-1,%y"), the
      previous tai an instruction, and the next one a flag consumer
      (SETcc/Jcc/CMOVcc) whose condition we can inspect }
    if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
      GetLastInstruction(p, hp1) and
      (tai(hp1).typ = ait_instruction) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
      case taicpu(hp1).opcode Of
        A_ADD, A_SUB, A_OR, A_XOR, A_AND:
          begin
            { two-operand instructions: the destination is oper[1] }
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
               ((taicpu(hp1).opcode <> A_ADD) and
                (taicpu(hp1).opcode <> A_SUB))) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end;
        A_SHL, A_SAL, A_SHR, A_SAR:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
              { therefore, it's only safe to do this optimization for }
              { shifts by a (nonzero) constant }
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (taicpu(hp1).oper[0]^.val <> 0) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end;
        A_DEC, A_INC, A_NEG:
          begin
            { single-operand instructions: the destination is oper[0] }
            if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end
        else
          ;
      end; { case }
    { change "test $-1,%reg" into "test %reg,%reg" }
    if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
      taicpu(p).loadoper(0,taicpu(p).oper[1]^);
    { Change "or %reg,%reg" to "test %reg,%reg" as OR generates a false dependency }
    if MatchInstruction(p, A_OR, []) and
      { Can only match if they're both registers }
      MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'or %reg,%reg -> test %reg,%reg to remove false dependency (Or2Test)', p);
        taicpu(p).opcode := A_TEST;
        { No need to set Result to True, as we've done all the optimisations we can }
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
  var
    hp1,hp3 : tai;
{$ifndef x86_64}
    hp2 : taicpu;
{$endif x86_64}
  begin
    Result:=false;
    { hp3 records an optional VZEROUPPER that must be kept before the
      call/jmp }
    hp3:=nil;
{$ifndef x86_64}
    { don't do this on modern CPUs, this really hurts them due to
      broken call/ret pairing }
    if (current_settings.optimizecputype < cpu_Pentium2) and
      not(cs_create_pic in current_settings.moduleswitches) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_JMP,[S_NO]) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
      begin
        { "call x; jmp y" -> "push y; jmp x": the called routine then
          returns directly to y }
        hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
        InsertLLItem(p.previous, p, hp2);
        taicpu(p).opcode := A_JMP;
        taicpu(p).is_jmp := true;
        RemoveInstruction(hp1);
        Result:=true;
      end
    else
{$endif x86_64}
    { replace
        call procname
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces
      else if the subroutine is marked as no return, remove the ret
    }
    if ((cs_opt_level4 in current_settings.optimizerswitches) or
      (po_noreturn in current_procinfo.procdef.procoptions)) and
      GetNextInstruction(p, hp1) and
      { allow an optional VZEROUPPER before the RET; remember it in hp3 }
      (MatchInstruction(hp1,A_RET,[S_NO]) or
       (MatchInstruction(hp1,A_VZEROUPPER,[S_NO]) and
        SetAndTest(hp1,hp3) and
        GetNextInstruction(hp1,hp1) and
        MatchInstruction(hp1,A_RET,[S_NO])
       )
      ) and
      { the RET must not pop any bytes itself }
      (taicpu(hp1).ops=0) then
      begin
        if (cs_opt_level4 in current_settings.optimizerswitches) and
          { we might destroy stack alignment here if we do not do a call }
          (target_info.stackalign<=sizeof(SizeUInt)) then
          begin
            { tail-call: turn the CALL into a JMP }
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
          end
        else
          { noreturn case: keep the CALL, only the unreachable RET goes }
          DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
        RemoveInstruction(hp1);
        { move the remembered VZEROUPPER in front of the call/jmp }
        if Assigned(hp3) then
          begin
            AsmL.Remove(hp3);
            AsmL.InsertBefore(hp3,p)
          end;
        Result:=true;
      end;
  end;
  9881. function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
  9882. function ConstInRange(const Val: TCGInt; const OpSize: TOpSize): Boolean;
  9883. begin
  9884. case OpSize of
  9885. S_B, S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
  9886. Result := (Val <= $FF) and (Val >= -128);
  9887. S_W, S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  9888. Result := (Val <= $FFFF) and (Val >= -32768);
  9889. S_L{$ifdef x86_64}, S_LQ{$endif x86_64}:
  9890. Result := (Val <= $FFFFFFFF) and (Val >= -2147483648);
  9891. else
  9892. Result := True;
  9893. end;
  9894. end;
  9895. var
  9896. hp1, hp2 : tai;
  9897. SizeChange: Boolean;
  9898. PreMessage: string;
  9899. begin
  9900. Result := False;
  9901. if (taicpu(p).oper[0]^.typ = top_reg) and
  9902. SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
  9903. GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) then
  9904. begin
  9905. { Change (using movzbl %al,%eax as an example):
  9906. movzbl %al, %eax movzbl %al, %eax
  9907. cmpl x, %eax testl %eax,%eax
  9908. To:
  9909. cmpb x, %al testb %al, %al (Move one back to avoid a false dependency)
  9910. movzbl %al, %eax movzbl %al, %eax
  9911. Smaller instruction and minimises pipeline stall as the CPU
  9912. doesn't have to wait for the register to get zero-extended. [Kit]
  9913. Also allow if the smaller of the two registers is being checked,
  9914. as this still removes the false dependency.
  9915. }
  9916. if
  9917. (
  9918. (
  9919. (taicpu(hp1).opcode = A_CMP) and MatchOpType(taicpu(hp1), top_const, top_reg) and
  9920. ConstInRange(taicpu(hp1).oper[0]^.val, taicpu(p).opsize)
  9921. ) or (
  9922. { If MatchOperand returns True, they must both be registers }
  9923. (taicpu(hp1).opcode = A_TEST) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)
  9924. )
  9925. ) and
  9926. (reg2opsize(taicpu(hp1).oper[1]^.reg) <= reg2opsize(taicpu(p).oper[1]^.reg)) and
  9927. SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) then
  9928. begin
  9929. PreMessage := debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' -> ' + debug_op2str(taicpu(hp1).opcode);
  9930. asml.Remove(hp1);
  9931. asml.InsertBefore(hp1, p);
  9932. { Swap instructions in the case of cmp 0,%reg or test %reg,%reg }
  9933. if (taicpu(hp1).opcode = A_TEST) or (taicpu(hp1).oper[0]^.val = 0) then
  9934. begin
  9935. taicpu(hp1).opcode := A_TEST;
  9936. taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
  9937. end;
  9938. taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;
  9939. case taicpu(p).opsize of
  9940. S_BW, S_BL:
  9941. begin
  9942. SizeChange := taicpu(hp1).opsize <> S_B;
  9943. taicpu(hp1).changeopsize(S_B);
  9944. end;
  9945. S_WL:
  9946. begin
  9947. SizeChange := taicpu(hp1).opsize <> S_W;
  9948. taicpu(hp1).changeopsize(S_W);
  9949. end
  9950. else
  9951. InternalError(2020112701);
  9952. end;
  9953. UpdateUsedRegs(tai(p.Next));
  9954. { Check if the register is used aferwards - if not, we can
  9955. remove the movzx instruction completely }
  9956. if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
  9957. begin
  9958. { Hp1 is a better position than p for debugging purposes }
  9959. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4a', hp1);
  9960. RemoveCurrentp(p, hp1);
  9961. Result := True;
  9962. end;
  9963. if SizeChange then
  9964. DebugMsg(SPeepholeOptimization + PreMessage +
  9965. debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (smaller and minimises pipeline stall - MovzxCmp2CmpMovzx)', hp1)
  9966. else
  9967. DebugMsg(SPeepholeOptimization + 'MovzxCmp2CmpMovzx', hp1);
  9968. Exit;
  9969. end;
  9970. { Change (using movzwl %ax,%eax as an example):
  9971. movzwl %ax, %eax
  9972. movb %al, (dest) (Register is smaller than read register in movz)
  9973. To:
  9974. movb %al, (dest) (Move one back to avoid a false dependency)
  9975. movzwl %ax, %eax
  9976. }
  9977. if (taicpu(hp1).opcode = A_MOV) and
  9978. (taicpu(hp1).oper[0]^.typ = top_reg) and
  9979. not RegInOp(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^) and
  9980. SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
  9981. (reg2opsize(taicpu(hp1).oper[0]^.reg) <= reg2opsize(taicpu(p).oper[0]^.reg)) then
  9982. begin
  9983. DebugMsg(SPeepholeOptimization + 'MovzxMov2MovMovzx', hp1);
  9984. hp2 := tai(hp1.Previous); { Effectively the old position of hp1 }
  9985. asml.Remove(hp1);
  9986. asml.InsertBefore(hp1, p);
  9987. if taicpu(hp1).oper[1]^.typ = top_reg then
  9988. AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
  9989. { Check if the register is used aferwards - if not, we can
  9990. remove the movzx instruction completely }
  9991. if not RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg, p, UsedRegs) then
  9992. begin
  9993. { Hp1 is a better position than p for debugging purposes }
  9994. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4b', hp1);
  9995. RemoveCurrentp(p, hp1);
  9996. Result := True;
  9997. end;
  9998. Exit;
  9999. end;
  10000. end;
  10001. end;
  10002. {$ifdef x86_64}
  10003. function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  10004. var
  10005. PreMessage, RegName: string;
  10006. begin
  10007. { Code size reduction by J. Gareth "Kit" Moreton }
  10008. { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
  10009. as this removes the REX prefix }
  10010. Result := False;
  10011. if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
  10012. Exit;
  10013. if taicpu(p).oper[0]^.typ <> top_reg then
  10014. { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
  10015. InternalError(2018011500);
  10016. case taicpu(p).opsize of
  10017. S_Q:
  10018. begin
  10019. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
  10020. PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
  10021. { The actual optimization }
  10022. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  10023. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  10024. taicpu(p).changeopsize(S_L);
  10025. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
  10026. DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (32-bit register recommended when zeroing 64-bit counterpart)', p);
  10027. end;
  10028. else
  10029. ;
  10030. end;
  10031. end;
  10032. {$endif}
  10033. function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean;
  10034. var
  10035. XReg: TRegister;
  10036. begin
  10037. Result := False;
  10038. { Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" to "vpxor %xmmreg2,%xmmreg2,%xmmreg1"
  10039. Smaller encoding and slightly faster on some platforms (also works for
  10040. ZMM-sized registers) }
  10041. if (taicpu(p).opsize in [S_YMM, S_ZMM]) and
  10042. MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then
  10043. begin
  10044. XReg := taicpu(p).oper[0]^.reg;
  10045. if (taicpu(p).oper[1]^.reg = XReg) then
  10046. begin
  10047. taicpu(p).changeopsize(S_XMM);
  10048. setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX);
  10049. if (cs_opt_size in current_settings.optimizerswitches) then
  10050. begin
  10051. { Change input registers to %xmm0 to reduce size. Note that
  10052. there's a risk of a false dependency doing this, so only
  10053. optimise for size here }
  10054. XReg := NR_XMM0;
  10055. DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p);
  10056. end
  10057. else
  10058. begin
  10059. setsubreg(XReg, R_SUBMMX);
  10060. DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p);
  10061. end;
  10062. taicpu(p).oper[0]^.reg := XReg;
  10063. taicpu(p).oper[1]^.reg := XReg;
  10064. Result := True;
  10065. end;
  10066. end;
  10067. end;
  10068. class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  10069. var
  10070. OperIdx: Integer;
  10071. begin
  10072. for OperIdx := 0 to p.ops - 1 do
  10073. if p.oper[OperIdx]^.typ = top_ref then
  10074. optimize_ref(p.oper[OperIdx]^.ref^, False);
  10075. end;
  10076. end.