aoptx86.pas (787 KB, 14,308 lines)

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
7864178651786617867178681786917870178711787217873178741787517876178771787817879178801788117882178831788417885178861788717888178891789017891178921789317894178951789617897178981789917900179011790217903179041790517906179071790817909179101791117912179131791417915179161791717918179191792017921179221792317924179251792617927179281792917930179311793217933179341793517936179371793817939179401794117942179431794417945179461794717948179491795017951179521795317954179551795617957179581795917960179611796217963179641796517966179671796817969179701797117972179731797417975179761797717978179791798017981179821798317984179851798617987179881798917990179911799217993179941799517996179971799817999180001800118002180031800418005180061800718008180091801018011180121801318014180151801618017180181801918020180211802218023180241802518026180271802818029180301803118032180331803418035180361803718038180391804018041180421804318044180451804618047180481804918050180511805218053180541805518056180571805818059180601806118062180631806418065180661806718068180691807018071180721807318074180751807618077180781807918080180811808218083180841808518086180871808818089180901809118092180931809418095180961809718098180991810018101181021810318104181051810618107181081810918110181111811218113181141811518116181171811818119181201812118122181231812418125181261812718128181291813018131181321813318134181351813618137181381813918140181411814218143181441814518146181471814818149181501815118152181531815418155181561815718158181591816018161181621816318164181651816618167181681816918170181711817218173181741817518176181771817818179181801818118182181831818418185181861818718188181891819018191181921819318194181951819618197181981819918200182011820218203182041820518206182071820818209182101821118212182131821418215182161821718218182191822018221182221822318224182251822618227182281822918230182311823218233182341823518236182371823818239182401824118242182431824418245182461824718248182491825018251182521825318254182551825618257182581825918260182611826218263182641826518266182671826818269182701827118272182731827418275182761827718278182791828018281182821828318284182851828618287182881828918290182911829218293182941829518296182971829818299183001830118302183031830418305183061830718308183091831018311183121831318314183151831618317183181831918320183211832218323183241832518326183271832818329
{
    Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe

    This unit contains the peephole optimizer.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit aoptx86;

{$i fpcdefs.inc}

{ $define DEBUG_AOPTCPU}

{$ifdef EXTDEBUG}
{$define DEBUG_AOPTCPU}
{$endif EXTDEBUG}

interface

uses
  globtype,cclasses,
  cpubase,
  aasmtai,aasmcpu,
  cgbase,cgutils,
  aopt,aoptobj;

type
  TOptsToCheck = (
    aoc_MovAnd2Mov_3,
    aoc_ForceNewIteration,
    aoc_DoPass2JccOpts,
    aoc_MovlMovq2MovlMovl
  );

  TX86AsmOptimizer = class(TAsmOptimizer)
    { Some optimizations are very expensive to check, so the pre-opt pass
      can be used to set flags, depending on the instructions found, that
      indicate whether it is worth checking a certain optimization }
    OptsToCheck : set of TOptsToCheck;

    function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
    function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
    class function RegReadByInstruction(reg : TRegister; hp : tai) : boolean; static;
    function RegInInstruction(Reg: TRegister; p1: tai): Boolean; override;
    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;

    { Identical to GetNextInstructionUsingReg, but returns a value indicating
      how many instructions away Next is from Current.
      0 = failure, equivalent to False in GetNextInstructionUsingReg }
    function GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;

    { This version of GetNextInstructionUsingReg will look across conditional jumps,
      potentially allowing further optimisation (although the caller might need
      to know whether a conditional jump was crossed). }
    function GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;
    {
      In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
      the use of a register by allocs/deallocs, so it can ignore calls.

      In the following example, GetNextInstructionUsingReg will return the second movq,
      GetNextInstructionUsingRegTrackingUse won't.

        movq %rdi,%rax
        # Register rdi released
        # Register rdi allocated
        movq %rax,%rdi

      While in this example:

        movq %rdi,%rax
        call proc
        movq %rdi,%rax

      GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
      won't.
    }
    function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;

    { Returns True if any of the registers in ref are modified by any
      instruction between p1 and p2, or if those instructions write to the
      reference }
    function RefModifiedBetween(Ref: TReference; RefSize: ASizeInt; p1, p2: tai): Boolean;

  private
    function SkipSimpleInstructions(var hp1: tai): Boolean;

  protected
    class function IsMOVZXAcceptable: Boolean; static; inline;
    function CheckMovMov2MovMov2(const p, hp1: tai): Boolean;

    { Attempts to allocate a volatile integer register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;

    { Attempts to allocate a volatile MM register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;

    { Checks whether loading a new value in reg1 overwrites the entirety of reg2 }
    class function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean; static;
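    { For example: on x86-64, writing to %eax zero-extends into %rax, so a
      32-bit write overwrites the entire 64-bit register, whereas writing
      to %al leaves %ah and the upper bits of %eax unchanged. }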
    { Checks whether reading the value in reg1 depends on the value of reg2. This
      is very similar to SuperRegisterEquals, except it takes into account that
      R_SUBH and R_SUBL are independent (e.g. reading from AL does not
      depend on the value in AH). }
    class function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean; static;

    { Replaces all references to AOldReg in a memory reference with ANewReg }
    class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Replaces all references to AOldReg in an operand with ANewReg }
    class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Replaces all references to AOldReg in an instruction with ANewReg,
      except where the register is being written }
    class function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Returns True if the reference only refers to ESP or EBP (or their 64-bit equivalents),
      or refers to a global symbol }
    class function IsRefSafe(const ref: PReference): Boolean; static;

    { Returns True if the given MOV instruction can be safely converted to CMOV }
    class function CanBeCMOV(p, cond_p: tai; var RefModified: Boolean) : boolean; static;

    { Like UpdateUsedRegs, but ignores deallocations }
    class procedure UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai); static;

    { Returns True if the given logic instruction can be converted into a BTx instruction (BT not included) }
    class function IsBTXAcceptable(p : tai) : boolean; static;

    { Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
      conversion was successful }
    function ConvertLEA(const p : taicpu): Boolean;
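    { e.g. "leal 4(%eax),%eax" can become "addl $4,%eax", and
      "leal -1(%ebx),%ebx" can become "decl %ebx", provided the flags
      register is not in use at that point (LEA does not modify the
      flags, but ADD/INC/SUB/DEC do). }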
    function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    function FuncMov2Func(var p: tai; const hp1: tai): Boolean;
{$ifdef x86_64}
    { If a "mov %reg1d,%reg2d; and %reg1d,%reg1d" is found, we can possibly
      replace %reg2q with %reg1q in later instructions }
    function DoZeroUpper32Opt(var mov_p: tai; var and_p: tai): Boolean;
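    { e.g. after "movl %edi,%eax; andl %edi,%edi", both %rax and %rdi hold
      the zero-extended value of %edi (32-bit writes clear the upper half),
      so a later use of %rax could possibly be replaced with %rdi. }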
{$endif x86_64}
    procedure DebugMsg(const s : string; p : tai); inline;

    class function IsExitCode(p : tai) : boolean; static;
    class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
    class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
    procedure RemoveLastDeallocForFuncRes(p : tai);

    function DoArithCombineOpt(var p : tai) : Boolean;
    function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
    function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
    function HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;

    function PrePeepholeOptSxx(var p : tai) : boolean;
    function PrePeepholeOptIMUL(var p : tai) : boolean;
    function PrePeepholeOptAND(var p : tai) : boolean;

    function OptPass1Test(var p: tai): boolean;
    function OptPass1Add(var p: tai): boolean;
    function OptPass1AND(var p : tai) : boolean;
    function OptPass1CMOVcc(var p: tai): Boolean;
    function OptPass1_V_MOVAP(var p : tai) : boolean;
    function OptPass1VOP(var p : tai) : boolean;
    function OptPass1MOV(var p : tai) : boolean;
    function OptPass1MOVD(var p : tai) : boolean;
    function OptPass1Movx(var p : tai) : boolean;
    function OptPass1MOVXX(var p : tai) : boolean;
{$ifndef i8086}
    function OptPass1NOT(var p : tai) : boolean;
{$endif not i8086}
    function OptPass1OP(var p : tai) : boolean;
    function OptPass1LEA(var p : tai) : boolean;
    function OptPass1Sub(var p : tai) : boolean;
    function OptPass1SHLSAL(var p : tai) : boolean;
    function OptPass1SHR(var p : tai) : boolean;
    function OptPass1FSTP(var p : tai) : boolean;
    function OptPass1FLD(var p : tai) : boolean;
    function OptPass1Cmp(var p : tai) : boolean;
    function OptPass1PXor(var p : tai) : boolean;
    function OptPass1VPXor(var p: tai): boolean;
    function OptPass1Imul(var p : tai) : boolean;
    function OptPass1Jcc(var p : tai) : boolean;
    function OptPass1SHXX(var p: tai): boolean;
    function OptPass1VMOVDQ(var p: tai): Boolean;
    function OptPass1_V_Cvtss2sd(var p: tai): boolean;
    function OptPass1STCCLC(var p: tai): Boolean;

    function OptPass2STCCLC(var p: tai): Boolean;
    function OptPass2CMOVcc(var p: tai): Boolean;
    function OptPass2Movx(var p : tai): Boolean;
    function OptPass2MOV(var p : tai) : boolean;
    function OptPass2Imul(var p : tai) : boolean;
    function OptPass2Jmp(var p : tai) : boolean;
    function OptPass2Jcc(var p : tai) : boolean;
    function OptPass2Lea(var p: tai): Boolean;
    function OptPass2SUB(var p: tai): Boolean;
    function OptPass2ADD(var p : tai): Boolean;
    function OptPass2SETcc(var p : tai) : boolean;
    function OptPass2Cmp(var p: tai): Boolean;
    function OptPass2Test(var p: tai): Boolean;

    function CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;

    function PostPeepholeOptMov(var p : tai) : Boolean;
    function PostPeepholeOptMovzx(var p : tai) : Boolean;
    function PostPeepholeOptXor(var p : tai) : Boolean;
    function PostPeepholeOptAnd(var p : tai) : boolean;
    function PostPeepholeOptMOVSX(var p : tai) : boolean;
    function PostPeepholeOptCmp(var p : tai) : Boolean;
    function PostPeepholeOptTestOr(var p : tai) : Boolean;
    function PostPeepholeOptCall(var p : tai) : Boolean;
    function PostPeepholeOptLea(var p : tai) : Boolean;
    function PostPeepholeOptPush(var p: tai): Boolean;
    function PostPeepholeOptShr(var p : tai) : boolean;
    function PostPeepholeOptADDSUB(var p : tai) : Boolean;
    function PostPeepholeOptVPXOR(var p: tai): Boolean;
    function PostPeepholeOptRET(var p: tai): Boolean;
    function PostPeepholeOptRORX(var p: tai): Boolean;
    function PostPeepholeOptSARXSHLXSHRX(var p: tai): Boolean;

    procedure ConvertJumpToRET(const p: tai; const ret_p: tai);

    function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
    function TrySwapMovOp(var p, hp1: tai): Boolean;
    function TrySwapMovCmp(var p, hp1: tai): Boolean;
    function TryCmpCMovOpts(var p, hp1: tai) : Boolean;
    function TryJccStcClcOpt(var p, hp1: tai): Boolean;

    { Processor-dependent reference optimisation }
    class procedure OptimizeRefs(var p: taicpu); static;
  end;
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;

function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{$if max_operands>2}
function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
{$endif max_operands>2}

function RefsEqual(const r1, r2: treference): boolean;

{ Like RefsEqual, but doesn't compare the offsets }
function RefsAlmostEqual(const r1, r2: treference): boolean;

{ Note that Result is set to True if the references COULD overlap but the
  compiler cannot be sure (e.g. "(%reg1)" and "4(%reg2)" with a range of 4
  might still overlap, because %reg2 could be equal to %reg1-4) }
function RefsMightOverlap(const r1, r2: treference; const Range: asizeint): boolean;

function MatchReference(const ref : treference; base,index : TRegister) : Boolean;

{ Returns True if ref is a reference using only the registers passed as base
  and index, and has an offset }
function MatchReferenceWithOffset(const ref : treference; base,index : TRegister) : Boolean;
implementation

uses
  cutils,verbose,
  systems,
  globals,
  cpuinfo,
  procinfo,
  paramgr,
  aasmbase,
  aoptbase,aoptutils,
  symconst,symsym,
  cgx86,
  itcpugas;

{$ifndef i8086}
const
  MAX_CMOV_INSTRUCTIONS = 4;
  MAX_CMOV_REGISTERS = 8;

type
  TCMovTrackingState = (tsInvalid, tsSimple, tsDetour, tsBranching,
    tsDouble, tsDoubleBranchSame, tsDoubleBranchDifferent, tsDoubleSecondBranching,
    tsProcessed);
  { For OptPass2Jcc }
  TCMOVTracking = object
  private
    CMOVScore, ConstCount: LongInt;
    RegWrites: array[0..MAX_CMOV_INSTRUCTIONS*2 - 1] of TRegister;

    ConstRegs: array[0..MAX_CMOV_REGISTERS - 1] of TRegister;
    ConstVals: array[0..MAX_CMOV_REGISTERS - 1] of TCGInt;
    ConstSizes: array[0..MAX_CMOV_REGISTERS - 1] of TSubRegister; { May not match ConstRegs if one is shared over multiple CMOVs. }
    ConstMovs: array[0..MAX_CMOV_REGISTERS - 1] of tai; { Location of initialisation instruction }

    ConstWriteSizes: array[0..first_int_imreg - 1] of TSubRegister; { Largest size of register written. }

    fOptimizer: TX86AsmOptimizer;
    fLabel: TAsmSymbol;

    fInsertionPoint,
    fCondition,
    fInitialJump,
    fFirstMovBlock,
    fFirstMovBlockStop,
    fSecondJump,
    fThirdJump,
    fSecondMovBlock,
    fSecondMovBlockStop,
    fMidLabel,
    fEndLabel,
    fAllocationRange: tai;

    fState: TCMovTrackingState;

    function TryCMOVConst(p, start, stop: tai; var Count: LongInt): Boolean;
    function InitialiseBlock(BlockStart, OneBeforeBlock: tai; out BlockStop: tai; out EndJump: tai): Boolean;
    function AnalyseMOVBlock(BlockStart, BlockStop, SearchStart: tai): LongInt;
  public
    RegisterTracking: TAllUsedRegs;
    constructor Init(Optimizer: TX86AsmOptimizer; var p_initialjump, p_initialmov: tai; var AFirstLabel: TAsmLabel);
    destructor Done;
    procedure Process(out new_p: tai);
    property State: TCMovTrackingState read fState;
  end;

  PCMOVTracking = ^TCMOVTracking;
{$endif i8086}
{$ifdef DEBUG_AOPTCPU}
const
  SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
{ Empty strings help the optimizer to remove string concatenations that won't
  ever appear to the user on release builds. [Kit] }
const
  SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  LIST_STEP_SIZE = 4;

type
  TJumpTrackingItem = class(TLinkedListItem)
  private
    FSymbol: TAsmSymbol;
    FRefs: LongInt;
  public
    constructor Create(ASymbol: TAsmSymbol);
    procedure IncRefs; {$ifdef USEINLINE}inline;{$endif USEINLINE}
    property Symbol: TAsmSymbol read FSymbol;
    property Refs: LongInt read FRefs;
  end;

constructor TJumpTrackingItem.Create(ASymbol: TAsmSymbol);
begin
  inherited Create;
  FSymbol := ASymbol;
  FRefs := 0;
end;

procedure TJumpTrackingItem.IncRefs; {$ifdef USEINLINE}inline;{$endif USEINLINE}
begin
  Inc(FRefs);
end;
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    (taicpu(instr).opcode = op) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;
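{ Usage sketch: "MatchInstruction(hp1, A_MOV, [S_L])" is True only if hp1 is
  a 32-bit MOV instruction, while passing [] for opsize accepts any operand
  size. }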
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    ((taicpu(instr).opcode = op1) or
     (taicpu(instr).opcode = op2)
    ) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;

function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    ((taicpu(instr).opcode = op1) or
     (taicpu(instr).opcode = op2) or
     (taicpu(instr).opcode = op3)
    ) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;

function MatchInstruction(const instr : tai; const ops : array of TAsmOp;
  const opsize : topsizes) : boolean;
var
  op : TAsmOp;
begin
  result:=false;
  if (instr.typ <> ait_instruction) or
     ((opsize <> []) and not(taicpu(instr).opsize in opsize)) then
    exit;
  for op in ops do
    begin
      if taicpu(instr).opcode = op then
        begin
          result:=true;
          exit;
        end;
    end;
end;

function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
begin
  result := (oper.typ = top_reg) and (oper.reg = reg);
end;

function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
begin
  result := (oper.typ = top_const) and (oper.val = a);
end;

function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
begin
  result := oper1.typ = oper2.typ;
  if result then
    case oper1.typ of
      top_const:
        Result:=oper1.val = oper2.val;
      top_reg:
        Result:=oper1.reg = oper2.reg;
      top_ref:
        Result:=RefsEqual(oper1.ref^, oper2.ref^);
      else
        internalerror(2013102801);
    end
end;

function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
begin
  result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);
  if result then
    case oper1.typ of
      top_const:
        Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
      top_reg:
        Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
      top_ref:
        Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
      else
        internalerror(2020052401);
    end
end;
function RefsEqual(const r1, r2: treference): boolean;
begin
  RefsEqual :=
    (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
    (r1.relsymbol = r2.relsymbol) and
    (r1.segment = r2.segment) and (r1.base = r2.base) and
    (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
    (r1.offset = r2.offset) and
    (r1.volatility + r2.volatility = []);
end;

function RefsAlmostEqual(const r1, r2: treference): boolean;
begin
  RefsAlmostEqual :=
    (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
    (r1.relsymbol = r2.relsymbol) and
    (r1.segment = r2.segment) and (r1.base = r2.base) and
    (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
    { Don't compare the offsets }
    (r1.volatility + r2.volatility = []);
end;

function RefsMightOverlap(const r1, r2: treference; const Range: asizeint): boolean;
begin
  if (r1.symbol<>r2.symbol) then
    { If the index registers (or scale factors) are different, there's a
      chance one could be set so that the reference equals the other
      symbol's address }
    Exit((r1.index<>r2.index) or (r1.scalefactor<>r2.scalefactor));

  if (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
     (r1.relsymbol = r2.relsymbol) and
     (r1.segment = r2.segment) and (r1.base = r2.base) and
     (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
     (r1.volatility + r2.volatility = []) then
    { In this case, it all depends on the offsets }
    Exit(abs(r1.offset - r2.offset) < Range);

  { There's a chance things MIGHT overlap, so take no chances }
  Result := True;
end;
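{ e.g. "4(%esp)" and "8(%esp)" with Range = 4 are reported as not overlapping
  (the offset difference is exactly 4), whereas "(%esp)" and "2(%esp)" are;
  references with different base registers are conservatively assumed to
  overlap. }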
function MatchReference(const ref : treference; base,index : TRegister) : Boolean;
begin
  Result:=(ref.offset=0) and
    (ref.scalefactor in [0,1]) and
    (ref.segment=NR_NO) and
    (ref.symbol=nil) and
    (ref.relsymbol=nil) and
    ((base=NR_INVALID) or
     (ref.base=base)) and
    ((index=NR_INVALID) or
     (ref.index=index)) and
    (ref.volatility=[]);
end;
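{ Note that NR_INVALID acts as a wildcard here: e.g.
  "MatchReference(ref, NR_EBP, NR_INVALID)" matches a zero-offset %ebp-based
  reference regardless of which index register (if any) is used. }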
function MatchReferenceWithOffset(const ref : treference; base,index : TRegister) : Boolean;
begin
  Result:=(ref.scalefactor in [0,1]) and
    (ref.segment=NR_NO) and
    (ref.symbol=nil) and
    (ref.relsymbol=nil) and
    ((base=NR_INVALID) or
     (ref.base=base)) and
    ((index=NR_INVALID) or
     (ref.index=index)) and
    (ref.volatility=[]);
end;

function InstrReadsFlags(p: tai): boolean;
begin
  InstrReadsFlags := true;
  case p.typ of
    ait_instruction:
      if InsProp[taicpu(p).opcode].Ch*
         [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
          Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
          Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
        exit;
    ait_label:
      exit;
    else
      ;
  end;
  InstrReadsFlags := false;
end;
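{ e.g. "adcl %eax,%edx" (reads and writes the carry flag) and "setz %al"
  (conditional on the zero flag) are reported as reading the flags; note
  that labels are also treated as readers, since the flags may be live on
  entry from another code path. }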
function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
begin
  Next:=Current;
  repeat
    Result:=GetNextInstruction(Next,Next);
  until not (Result) or
    not(cs_opt_level3 in current_settings.optimizerswitches) or
    (Next.typ<>ait_instruction) or
    RegInInstruction(reg,Next) or
    is_calljmp(taicpu(Next).opcode);
end;
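{ Usage sketch: "GetNextInstructionUsingReg(p, hp1, NR_EAX)" walks forwards
  from p until an instruction mentioning %eax is found; note that the search
  also stops (with hp1 still set) at non-instructions, at calls/jumps, and
  immediately when below optimisation level 3. }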
function TX86AsmOptimizer.GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
var
  GetNextResult: Boolean;
begin
  Result:=0;
  Next:=Current;
  repeat
    GetNextResult := GetNextInstruction(Next,Next);
    if GetNextResult then
      Inc(Result)
    else
      { Must return zero upon hitting the end of the linked list without a match }
      Result := 0;
  until not (GetNextResult) or
    not(cs_opt_level3 in current_settings.optimizerswitches) or
    (Next.typ<>ait_instruction) or
    RegInInstruction(reg,Next) or
    is_calljmp(taicpu(Next).opcode);
end;
function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;

  procedure TrackJump(Symbol: TAsmSymbol);
  var
    Search: TJumpTrackingItem;
  begin
    { See if an entry already exists in our jump tracking list
      (faster to search backwards due to the higher chance of
      matching destinations) }
    Search := TJumpTrackingItem(JumpTracking.Last);
    while Assigned(Search) do
      begin
        if Search.Symbol = Symbol then
          begin
            { Found it - remove it so it can be pushed to the front }
            JumpTracking.Remove(Search);
            Break;
          end;
        Search := TJumpTrackingItem(Search.Previous);
      end;

    if not Assigned(Search) then
      { Use the Symbol parameter directly; it is the same symbol that the
        call sites extract from the jump instruction }
      Search := TJumpTrackingItem.Create(Symbol);

    JumpTracking.Concat(Search);
    Search.IncRefs;
  end;
  function LabelAccountedFor(Symbol: TAsmSymbol): Boolean;
  var
    Search: TJumpTrackingItem;
  begin
    Result := False;

    { See if this label appears in the tracking list }
    Search := TJumpTrackingItem(JumpTracking.Last);
    while Assigned(Search) do
      begin
        if Search.Symbol = Symbol then
          begin
            { Found it - let's see what we can discover }
            if Search.Symbol.getrefs = Search.Refs then
              begin
                { Success - all the references are accounted for }
                JumpTracking.Remove(Search);
                Search.Free;

                { It is logically impossible for CrossJump to be false here
                  because we must have run into a conditional jump for
                  this label at some point }
                if not CrossJump then
                  InternalError(2022041710);

                if JumpTracking.First = nil then
                  { Tracking list is now empty - no more cross jumps }
                  CrossJump := False;

                Result := True;
                Exit;
              end;

            { If the references don't match, it's possible to enter
              this label through other means, so drop out }
            Exit;
          end;

        Search := TJumpTrackingItem(Search.Previous);
      end;
  end;
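  { Example of the tracking in action: in "jne .L1 ... jmp .L2 ... .L1:",
    .L1 is recorded (and CrossJump set) when the conditional jump is crossed;
    when the .L1 label is later reached and all of its references have been
    seen, it is removed from the list, and once the list is empty the code
    paths have converged again. }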
var
  Next_Label: tai;
begin
  { Note, CrossJump keeps its input value if a conditional jump is not found - it doesn't get set to False }
  Next := Current;
  repeat
    Result := GetNextInstruction(Next,Next);

    if not Result then
      Break;

    if (Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) then
      if is_calljmpuncondret(taicpu(Next).opcode) then
        begin
          if (taicpu(Next).opcode = A_JMP) and
            { Remove dead code now to save time }
            RemoveDeadCodeAfterJump(taicpu(Next)) then
            { A jump was removed, but not the current instruction, and
              Result doesn't necessarily translate into an optimisation
              routine's Result, so use the "Force New Iteration" flag to
              mark a new pass }
            Include(OptsToCheck, aoc_ForceNewIteration);

          if not Assigned(JumpTracking) then
            begin
              { Cross-label optimisations often cause other optimisations
                to perform worse because they're not given the chance to
                optimise locally. In this case, don't do the cross-label
                optimisations yet, but flag them as a potential possibility
                for the next iteration of Pass 1 }
              if not NotFirstIteration then
                Include(OptsToCheck, aoc_ForceNewIteration);
            end
          else if IsJumpToLabel(taicpu(Next)) and
            GetNextInstruction(Next, Next_Label) then
            begin
              { If we have JMP .lbl, and the label after it has all of its
                references tracked, then this is probably an if-else style of
                block and we can keep tracking. If the label for this jump
                then appears later and is fully tracked, then it's the end
                of the if-else blocks and the code paths converge (thus
                marking the end of the cross-jump) }
              if (Next_Label.typ = ait_label) then
                begin
                  if LabelAccountedFor(tai_label(Next_Label).labsym) then
                    begin
                      TrackJump(JumpTargetOp(taicpu(Next))^.ref^.symbol);
                      Next := Next_Label;

                      { CrossJump gets set to False by LabelAccountedFor if the
                        list is completely emptied (as it indicates that all
                        code paths have converged). We could avoid this nuance
                        by moving the TrackJump call to before the
                        LabelAccountedFor call, but this is slower in situations
                        where LabelAccountedFor would return False due to the
                        creation of a new object that is not used and destroyed
                        soon after. }
                      CrossJump := True;
                      Continue;
                    end;
                end
              else if (Next_Label.typ <> ait_marker) then
                { We just did a RemoveDeadCodeAfterJump, so either we find
                  a label, the end of the procedure or some kind of marker }
                InternalError(2022041720);
            end;

          Result := False;
          Exit;
        end
      else
        begin
          if not Assigned(JumpTracking) then
            begin
              { Cross-label optimisations often cause other optimisations
                to perform worse because they're not given the chance to
                optimise locally. In this case, don't do the cross-label
                optimisations yet, but flag them as a potential possibility
                for the next iteration of Pass 1 }
              if not NotFirstIteration then
                Include(OptsToCheck, aoc_ForceNewIteration);
            end
          else if IsJumpToLabel(taicpu(Next)) then
            TrackJump(JumpTargetOp(taicpu(Next))^.ref^.symbol)
          else
            { Conditional jumps should always be jumps to a label }
            InternalError(2022041701);

          CrossJump := True;
          Continue;
        end;

    if Next.typ = ait_label then
      begin
        if not Assigned(JumpTracking) then
          begin
            { Cross-label optimisations often cause other optimisations
              to perform worse because they're not given the chance to
              optimise locally. In this case, don't do the cross-label
              optimisations yet, but flag them as a potential possibility
              for the next iteration of Pass 1 }
            if not NotFirstIteration then
              Include(OptsToCheck, aoc_ForceNewIteration);
          end
        else if LabelAccountedFor(tai_label(Next).labsym) then
          Continue;

        { If we reach here, we're at a label that hasn't been seen before
          (or JumpTracking was nil) }
        Break;
      end;
  until not Result or
    not (cs_opt_level3 in current_settings.optimizerswitches) or
    not (Next.typ in [ait_label, ait_instruction]) or
    RegInInstruction(reg,Next);
end;
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
begin
  if not(cs_opt_level3 in current_settings.optimizerswitches) then
    begin
      Result:=GetNextInstruction(Current,Next);
      exit;
    end;

  Next:=tai(Current.Next);
  Result:=false;
  while assigned(Next) do
    begin
      if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
         ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
         ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
        exit
      else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
        begin
          Result:=true;
          exit;
        end;
      Next:=tai(Next.Next);
    end;
end;
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
begin
  Result:=RegReadByInstruction(reg,hp);
end;

class function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
var
  p: taicpu;
  opcount: longint;
begin
  RegReadByInstruction := false;
  if hp.typ <> ait_instruction then
    exit;
  p := taicpu(hp);
  case p.opcode of
    A_CALL:
      regreadbyinstruction := true;
    A_IMUL:
      case p.ops of
        1:
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
             ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
             ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
            );
        2,3:
          regReadByInstruction :=
            reginop(reg,p.oper[0]^) or
            reginop(reg,p.oper[1]^);
        else
          InternalError(2019112801);
      end;
    A_MUL:
      begin
        regReadByInstruction := RegInOp(reg,p.oper[0]^) or
          (
           ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
           ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
          );
      end;
    A_IDIV,A_DIV:
      begin
        regReadByInstruction := RegInOp(reg,p.oper[0]^) or
          (
           (getregtype(reg)=R_INTREGISTER) and
           (
            (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
           )
          );
      end;
    else
      begin
        if (p.opcode=A_LEA) and is_segment_reg(reg) then
          begin
            RegReadByInstruction := false;
            exit;
          end;

        for opcount := 0 to p.ops-1 do
          if (p.oper[opCount]^.typ = top_ref) and
             RegInRef(reg,p.oper[opcount]^.ref^) then
            begin
              RegReadByInstruction := true;
              exit
            end;

        { special handling for SSE MOVSD }
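        { (MOVSD with no operands is the string instruction "movs"; only the
          two-operand form is the SSE2 scalar double move, hence the operand
          count checks below.) }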
  757. if (p.opcode=A_MOVSD) and (p.ops>0) then
  758. begin
  759. if p.ops<>2 then
  760. internalerror(2017042702);
  761. regReadByInstruction := reginop(reg,p.oper[0]^) or
  762. (
  763. (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
  764. );
  765. exit;
  766. end;
  767. with insprop[p.opcode] do
  768. begin
  769. case getregtype(reg) of
  770. R_INTREGISTER:
  771. begin
  772. case getsupreg(reg) of
  773. RS_EAX:
  774. if [Ch_REAX,Ch_RWEAX,Ch_MEAX,Ch_WRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
  775. begin
  776. RegReadByInstruction := true;
  777. exit
  778. end;
  779. RS_ECX:
  780. if [Ch_RECX,Ch_RWECX,Ch_MECX,Ch_WRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
  781. begin
  782. RegReadByInstruction := true;
  783. exit
  784. end;
  785. RS_EDX:
  786. if [Ch_REDX,Ch_RWEDX,Ch_MEDX,Ch_WRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
  787. begin
  788. RegReadByInstruction := true;
  789. exit
  790. end;
  791. RS_EBX:
  792. if [Ch_REBX,Ch_RWEBX,Ch_MEBX,Ch_WRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
  793. begin
  794. RegReadByInstruction := true;
  795. exit
  796. end;
  797. RS_ESP:
  798. if [Ch_RESP,Ch_RWESP,Ch_MESP,Ch_WRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
  799. begin
  800. RegReadByInstruction := true;
  801. exit
  802. end;
  803. RS_EBP:
  804. if [Ch_REBP,Ch_RWEBP,Ch_MEBP,Ch_WRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
  805. begin
  806. RegReadByInstruction := true;
  807. exit
  808. end;
  809. RS_ESI:
  810. if [Ch_RESI,Ch_RWESI,Ch_MESI,Ch_WRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
  811. begin
  812. RegReadByInstruction := true;
  813. exit
  814. end;
  815. RS_EDI:
  816. if [Ch_REDI,Ch_RWEDI,Ch_MEDI,Ch_WRDI,Ch_RWRDI,Ch_MRDI]*Ch<>[] then
  817. begin
  818. RegReadByInstruction := true;
  819. exit
  820. end;
  821. end;
  822. end;
  823. R_MMREGISTER:
  824. begin
  825. case getsupreg(reg) of
  826. RS_XMM0:
  827. if [Ch_RXMM0,Ch_RWXMM0,Ch_MXMM0]*Ch<>[] then
  828. begin
  829. RegReadByInstruction := true;
  830. exit
  831. end;
  832. end;
  833. end;
  834. else
  835. ;
  836. end;
  837. if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
  838. begin
  839. if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
  840. begin
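                        { e.g. (illustrative): for "seta %al" (C_A), querying
                          R_SUBFLAGCARRY or R_SUBFLAGZERO yields True below,
                          while R_SUBFLAGSIGN yields False. }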
                        case p.condition of
                          C_A,C_NBE,       { CF=0 and ZF=0 }
                          C_BE,C_NA:       { CF=1 or ZF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                          C_AE,C_NB,C_NC,  { CF=0 }
                          C_B,C_NAE,C_C:   { CF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                          C_NE,C_NZ,       { ZF=0 }
                          C_E,C_Z:         { ZF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                          C_G,C_NLE,       { ZF=0 and SF=OF }
                          C_LE,C_NG:       { ZF=1 or SF<>OF }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_GE,C_NL,       { SF=OF }
                          C_L,C_NGE:       { SF<>OF }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                          C_NO,            { OF=0 }
                          C_O:             { OF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                          C_NP,C_PO,       { PF=0 }
                          C_P,C_PE:        { PF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                          C_NS,            { SF=0 }
                          C_S:             { SF=1 }
                            RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                          else
                            internalerror(2017042701);
                        end;
                        if RegReadByInstruction then
                          exit;
                      end;
                    case getsubreg(reg) of
                      R_SUBW,R_SUBD,R_SUBQ:
                        RegReadByInstruction :=
                          [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                           Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                           Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                      R_SUBFLAGCARRY:
                        RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGPARITY:
                        RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGAUXILIARY:
                        RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGZERO:
                        RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGSIGN:
                        RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGOVERFLOW:
                        RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGINTERRUPT:
                        RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      R_SUBFLAGDIRECTION:
                        RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                      else
                        internalerror(2017042601);
                    end;
                    exit;
                  end;
                if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                  (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                  (p.oper[0]^.reg=p.oper[1]^.reg) then
                  exit;
                if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and RegInOp(reg,p.oper[0]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and RegInOp(reg,p.oper[1]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and RegInOp(reg,p.oper[2]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
                if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and RegInOp(reg,p.oper[3]^) then
                  begin
                    RegReadByInstruction := true;
                    exit
                  end;
              end;
          end;
      end;
    end;

  function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
    begin
      result:=false;
      if p1.typ<>ait_instruction then
        exit;
      if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
        exit(true);
      if (getregtype(reg)=R_INTREGISTER) and
        { the change information for xmm movsd is not correct }
        ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
        begin
          { Handle instructions that behave differently depending on the size and operand count }
          case taicpu(p1).opcode of
            A_MUL, A_DIV, A_IDIV:
              if taicpu(p1).opsize = S_B then
                Result := (getsupreg(Reg) = RS_EAX)
              else
                Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
            A_IMUL:
              if taicpu(p1).ops = 1 then
                begin
                  if taicpu(p1).opsize = S_B then
                    Result := (getsupreg(Reg) = RS_EAX)
                  else
                    Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
                end;
              { IMUL with more than one operand falls through to the
                inherited method at the end }
            else
              case getsupreg(reg) of
                { RS_EAX = RS_RAX on x86-64 }
                RS_EAX:
                  result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_ECX:
                  result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_EDX:
                  result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_EBX:
                  result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_ESP:
                  result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_EBP:
                  result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_ESI:
                  result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
                RS_EDI:
                  result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
                else
                  ;
              end;
          end;
          if result then
            exit;
        end
      else if getregtype(reg)=R_MMREGISTER then
        begin
          case getsupreg(reg) of
            RS_XMM0:
              result:=([Ch_RXMM0,Ch_WXMM0,Ch_RWXMM0,Ch_MXMM0]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
            exit(true);
          case getsubreg(reg) of
            R_SUBFLAGCARRY:
              Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGPARITY:
              Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGAUXILIARY:
              Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGZERO:
              Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGSIGN:
              Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGOVERFLOW:
              Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGINTERRUPT:
              Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBFLAGDIRECTION:
              Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
            R_SUBW,R_SUBD,R_SUBQ:
              { Everything except the direction and interrupt bits }
              Result:=
                ([Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                  Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                  Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
                  Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
                  Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
                  Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag
                 ]*insprop[taicpu(p1).opcode].Ch)<>[];
            else
              ;
          end;
          if result then
            exit;
        end
      else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
        exit(true);
      Result:=inherited RegInInstruction(Reg, p1);
    end;

  function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
    const
      WriteOps: array[0..3] of set of TInsChange =
        ([CH_RWOP1,CH_WOP1,CH_MOP1],
         [Ch_RWOP2,Ch_WOP2,Ch_MOP2],
         [Ch_RWOP3,Ch_WOP3,Ch_MOP3],
         [Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
    var
      OperIdx: Integer;
    begin
      Result := False;
      if p1.typ <> ait_instruction then
        exit;
      with insprop[taicpu(p1).opcode] do
        if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
          begin
            case getsubreg(reg) of
              R_SUBW,R_SUBD,R_SUBQ:
                Result :=
                  [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                   Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
                   Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
                   Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
                   Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                   Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGCARRY:
                Result:=[Ch_WCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WUCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_WParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WUParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_WAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_WZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WUZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_WSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WUSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_WOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WUOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
              else
                internalerror(2017042602);
            end;
            exit;
          end;
      case taicpu(p1).opcode of
        A_CALL:
          { We could potentially set Result to False if the register in
            question is non-volatile for the subroutine's calling convention,
            but this would require detecting the calling convention in use and
            also assuming that the routine doesn't contain malformed assembly
            language, for example... so it could only be done under -O4 as it
            would be considered a side-effect. [Kit] }
          Result := True;
        A_MOVSD:
          { special handling for SSE MOVSD }
          if (taicpu(p1).ops>0) then
            begin
              if taicpu(p1).ops<>2 then
                internalerror(2017042703);
              Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
            end;
        { VMOVSS and VMOVSD have two- and three-operand flavours; this cannot
          be modelled by x86ins.dat, so fix it here (FK) }
        A_VMOVSS,
        A_VMOVSD:
          begin
            Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
            exit;
          end;
        A_MUL, A_DIV, A_IDIV:
          begin
            if taicpu(p1).opsize = S_B then
              Result := (getsupreg(Reg) = RS_EAX)
            else
              Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
          end;
        A_IMUL:
          begin
            if taicpu(p1).ops = 1 then
              begin
                Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
              end
            else
              Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
            Exit;
          end;
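        { e.g. (illustrative): "imull $3,%ecx,%edx" modifies only %edx among
          the general-purpose registers (flag writes are handled by the
          NR_DEFAULTFLAGS branch above), while the single-operand form
          "imull %ecx" modifies %edx:%eax. }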
        else
          ;
      end;
      if Result then
        exit;
      with insprop[taicpu(p1).opcode] do
        begin
          if getregtype(reg)=R_INTREGISTER then
            begin
              case getsupreg(reg) of
                RS_EAX:
                  if [Ch_WEAX,Ch_RWEAX,Ch_MEAX,Ch_WRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ECX:
                  if [Ch_WECX,Ch_RWECX,Ch_MECX,Ch_WRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDX:
                  if [Ch_WEDX,Ch_RWEDX,Ch_MEDX,Ch_WRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBX:
                  if [Ch_WEBX,Ch_RWEBX,Ch_MEBX,Ch_WRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESP:
                  if [Ch_WESP,Ch_RWESP,Ch_MESP,Ch_WRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EBP:
                  if [Ch_WEBP,Ch_RWEBP,Ch_MEBP,Ch_WRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_ESI:
                  if [Ch_WESI,Ch_RWESI,Ch_MESI,Ch_WRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
                RS_EDI:
                  if [Ch_WEDI,Ch_RWEDI,Ch_MEDI,Ch_WRDI,Ch_RWRDI,Ch_MRDI]*Ch<>[] then
                    begin
                      Result := True;
                      exit
                    end;
              end;
            end;
          for OperIdx := 0 to taicpu(p1).ops - 1 do
            if (WriteOps[OperIdx]*Ch<>[]) and
              { The register doesn't get modified inside a reference }
              (taicpu(p1).oper[OperIdx]^.typ = top_reg) and
              SuperRegistersEqual(reg,taicpu(p1).oper[OperIdx]^.reg) then
              begin
                Result := true;
                exit
              end;
        end;
    end;

  function TX86AsmOptimizer.RefModifiedBetween(Ref: TReference; RefSize: ASizeInt; p1, p2: tai): Boolean;
    const
      WriteOps: array[0..3] of set of TInsChange =
        ([CH_RWOP1,CH_WOP1,CH_MOP1],
         [Ch_RWOP2,Ch_WOP2,Ch_MOP2],
         [Ch_RWOP3,Ch_WOP3,Ch_MOP3],
         [Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
    var
      X: Integer;
      CurrentP1Size: asizeint;
    begin
      Result := (
          (Ref.base <> NR_NO) and
  {$ifdef x86_64}
          (Ref.base <> NR_RIP) and
  {$endif x86_64}
          RegModifiedBetween(Ref.base, p1, p2)
        ) or
        (
          (Ref.index <> NR_NO) and
          (Ref.index <> Ref.base) and
          RegModifiedBetween(Ref.index, p1, p2)
        );
      { Now check to see if the memory itself is written to }
      if not Result then
        begin
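          { Overlap test sketch (illustrative): with Ref.offset = 8 and
            RefSize = 4, a 2-byte store at offset 9 passes both
            (9 - 8) < 4 and (8 - 9) < 2, i.e. the byte ranges [8..11] and
            [9..10] intersect, so the reference counts as modified. }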
          while assigned(p1) and assigned(p2) and GetNextInstruction(p1,p1) and (p1<>p2) do
            if p1.typ = ait_instruction then
              begin
                CurrentP1Size := topsize2memsize[taicpu(p1).opsize] shr 3; { Convert bits to bytes }
                with insprop[taicpu(p1).opcode] do
                  for X := 0 to taicpu(p1).ops - 1 do
                    if (taicpu(p1).oper[X]^.typ = top_ref) and
                      RefsAlmostEqual(Ref, taicpu(p1).oper[X]^.ref^) and
                      { Catch any potential overlaps }
                      (
                        (RefSize = 0) or
                        ((taicpu(p1).oper[X]^.ref^.offset - Ref.offset) < RefSize)
                      ) and
                      (
                        (CurrentP1Size = 0) or
                        ((Ref.offset - taicpu(p1).oper[X]^.ref^.offset) < CurrentP1Size)
                      ) and
                      { The reference is used, but does the instruction write to it? }
                      (
                        (Ch_All in Ch) or
                        ((WriteOps[X] * Ch) <> [])
                      ) then
                      begin
                        Result := True;
                        Break;
                      end;
              end;
        end;
    end;

  {$ifdef DEBUG_AOPTCPU}
  procedure TX86AsmOptimizer.DebugMsg(const s: string; p: tai);
    begin
      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
    end;

  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := tostr(i);
    end;

  function debug_hexstr(i: tcgint): string;
    begin
      Result := '0x';
      case i of
        0..$FF:
          Result := Result + hexstr(i, 2);
        $100..$FFFF:
          Result := Result + hexstr(i, 4);
        $10000..$FFFFFF:
          Result := Result + hexstr(i, 6);
        $1000000..$FFFFFFFF:
          Result := Result + hexstr(i, 8);
        else
          Result := Result + hexstr(i, 16);
      end;
    end;

  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '%' + std_regname(r);
    end;

  { Debug output function - creates a string representation of an operand }
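  { e.g. (illustrative): a memory operand with offset 8, base %eax, index
    %edx and scale factor 4 renders in AT&T style as "8(%eax,%edx,4)" }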
  function debug_operstr(oper: TOper): string;
    begin
      case oper.typ of
        top_const:
          Result := '$' + debug_tostr(oper.val);
        top_reg:
          Result := debug_regname(oper.reg);
        top_ref:
          begin
            if oper.ref^.offset <> 0 then
              Result := debug_tostr(oper.ref^.offset) + '('
            else
              Result := '(';
            if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
              begin
                Result := Result + debug_regname(oper.ref^.base);
                if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                  Result := Result + ',' + debug_regname(oper.ref^.index);
              end
            else
              if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
                Result := Result + debug_regname(oper.ref^.index);
            if (oper.ref^.scalefactor > 1) then
              Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
            else
              Result := Result + ')';
          end;
        else
          Result := '[UNKNOWN]';
      end;
    end;

  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := std_op2str[opcode];
    end;

  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := gas_opsize2str[opsize];
    end;

  {$else DEBUG_AOPTCPU}
  procedure TX86AsmOptimizer.DebugMsg(const s: string; p: tai); inline;
    begin
    end;

  function debug_tostr(i: tcgint): string; inline;
    begin
      Result := '';
    end;

  function debug_hexstr(i: tcgint): string; inline;
    begin
      Result := '';
    end;

  function debug_regname(r: TRegister): string; inline;
    begin
      Result := '';
    end;

  function debug_operstr(oper: TOper): string; inline;
    begin
      Result := '';
    end;

  function debug_op2str(opcode: tasmop): string; inline;
    begin
      Result := '';
    end;

  function debug_opsize2str(opsize: topsize): string; inline;
    begin
      Result := '';
    end;
  {$endif DEBUG_AOPTCPU}

  class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
    begin
  {$ifdef x86_64}
      { Always fine on x86-64 }
      Result := True;
  {$else x86_64}
      Result :=
  {$ifdef i8086}
        (current_settings.cputype >= cpu_386) and
  {$endif i8086}
        (
          { Always accept if optimising for size }
          (cs_opt_size in current_settings.optimizerswitches) or
          { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
          (current_settings.optimizecputype >= cpu_Pentium2)
        );
  {$endif x86_64}
    end;

  { Attempts to allocate a volatile integer register for use between p and hp,
    using AUsedRegs for the current register usage information. Returns NR_NO
    if no free register could be found }
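  { Usage sketch (hypothetical caller, for illustration only):
      ScratchReg := GetIntRegisterBetween(R_SUBD, TmpUsedRegs, p, hp1);
      if ScratchReg <> NR_NO then
        ... rewrite the code between p and hp1 to use ScratchReg ...
    The register is reserved via AllocRegBetween unless DontAlloc is True. }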
  function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
    var
      RegSet: TCPURegisterSet;
      CurrentSuperReg: Integer;
      CurrentReg: TRegister;
      Currentp: tai;
      Breakout: Boolean;
    begin
      Result := NR_NO;
      RegSet :=
        paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption) +
        current_procinfo.saved_regs_int;
      (*
      { Don't use the frame register unless explicitly allowed (fixes i40111) }
      if ([cs_useebp, cs_userbp] * current_settings.optimizerswitches) = [] then
        Exclude(RegSet, RS_FRAME_POINTER_REG);
      *)
      for CurrentSuperReg in RegSet do
        begin
          CurrentReg := newreg(R_INTREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
          if not AUsedRegs[R_INTREGISTER].IsUsed(CurrentReg)
  {$if defined(i386) or defined(i8086)}
            { If the target size is 8-bit, make sure we can actually encode it }
            and (
              (RegSize >= R_SUBW) or { Not R_SUBL or R_SUBH }
              (GetSupReg(CurrentReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
            )
  {$endif i386 or i8086}
            then
            begin
              Currentp := p;
              Breakout := False;
              while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
                begin
                  case Currentp.typ of
                    ait_instruction:
                      begin
                        if RegInInstruction(CurrentReg, Currentp) then
                          begin
                            Breakout := True;
                            Break;
                          end;
                        { Cannot allocate across an unconditional jump }
                        if is_calljmpuncondret(taicpu(Currentp).opcode) then
                          Exit;
                      end;
                    ait_marker:
                      { Don't try anything more if a marker is hit }
                      Exit;
                    ait_regalloc:
                      if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
                        begin
                          Breakout := True;
                          Break;
                        end;
                    else
                      ;
                  end;
                end;
              if Breakout then
                { Try the next register }
                Continue;
              { We have a free register available }
              Result := CurrentReg;
              if not DontAlloc then
                AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
              Exit;
            end;
        end;
    end;

  { Attempts to allocate a volatile MM register for use between p and hp,
    using AUsedRegs for the current register usage information. Returns NR_NO
    if no free register could be found }
  function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
    var
      RegSet: TCPURegisterSet;
      CurrentSuperReg: Integer;
      CurrentReg: TRegister;
      Currentp: tai;
      Breakout: Boolean;
    begin
      Result := NR_NO;
      RegSet :=
        paramanager.get_volatile_registers_mm(current_procinfo.procdef.proccalloption) +
        current_procinfo.saved_regs_mm;
      for CurrentSuperReg in RegSet do
        begin
          CurrentReg := newreg(R_MMREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
          if not AUsedRegs[R_MMREGISTER].IsUsed(CurrentReg) then
            begin
              Currentp := p;
              Breakout := False;
              while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
                begin
                  case Currentp.typ of
                    ait_instruction:
                      begin
                        if RegInInstruction(CurrentReg, Currentp) then
                          begin
                            Breakout := True;
                            Break;
                          end;
                        { Cannot allocate across an unconditional jump }
                        if is_calljmpuncondret(taicpu(Currentp).opcode) then
                          Exit;
                      end;
                    ait_marker:
                      { Don't try anything more if a marker is hit }
                      Exit;
                    ait_regalloc:
                      if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
                        begin
                          Breakout := True;
                          Break;
                        end;
                    else
                      ;
                  end;
                end;
              if Breakout then
                { Try the next register }
                Continue;
              { We have a free register available }
              Result := CurrentReg;
              if not DontAlloc then
                AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
              Exit;
            end;
        end;
    end;

  class function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
    begin
      if not SuperRegistersEqual(reg1,reg2) then
        exit(false);
      if getregtype(reg1)<>R_INTREGISTER then
        exit(true); { because SuperRegistersEqual is true }
      case getsubreg(reg1) of
        { A write to R_SUBL doesn't change R_SUBH and, if reg2 is R_SUBW or
          higher, it preserves the high bits, so the new value depends on
          reg2's previous value. In other words, it is equivalent to doing:
            reg2 := (reg2 and $ffffff00) or byte(reg1); }
        R_SUBL:
          exit(getsubreg(reg2)=R_SUBL);
        { A write to R_SUBH doesn't change R_SUBL and, if reg2 is R_SUBW or
          higher, it actually does a:
            reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
        R_SUBH:
          exit(getsubreg(reg2)=R_SUBH);
        { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
          bits of reg2:
            reg2 := (reg2 and $ffff0000) or word(reg1); }
        R_SUBW:
          exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
        { A write to R_SUBD always overwrites every other subregister,
          because it clears the high 32 bits of R_SUBQ on x86_64 }
        R_SUBD,
        R_SUBQ:
          exit(true);
        else
          internalerror(2017042801);
      end;
    end;

  class function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
    begin
      if not SuperRegistersEqual(reg1,reg2) then
        exit(false);
      if getregtype(reg1)<>R_INTREGISTER then
        exit(true); { because SuperRegistersEqual is true }
      case getsubreg(reg1) of
        R_SUBL:
          exit(getsubreg(reg2)<>R_SUBH);
        R_SUBH:
          exit(getsubreg(reg2)<>R_SUBL);
        R_SUBW,
        R_SUBD,
        R_SUBQ:
          exit(true);
        else
          internalerror(2017042802);
      end;
    end;

  function TX86AsmOptimizer.PrePeepholeOptSxx(var p: tai): boolean;
    var
      hp1: tai;
      l: TCGInt;
    begin
      result:=false;
      if not(GetNextInstruction(p, hp1)) then
        exit;
      { changes the code sequence
          shr/sar const1, x
          shl     const2, x
        to either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
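      { Worked example (illustrative, 32-bit): with const1 = const2 = 3,
          shrl $3,%eax
          shll $3,%eax
        collapses to the single instruction
          andl $0xFFFFFFF8,%eax
        i.e. the pair merely clears the low three bits. }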
      if (taicpu(p).oper[0]^.typ = top_const) and
        MatchInstruction(hp1,A_SHL,[]) and
        (taicpu(hp1).oper[0]^.typ = top_const) and
        (taicpu(hp1).opsize = taicpu(p).opsize) and
        (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
        OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
        begin
          if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
            not(cs_opt_size in current_settings.optimizerswitches)
  {$ifdef x86_64}
            and (
              (taicpu(p).opsize <> S_Q) or
              { 64-bit AND can only store signed 32-bit immediates }
              (taicpu(p).oper[0]^.val < 32)
            )
  {$endif x86_64}
            then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 > const2 }
              DebugMsg(SPeepholeOptimization + 'SxrShl2SxrAnd 1 done',p);
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
              taicpu(hp1).opcode := A_AND;
              l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
              case taicpu(p).opsize of
                S_B: taicpu(hp1).loadConst(0,l xor $ff);
                S_W: taicpu(hp1).loadConst(0,l xor $ffff);
                S_L: taicpu(hp1).loadConst(0,l xor tcgint($ffffffff));
                S_Q: taicpu(hp1).loadConst(0,l xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050703)
              end;
            end
          else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
            not(cs_opt_size in current_settings.optimizerswitches)
  {$ifdef x86_64}
            and (
              (taicpu(p).opsize <> S_Q) or
              { 64-bit AND can only store signed 32-bit immediates }
              (taicpu(p).oper[0]^.val < 32)
            )
  {$endif x86_64}
            then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 < const2 }
              DebugMsg(SPeepholeOptimization + 'SxrShl2SxrAnd 2 done',p);
              taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
              taicpu(p).opcode := A_AND;
              l := (1 shl (taicpu(p).oper[0]^.val))-1;
              case taicpu(p).opsize of
                S_B: taicpu(p).loadConst(0,l xor $ff);
                S_W: taicpu(p).loadConst(0,l xor $ffff);
                S_L: taicpu(p).loadConst(0,l xor tcgint($ffffffff));
                S_Q: taicpu(p).loadConst(0,l xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050702)
              end;
            end
          else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val)
  {$ifdef x86_64}
            and (
              (taicpu(p).opsize <> S_Q) or
              { 64-bit AND can only store signed 32-bit immediates }
              (taicpu(p).oper[0]^.val < 32)
            )
  {$endif x86_64}
            then
            begin
              { shr/sar const1, %reg
                shl     const2, %reg
                with const1 = const2 }
              DebugMsg(SPeepholeOptimization + 'SxrShl2And done',p);
              taicpu(p).opcode := A_AND;
              l := (1 shl (taicpu(p).oper[0]^.val))-1;
              case taicpu(p).opsize of
                S_B: taicpu(p).loadConst(0,l xor $ff);
                S_W: taicpu(p).loadConst(0,l xor $ffff);
                S_L: taicpu(p).loadConst(0,l xor tcgint($ffffffff));
                S_Q: taicpu(p).loadConst(0,l xor tcgint($ffffffffffffffff));
                else
                  Internalerror(2017050701)
              end;
              RemoveInstruction(hp1);
            end;
        end;
    end;

  function TX86AsmOptimizer.PrePeepholeOptIMUL(var p: tai): boolean;
    var
      opsize: topsize;
      hp1, hp2: tai;
      tmpref: treference;
      ShiftValue: Cardinal;
      BaseValue: TCGInt;
    begin
      result:=false;
      opsize:=taicpu(p).opsize;
      { changes certain "imul const, %reg"'s to lea sequences }
      if (MatchOpType(taicpu(p),top_const,top_reg) or
        MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
        (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
        if (taicpu(p).oper[0]^.val = 1) then
          if (taicpu(p).ops = 2) then
            { remove "imul $1, reg" }
            begin
              DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
              Result := RemoveCurrentP(p);
            end
          else
            { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
            begin
              hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
              taicpu(hp1).fileinfo := taicpu(p).fileinfo;
              asml.InsertAfter(hp1, p);
              DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
              RemoveCurrentP(p, hp1);
              Result := True;
            end
        else if ((taicpu(p).ops <= 2) or
          (taicpu(p).oper[2]^.typ = Top_Reg)) and
          not(cs_opt_size in current_settings.optimizerswitches) and
          (not(GetNextInstruction(p, hp1)) or
           not((tai(hp1).typ = ait_instruction) and
               ((taicpu(hp1).opcode=A_Jcc) and
                (taicpu(hp1).condition in [C_O,C_NO])))) then
          begin
            {
              imul X, reg1, reg2   to   lea (reg1,reg1,Y), reg2
                                        shl ZZ,reg2
              imul XX, reg1        to   lea (reg1,reg1,YY), reg1
                                        shl ZZ,reg1

              This optimization makes sense for pretty much every x86 except
              the VIA Nano3000: it has an IMUL latency of 2, the same as the
              lea/shl pair, but it does not exist as a separate optimization
              target in FPC anyway.
              The optimization can be applied as long as only two bits are set
              in the constant and those two bits are separated by at most two
              zeros.
            }
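            { Worked example (illustrative): "imull $20,%eax,%edx" has
              PopCnt(20) = 2 ($14 = %10100), giving ShiftValue = 2 and
              BaseValue = 5, so the sequence becomes
                leal (%eax,%eax,4),%edx
                shll $2,%edx
              since (%eax * 5) shl 2 = %eax * 20. }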
            reference_reset(tmpref,1,[]);
            if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
              begin
                ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
                BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
                TmpRef.base := taicpu(p).oper[1]^.reg;
                TmpRef.index := taicpu(p).oper[1]^.reg;
                if not(BaseValue in [3,5,9]) then
                  Internalerror(2018110101);
                TmpRef.ScaleFactor := BaseValue-1;
                if (taicpu(p).ops = 2) then
                  hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
                else
                  hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
                AsmL.InsertAfter(hp1,p);
                DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
                taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
                RemoveCurrentP(p, hp1);
                if ShiftValue>0 then
                  begin
                    hp2 := taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg);
                    AsmL.InsertAfter(hp2,hp1);
                    taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                  end;
                Result := True;
              end;
          end;
    end;

  function TX86AsmOptimizer.PrePeepholeOptAND(var p: tai): boolean;
    begin
      Result := False;
      if MatchOperand(taicpu(p).oper[0]^, 0) and
        not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
        begin
          DebugMsg(SPeepholeOptimization + 'AND 0 -> MOV 0', p);
          taicpu(p).opcode := A_MOV;
          Result := True;
        end;
    end;

  function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu absolute hp; { Implicit typecast }
      i: Integer;
    begin
      Result := False;
      if not assigned(hp) or
        (hp.typ <> ait_instruction) then
        Exit;
      Prefetch(insprop[p.opcode]);
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        with insprop[p.opcode] do
          begin
            case getsubreg(reg) of
              R_SUBW,R_SUBD,R_SUBQ:
                Result:=
                  { ZF, CF, OF, SF, PF and AF must all be set in some way
                    (ordered so that the most uncommon flags are checked first) }
                  ([Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch <> []) and
                  ([Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch <> []) and
                  ([Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch <> []) and
                  ([Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch <> []) and
                  ([Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch <> []) and
                  ([Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch <> []);
              R_SUBFLAGCARRY:
                Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
              else
                internalerror(2017050501);
            end;
            exit;
          end;
      { Handle special cases first }
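      { e.g. (illustrative): "movl $0,%eax" gives %eax a value independent of
        its previous contents, whereas "movb %cl,%al" only replaces part of
        %eax, and "movl (%eax),%eax" still depends on the old value through
        the reference, so only the first counts as loading a new value. }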
      case p.opcode of
        A_MOV, A_MOVZX, A_MOVSX, A_LEA, A_VMOVSS, A_VMOVSD, A_VMOVAPD,
        A_VMOVAPS, A_VMOVQ, A_MOVSS, A_MOVSD, A_MOVQ, A_MOVAPD, A_MOVAPS:
          begin
            Result :=
              (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
              (p.oper[1]^.typ = top_reg) and
              (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
              (
                (p.oper[0]^.typ = top_const) or
                (
                  (p.oper[0]^.typ = top_reg) and
                  not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))
                ) or (
                  (p.oper[0]^.typ = top_ref) and
                  not RegInRef(reg,p.oper[0]^.ref^)
                )
              );
          end;
        A_MUL, A_IMUL:
          Result :=
            (
              (p.ops=3) and { IMUL only }
              (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
              (
                (
                  (p.oper[1]^.typ=top_reg) and
                  not Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg)
                ) or (
                  (p.oper[1]^.typ=top_ref) and
                  not RegInRef(reg,p.oper[1]^.ref^)
                )
              )
            ) or (
              (
                (p.ops=1) and
                (
                  (
                    (
                      (p.oper[0]^.typ=top_reg) and
                      not Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg)
                    )
                  ) or (
                    (p.oper[0]^.typ=top_ref) and
                    not RegInRef(reg,p.oper[0]^.ref^)
                  )
                ) and (
                  (
                    (p.opsize=S_B) and
                    Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and
                    not Reg1ReadDependsOnReg2(NR_AL,reg)
                  ) or (
                    (p.opsize=S_W) and
                    Reg1WriteOverwritesReg2Entirely(NR_DX,reg)
                  ) or (
                    (p.opsize=S_L) and
                    Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)
  {$ifdef x86_64}
                  ) or (
                    (p.opsize=S_Q) and
                    Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)
  {$endif x86_64}
                  )
                )
              )
            );
        A_CBW:
          Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg));
  {$ifndef x86_64}
        A_LDS:
          Result := (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^));
        A_LES:
          Result := (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^));
  {$endif not x86_64}
        A_LFS:
          Result := (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^));
        A_LGS:
          Result := (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^));
        A_LSS:
          Result := (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^));
        A_LAHF{$ifndef x86_64}, A_AAM{$endif not x86_64}:
          Result := Reg1WriteOverwritesReg2Entirely(NR_AH,reg);
        A_LODSB:
          Result := Reg1WriteOverwritesReg2Entirely(NR_AL,reg);
        A_LODSW:
          Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg);
  {$ifdef x86_64}
        A_LODSQ:
          Result := Reg1WriteOverwritesReg2Entirely(NR_RAX,reg);
  {$endif x86_64}
        A_LODSD:
          Result := Reg1WriteOverwritesReg2Entirely(NR_EAX,reg);
        A_FSTSW, A_FNSTSW:
          Result := (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg);
        else
          begin
            with insprop[p.opcode] do
              begin
                if (
                  { xor %reg,%reg etc. is classed as a new value }
                  (([Ch_NoReadIfEqualRegs]*Ch)<>[]) and
                  MatchOpType(p, top_reg, top_reg) and
                  (p.oper[0]^.reg = p.oper[1]^.reg) and
                  Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)
                ) then
                  begin
                    Result := True;
                    Exit;
                  end;
                { Make sure the entire register is overwritten }
                if (getregtype(reg) = R_INTREGISTER) then
                  begin
                    if (p.ops > 0) then
                      begin
                        if RegInOp(reg, p.oper[0]^) then
                          begin
                            if (p.oper[0]^.typ = top_ref) then
                              begin
                                if RegInRef(reg, p.oper[0]^.ref^) then
                                  begin
                                    Result := False;
                                    Exit;
                                  end;
                              end
                            else if (p.oper[0]^.typ = top_reg) then
                              begin
                                if ([Ch_ROp1, Ch_RWOp1, Ch_MOp1]*Ch<>[]) then
                                  begin
                                    Result := False;
                                    Exit;
                                  end
                                else if ([Ch_WOp1]*Ch<>[]) then
                                  begin
                                    if Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg, reg) then
                                      Result := True
                                    else
                                      begin
                                        Result := False;
                                        Exit;
                                      end;
                                  end;
                              end;
                          end;
                        if (p.ops > 1) then
                          begin
                            if RegInOp(reg, p.oper[1]^) then
                              begin
                                if (p.oper[1]^.typ = top_ref) then
                                  begin
                                    if RegInRef(reg, p.oper[1]^.ref^) then
                                      begin
                                        Result := False;
                                        Exit;
                                      end;
                                  end
                                else if (p.oper[1]^.typ = top_reg) then
                                  begin
                                    if ([Ch_ROp2, Ch_RWOp2, Ch_MOp2]*Ch<>[]) then
                                      begin
                                        Result := False;
                                        Exit;
                                      end
                                    else if ([Ch_WOp2]*Ch<>[]) then
                                      begin
                                        if Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg, reg) then
                                          Result := True
                                        else
                                          begin
                                            Result := False;
                                            Exit;
                                          end;
                                      end;
                                  end;
                              end;
                            if (p.ops > 2) then
                              begin
                                if RegInOp(reg, p.oper[2]^) then
                                  begin
                                    if (p.oper[2]^.typ = top_ref) then
                                      begin
                                        if RegInRef(reg, p.oper[2]^.ref^) then
                                          begin
                                            Result := False;
                                            Exit;
                                          end;
                                      end
                                    else if (p.oper[2]^.typ = top_reg) then
                                      begin
                                        if ([Ch_ROp3, Ch_RWOp3, Ch_MOp3]*Ch<>[]) then
                                          begin
                                            Result := False;
                                            Exit;
                                          end
                                        else if ([Ch_WOp3]*Ch<>[]) then
                                          begin
                                            if Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg, reg) then
                                              Result := True
                                            else
                                              begin
                                                Result := False;
                                                Exit;
                                              end;
                                          end;
                                      end;
                                  end;
                                if (p.ops > 3) and RegInOp(reg, p.oper[3]^) then
                                  begin
                                    if (p.oper[3]^.typ = top_ref) then
                                      begin
                                        if RegInRef(reg, p.oper[3]^.ref^) then
                                          begin
                                            Result := False;
                                            Exit;
                                          end;
                                      end
                                    else if (p.oper[3]^.typ = top_reg) then
                                      begin
                                        if ([Ch_ROp4, Ch_RWOp4, Ch_MOp4]*Ch<>[]) then
                                          begin
                                            Result := False;
                                            Exit;
                                          end
                                        else if ([Ch_WOp4]*Ch<>[]) then
                                          begin
                                            if Reg1WriteOverwritesReg2Entirely(p.oper[3]^.reg, reg) then
                                              Result := True
                                            else
                                              begin
                                                Result := False;
                                                Exit;
                                              end;
                                          end;
                                      end;
                                  end;
                              end;
                          end;
                      end;
                    { Don't do these ones first in case an input operand is
                      equal to an explicit output register }
                    case getsupreg(reg) of
                      RS_EAX:
                        if ([Ch_WEAX{$ifdef x86_64},Ch_WRAX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EAX, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_ECX:
                        if ([Ch_WECX{$ifdef x86_64},Ch_WRCX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ECX, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_EDX:
                        if ([Ch_WEDX{$ifdef x86_64},Ch_WRDX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDX, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_EBX:
                        if ([Ch_WEBX{$ifdef x86_64},Ch_WRBX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBX, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_ESP:
                        if ([Ch_WESP{$ifdef x86_64},Ch_WRSP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESP, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_EBP:
                        if ([Ch_WEBP{$ifdef x86_64},Ch_WRBP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBP, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_ESI:
                        if ([Ch_WESI{$ifdef x86_64},Ch_WRSI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESI, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      RS_EDI:
                        if ([Ch_WEDI{$ifdef x86_64},Ch_WRDI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDI, reg) then
                          begin
                            Result := True;
                            Exit;
                          end;
                      else
                        ;
                    end;
                  end;
              end;
          end;
      end;
    end;

  class function TX86AsmOptimizer.IsExitCode(p: tai): boolean;
    var
      hp2, hp3: tai;
    begin
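      { Recognised epilogue shapes (illustrative summary of the checks below,
        shown here in i386-flavoured AT&T syntax):
          ret
          leave; ret
          lea x(%esp),%esp; ret
          mov <framepointer>,%esp; pop <framepointer>; ret
          (or the lea-based equivalent of the mov) }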
      { some x86-64 targets issue a NOP before the real exit code }
      if MatchInstruction(p,A_NOP,[]) then
        GetNextInstruction(p,p);
      result:=assigned(p) and (p.typ=ait_instruction) and
        ((taicpu(p).opcode = A_RET) or
         ((taicpu(p).opcode=A_LEAVE) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_RET,[S_NO])
         ) or
         (((taicpu(p).opcode=A_LEA) and
           MatchOpType(taicpu(p),top_ref,top_reg) and
           (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
           (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
          ) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_RET,[S_NO])
         ) or
         ((((taicpu(p).opcode=A_MOV) and
            MatchOpType(taicpu(p),top_reg,top_reg) and
            (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
            (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
           ((taicpu(p).opcode=A_LEA) and
            MatchOpType(taicpu(p),top_ref,top_reg) and
            (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
            (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
           )
          ) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
          MatchOpType(taicpu(hp2),top_reg) and
          (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
          GetNextInstruction(hp2,hp3) and
          MatchInstruction(hp3,A_RET,[S_NO])
         )
        );
    end;

  class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
    begin
      isFoldableArithOp := False;
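      { e.g. (illustrative): for reg = %eax, "addl $4,%eax" and "negl %eax"
        are foldable, but "addl %eax,%edx" (wrong destination) and
        "addl %eax,%eax" (reg is also used as the source) are not. }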
      case hp1.opcode of
        A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
          isFoldableArithOp :=
            ((taicpu(hp1).oper[0]^.typ = top_const) or
             ((taicpu(hp1).oper[0]^.typ = top_reg) and
              (taicpu(hp1).oper[0]^.reg <> reg))) and
            (taicpu(hp1).oper[1]^.typ = top_reg) and
            (taicpu(hp1).oper[1]^.reg = reg);
        A_INC,A_DEC,A_NEG,A_NOT:
          isFoldableArithOp :=
            (taicpu(hp1).oper[0]^.typ = top_reg) and
            (taicpu(hp1).oper[0]^.reg = reg);
        else
          ;
      end;
    end;

  procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);

    procedure DoRemoveLastDeallocForFuncRes(supreg: tsuperregister);
      var
        hp2: tai;
      begin
        hp2 := p;
        repeat
          hp2 := tai(hp2.previous);
          if assigned(hp2) and
            (hp2.typ = ait_regalloc) and
            (tai_regalloc(hp2).ratype=ra_dealloc) and
            (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
            (getsupreg(tai_regalloc(hp2).reg) = supreg) then
            begin
              RemoveInstruction(hp2);
              break;
            end;
        until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
      end;

    begin
      case current_procinfo.procdef.returndef.typ of
        arraydef,recorddef,pointerdef,
        stringdef,enumdef,procdef,objectdef,errordef,
        filedef,setdef,procvardef,
        classrefdef,forwarddef:
          DoRemoveLastDeallocForFuncRes(RS_EAX);
        orddef:
          if current_procinfo.procdef.returndef.size <> 0 then
            begin
              DoRemoveLastDeallocForFuncRes(RS_EAX);
              { for int64/qword }
              if current_procinfo.procdef.returndef.size = 8 then
                DoRemoveLastDeallocForFuncRes(RS_EDX);
            end;
        else
          ;
      end;
    end;

  function TX86AsmOptimizer.OptPass1CMOVcc(var p: tai): Boolean;
    var
      hp1: tai;
      operswap: poper;
    begin
      Result := False;
      { Optimise:
          cmov(c)  %reg1,%reg2
          mov      %reg2,%reg1
          (%reg2 dealloc.)
        To:
          cmov(~c) %reg2,%reg1
      }
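      { e.g. (illustrative): "cmovel %eax,%edx; movl %edx,%eax", with %edx
        deallocated afterwards, becomes the single "cmovnel %edx,%eax". }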
      if (taicpu(p).oper[0]^.typ = top_reg) then
        while GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) and
          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
          MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) do
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
            if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
              begin
                DebugMsg(SPeepholeOptimization + 'CMOV(c) %reg1,%reg2; MOV %reg2,%reg1 -> CMOV(~c) %reg2,%reg1 (CMovMov2CMov)', p);
                { Save time by swapping the pointers (they're both registers, so
                  we don't need to worry about reference counts) }
                operswap := taicpu(p).oper[0];
                taicpu(p).oper[0] := taicpu(p).oper[1];
                taicpu(p).oper[1] := operswap;
                taicpu(p).condition := inverse_cond(taicpu(p).condition);
                RemoveInstruction(hp1);
                { It's still a CMOV, so we can look further ahead }
                Include(OptsToCheck, aoc_ForceNewIteration);
                { But first, let's see if this will get optimised again
                  (probably won't happen, but best to be sure) }
                Continue;
              end;
            Break;
          end;
    end;
  2217. function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
  2218. var
  2219. hp1,hp2 : tai;
  2220. begin
  2221. result:=false;
  2222. if MatchOpType(taicpu(p),top_reg,top_reg) then
  2223. begin
  2224. { vmova* reg1,reg1
  2225. =>
  2226. <nop> }
  2227. if taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg then
  2228. begin
  2229. RemoveCurrentP(p);
  2230. result:=true;
  2231. exit;
  2232. end;
  2233. if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
  2234. (hp1.typ = ait_instruction) and
  2235. (
  2236. { Under -O2 and below, the instructions are always adjacent }
  2237. not (cs_opt_level3 in current_settings.optimizerswitches) or
  2238. (taicpu(hp1).ops <= 1) or
  2239. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
  2240. { If reg1 = reg3, reg1 must not be modified in between }
  2241. not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
  2242. ) then
  2243. begin
  2244. if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
  2245. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2246. begin
  2247. { vmova* reg1,reg2
  2248. ...
  2249. vmova* reg2,reg3
  2250. dealloc reg2
  2251. =>
  2252. vmova* reg1,reg3 }
  2253. TransferUsedRegs(TmpUsedRegs);
  2254. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2255. if MatchOpType(taicpu(hp1),top_reg,top_reg) and
  2256. not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1) and
  2257. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2258. begin
  2259. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
  2260. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2261. TransferUsedRegs(TmpUsedRegs);
  2262. AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, TmpUsedRegs);
  2263. RemoveInstruction(hp1);
  2264. result:=true;
  2265. exit;
  2266. end;
  2267. { special case:
  2268. vmova* reg1,<op>
  2269. ...
  2270. vmova* <op>,reg1
  2271. =>
  2272. vmova* reg1,<op> }
  2273. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
  2274. ((taicpu(p).oper[0]^.typ<>top_ref) or
  2275. (not(vol_read in taicpu(p).oper[0]^.ref^.volatility))
  2276. ) then
  2277. begin
  2278. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
  2279. RemoveInstruction(hp1);
  2280. result:=true;
  2281. exit;
  2282. end
  2283. end
  2284. else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
  2285. MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
  2286. ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
  2287. MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
  2288. ) and
  2289. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2290. begin
  2291. { vmova* reg1,reg2
  2292. ...
  2293. vmovs* reg2,<op>
  2294. dealloc reg2
  2295. =>
  2296. vmovs* reg1,<op> }
  2297. TransferUsedRegs(TmpUsedRegs);
  2298. UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
  2299. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2300. begin
  2301. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
  2302. taicpu(p).opcode:=taicpu(hp1).opcode;
  2303. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2304. TransferUsedRegs(TmpUsedRegs);
  2305. AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, TmpUsedRegs);
  2306. RemoveInstruction(hp1);
  2307. result:=true;
  2308. exit;
  2309. end
  2310. end;
            if MatchInstruction(hp1,[A_VFMADDPD,
                A_VFMADD132PD,
                A_VFMADD132PS,
                A_VFMADD132SD,
                A_VFMADD132SS,
                A_VFMADD213PD,
                A_VFMADD213PS,
                A_VFMADD213SD,
                A_VFMADD213SS,
                A_VFMADD231PD,
                A_VFMADD231PS,
                A_VFMADD231SD,
                A_VFMADD231SS,
                A_VFMADDSUB132PD,
                A_VFMADDSUB132PS,
                A_VFMADDSUB213PD,
                A_VFMADDSUB213PS,
                A_VFMADDSUB231PD,
                A_VFMADDSUB231PS,
                A_VFMSUB132PD,
                A_VFMSUB132PS,
                A_VFMSUB132SD,
                A_VFMSUB132SS,
                A_VFMSUB213PD,
                A_VFMSUB213PS,
                A_VFMSUB213SD,
                A_VFMSUB213SS,
                A_VFMSUB231PD,
                A_VFMSUB231PS,
                A_VFMSUB231SD,
                A_VFMSUB231SS,
                A_VFMSUBADD132PD,
                A_VFMSUBADD132PS,
                A_VFMSUBADD213PD,
                A_VFMSUBADD213PS,
                A_VFMSUBADD231PD,
                A_VFMSUBADD231PS,
                A_VFNMADD132PD,
                A_VFNMADD132PS,
                A_VFNMADD132SD,
                A_VFNMADD132SS,
                A_VFNMADD213PD,
                A_VFNMADD213PS,
                A_VFNMADD213SD,
                A_VFNMADD213SS,
                A_VFNMADD231PD,
                A_VFNMADD231PS,
                A_VFNMADD231SD,
                A_VFNMADD231SS,
                A_VFNMSUB132PD,
                A_VFNMSUB132PS,
                A_VFNMSUB132SD,
                A_VFNMSUB132SS,
                A_VFNMSUB213PD,
                A_VFNMSUB213PS,
                A_VFNMSUB213SD,
                A_VFNMSUB213SS,
                A_VFNMSUB231PD,
                A_VFNMSUB231PS,
                A_VFNMSUB231SD,
                A_VFNMSUB231SS],[S_NO]) and
              { we mix single and double operations here because we assume that the compiler
                generates vmovapd only after double operations and vmovaps only after single operations }
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^.reg) and
              GetNextInstructionUsingReg(hp1, hp2, taicpu(hp1).oper[2]^.reg) and
              MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp2);
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                    if (cs_opt_level3 in current_settings.optimizerswitches) then
                      RemoveCurrentP(p)
                    else
                      RemoveCurrentP(p, hp1); // hp1 is guaranteed to be the immediate next instruction in this case.
                    RemoveInstruction(hp2);
                  end;
              end
            else if (hp1.typ = ait_instruction) and
              (((taicpu(p).opcode=A_MOVAPS) and
                ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                 (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
               ((taicpu(p).opcode=A_MOVAPD) and
                ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                 (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
              ) and
              GetNextInstructionUsingReg(hp1, hp2, taicpu(hp1).oper[1]^.reg) and
              MatchInstruction(hp2,taicpu(p).opcode,[]) and
              OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
              MatchOpType(taicpu(hp2),top_reg,top_reg) and
              MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) then
              { change
                  movapX reg,reg2
                  addsX/subsX/... reg3,reg2
                  movapX reg2,reg
                to
                  addsX/subsX/... reg3,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp2);
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                      debug_op2str(taicpu(p).opcode)+' '+
                      debug_op2str(taicpu(hp1).opcode)+' '+
                      debug_op2str(taicpu(hp2).opcode)+') done',p);
                    { we cannot eliminate the first move if
                      the operation uses the same register for source and dest }
                    if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                      { Remember that hp1 is not necessarily the immediate
                        next instruction }
                      RemoveCurrentP(p);
                    taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                    RemoveInstruction(hp2);
                    result:=true;
                  end;
              end
            else if (hp1.typ = ait_instruction) and
              (((taicpu(p).opcode=A_VMOVAPD) and
                (taicpu(hp1).opcode=A_VCOMISD)) or
               ((taicpu(p).opcode=A_VMOVAPS) and
                ((taicpu(hp1).opcode=A_VCOMISS))
               )
              ) and not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
              { change
                  movapX reg,reg1
                  vcomisX reg1,reg1
                to
                  vcomisX reg,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXComisX2ComisX2 ('+
                      debug_op2str(taicpu(p).opcode)+' '+
                      debug_op2str(taicpu(hp1).opcode)+') done',p);
                    if OpsEqual(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                      taicpu(hp1).loadoper(0, taicpu(p).oper[0]^);
                    if OpsEqual(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
                      taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
                    RemoveCurrentP(p);
                    result:=true;
                    exit;
                  end;
              end
          end;
      end;
  end;


function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    result:=false;
    { replace
        V<Op>X  %mreg1,%mreg2,%mreg3
        VMovX   %mreg3,%mreg4
        dealloc %mreg3
      by
        V<Op>X  %mreg1,%mreg2,%mreg4
      ?
    }
    if GetNextInstruction(p,hp1) and
      { we mix single and double operations here because we assume that the compiler
        generates vmovapd only after double operations and vmovaps only after single operations }
      MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
      MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
          begin
            taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;


{ Replaces all references to AOldReg in a memory reference with ANewReg }
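{ A minimal illustration (register names chosen arbitrarily): with
  AOldReg=%rdx and ANewReg=%rcx, the reference 8(%rdx,%rax,4) becomes
  8(%rcx,%rax,4).  The index is only replaced if the new register may
  legally appear as an index (i.e. it is not the stack pointer). }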
class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  begin
    Result := False;
    { For safety reasons, only check for exact register matches }
    { Check base register }
    if (ref.base = AOldReg) then
      begin
        ref.base := ANewReg;
        Result := True;
      end;
    { Check index register }
    if (ref.index = AOldReg) and (getsupreg(ANewReg)<>RS_ESP) then
      begin
        ref.index := ANewReg;
        Result := True;
      end;
  end;


{ Replaces all references to AOldReg in an operand with ANewReg }
class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  var
    OldSupReg, NewSupReg: TSuperRegister;
    OldSubReg, NewSubReg: TSubRegister;
    OldRegType: TRegisterType;
    ThisOper: POper;
  begin
    ThisOper := p.oper[OperIdx]; { Faster to access overall }
    Result := False;
    if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
      InternalError(2020011801);
    OldSupReg := getsupreg(AOldReg);
    OldSubReg := getsubreg(AOldReg);
    OldRegType := getregtype(AOldReg);
    NewSupReg := getsupreg(ANewReg);
    NewSubReg := getsubreg(ANewReg);
    if OldRegType <> getregtype(ANewReg) then
      InternalError(2020011802);
    if OldSubReg <> NewSubReg then
      InternalError(2020011803);
    case ThisOper^.typ of
      top_reg:
        if (
            (ThisOper^.reg = AOldReg) or
            (
              (OldRegType = R_INTREGISTER) and
              (getsupreg(ThisOper^.reg) = OldSupReg) and
              (getregtype(ThisOper^.reg) = R_INTREGISTER) and
              (
                (getsubreg(ThisOper^.reg) <= OldSubReg)
{$ifndef x86_64}
                and (
                  { Under i386 and i8086, ESI, EDI, EBP and ESP
                    don't have an 8-bit representation }
                  (getsubreg(ThisOper^.reg) >= R_SUBW) or
                  not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
                )
{$endif x86_64}
              )
            )
          ) then
          begin
            ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
            Result := True;
          end;
      top_ref:
        if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
          Result := True;
      else
        ;
    end;
  end;


{ Replaces all references to AOldReg in an instruction with ANewReg }
class function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  const
    ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
  var
    OperIdx: Integer;
  begin
    Result := False;
    for OperIdx := 0 to p.ops - 1 do
      if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) then
        begin
          { The shift and rotate instructions can only use CL }
          if not (
              (OperIdx = 0) and
              { This second condition just helps to avoid unnecessarily
                calling MatchInstruction for 10 different opcodes }
              (p.oper[0]^.reg = NR_CL) and
              MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
            ) then
            Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
        end
      else if p.oper[OperIdx]^.typ = top_ref then
        { It's okay to replace registers in references that get written to }
        Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  end;
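
{ Heuristic: accept only references that should be safe to dereference
  unconditionally (e.g. when a load is hoisted or speculated) - absolute
  addresses, stack- or frame-pointer slots and, on x86-64, RIP-relative
  PIC references, all without an index register. }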
class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean;
  begin
    Result :=
      (ref^.index = NR_NO) and
      (
{$ifdef x86_64}
        (
          (ref^.base = NR_RIP) and
          (ref^.refaddr in [addr_pic, addr_pic_no_got])
        ) or
{$endif x86_64}
        (ref^.refaddr = addr_full) or
        (ref^.base = NR_STACK_POINTER_REG) or
        (ref^.base = current_procinfo.framepointer)
      );
  end;
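
{ Converts "lea x(reg),reg" into plain arithmetic, e.g. (illustrative):
    lea 1(%eax),%eax    -> inc %eax       (when UseIncDec allows it)
    lea -24(%eax),%eax  -> sub $24,%eax
  The stack pointer is normally left alone (adjusting it via LEA is
  preferred unless optimising for size), as are references that carry an
  index register or a symbol. }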
function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
  var
    l: asizeint;
  begin
    Result := False;
    { Should have been checked previously }
    if p.opcode <> A_LEA then
      InternalError(2020072501);
    { do not mess with the stack pointer, as adjusting it via lea is recommended,
      except when optimising for size }
    if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
      not(cs_opt_size in current_settings.optimizerswitches) then
      exit;
    with p.oper[0]^.ref^ do
      begin
        if (base <> p.oper[1]^.reg) or
          (index <> NR_NO) or
          assigned(symbol) then
          exit;
        l:=offset;
        if (l=1) and UseIncDec then
          begin
            p.opcode:=A_INC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
          end
        else if (l=-1) and UseIncDec then
          begin
            p.opcode:=A_DEC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
          end
        else
          begin
            if (l<0) and (l<>-2147483648) then
              begin
                p.opcode:=A_SUB;
                p.loadConst(0,-l);
                DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
              end
            else
              begin
                p.opcode:=A_ADD;
                p.loadConst(0,l);
                DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
              end;
          end;
      end;
    Result := True;
  end;
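
{ Given "mov %reg1,%reg2" in p_mov, tries to replace reads of %reg2 in hp
  with %reg1 so that hp no longer depends on the MOV, e.g. (illustrative):
    movl %esi,%eax            movl %esi,%eax
    addl %eax,%edx      ->    addl %esi,%edx
  If %eax then becomes unused, the caller can remove the MOV entirely. }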
function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  var
    CurrentReg, ReplaceReg: TRegister;
  begin
    Result := False;
    ReplaceReg := taicpu(p_mov).oper[0]^.reg;
    CurrentReg := taicpu(p_mov).oper[1]^.reg;
    case hp.opcode of
      A_FSTSW, A_FNSTSW,
      A_IN, A_INS, A_OUT, A_OUTS,
      A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
        { These routines have explicit operands, but they are restricted in
          what they can be (e.g. IN and OUT can only read from AL, AX or
          EAX). }
        Exit;
      A_IMUL:
        begin
          { The 1-operand version writes to implicit registers.
            The 2-operand version reads from the first operand, and reads
            from and writes to the second (equivalent to Ch_Rop1, Ch_RWOp2).
            The 3-operand version reads from a register that it doesn't write to. }
          case hp.ops of
            1:
              if (
                  (
                    (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                  ) or
                  not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
                ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            2:
              { Only modify the first parameter }
              if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            3:
              { Only modify the second parameter }
              if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            else
              InternalError(2020012901);
          end;
        end;
      else
        if (hp.ops > 0) and
          ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
          begin
            Result := True;
            DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
            AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
          end;
    end;
  end;


function TX86AsmOptimizer.FuncMov2Func(var p: tai; const hp1: tai): Boolean;
  var
    hp2, hp_regalloc: tai;
    p_SourceReg, p_TargetReg: TRegister;
  begin
    Result := False;
    { Backward optimisation. If we have:
        func. %reg1,%reg2
        mov %reg2,%reg3
        (dealloc %reg2)
      Change to:
        func. %reg1,%reg3 (see comment below for what a valid func. is)
      Perform similar optimisations with 1, 3 and 4-operand instructions
      that only have one output.
    }
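    { e.g. (illustrative):
        popcnt %ecx,%eax
        mov    %eax,%ebx
        (dealloc %eax)
      becomes
        popcnt %ecx,%ebx }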
    if MatchOpType(taicpu(p), top_reg, top_reg) then
      begin
        p_SourceReg := taicpu(p).oper[0]^.reg;
        p_TargetReg := taicpu(p).oper[1]^.reg;
        TransferUsedRegs(TmpUsedRegs);
        if not RegUsedAfterInstruction(p_SourceReg, p, TmpUsedRegs) and
          GetLastInstruction(p, hp2) and
          (hp2.typ = ait_instruction) and
          { Have to make sure it's an instruction that only reads from
            the first operands and only writes (not reads or modifies) to
            the last one; in essence, a pure function such as BSR, POPCNT
            or ANDN }
          (
            (
              (taicpu(hp2).ops = 1) and
              (insprop[taicpu(hp2).opcode].Ch * [Ch_Wop1] = [Ch_Wop1])
            ) or
            (
              (taicpu(hp2).ops = 2) and
              (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Wop2] = [Ch_Rop1, Ch_Wop2])
            ) or
            (
              (taicpu(hp2).ops = 3) and
              (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Rop2, Ch_Wop3] = [Ch_Rop1, Ch_Rop2, Ch_Wop3])
            ) or
            (
              (taicpu(hp2).ops = 4) and
              (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Wop4] = [Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Wop4])
            )
          ) and
          (taicpu(hp2).oper[taicpu(hp2).ops-1]^.typ = top_reg) and
          (taicpu(hp2).oper[taicpu(hp2).ops-1]^.reg = p_SourceReg) then
          begin
            case taicpu(hp2).opcode of
              A_FSTSW, A_FNSTSW,
              A_IN, A_INS, A_OUT, A_OUTS,
              A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
                { These routines have explicit operands, but they are restricted in
                  what they can be (e.g. IN and OUT can only read from AL, AX or
                  EAX). }
                ;
              else
                begin
                  DebugMsg(SPeepholeOptimization + 'Removed MOV and changed destination on previous instruction to optimise register usage (FuncMov2Func)', p);
                  { if %reg2 (p_SourceReg) is allocated before func., remove it completely }
                  hp_regalloc := FindRegAllocBackward(p_SourceReg, hp2);
                  if Assigned(hp_regalloc) then
                    begin
                      Asml.Remove(hp_regalloc);
                      if Assigned(FindRegDealloc(p_SourceReg, p)) then
                        begin
                          ExcludeRegFromUsedRegs(p_SourceReg, UsedRegs);
                          hp_regalloc.Free;
                        end
                      else
                        { If the register is not explicitly deallocated, it's
                          being reused, so move the allocation to after func. }
                        AsmL.InsertAfter(hp_regalloc, hp2);
                    end;
                  if not RegInInstruction(p_TargetReg, hp2) then
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      AllocRegBetween(p_TargetReg, hp2, p, TmpUsedRegs);
                    end;
                  { Actually make the changes }
                  taicpu(hp2).oper[taicpu(hp2).ops-1]^.reg := p_TargetReg;
                  RemoveCurrentp(p, hp1);
                  { If the Func was another MOV instruction, we might get
                    "mov %reg,%reg" that doesn't get removed in Pass 2
                    otherwise, so deal with it here (also do something
                    similar with lea (%reg),%reg) }
                  if (taicpu(hp2).opcode = A_MOV) and MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp2).oper[1]^.reg) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'Mov2Nop 1a done', hp2);
                      if p = hp2 then
                        RemoveCurrentp(p)
                      else
                        RemoveInstruction(hp2);
                    end;
                  Result := True;
                  Exit;
                end;
            end;
          end;
      end;
  end;
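
{ With two loads from the same non-volatile memory location, the second
  load can become a register-to-register copy, e.g. (illustrative):
    mov mem,%reg1           mov mem,%reg1
    mov mem,%reg2     ->    mov %reg1,%reg2
  provided %reg1 is not part of the reference itself. }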
function TX86AsmOptimizer.CheckMovMov2MovMov2(const p, hp1: tai) : boolean;
  begin
    Result := False;
    if MatchOpType(taicpu(p),top_ref,top_reg) and
      MatchOpType(taicpu(hp1),top_ref,top_reg) and
      (taicpu(p).opsize = taicpu(hp1).opsize) and
      RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
      (taicpu(p).oper[0]^.ref^.volatility=[]) and
      (taicpu(hp1).oper[0]^.ref^.volatility=[]) and
      not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
      not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
      begin
        DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
        taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
        Result := True;
        Include(OptsToCheck, aoc_ForceNewIteration);
      end;
  end;


function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  var
    hp1, hp2, hp3, hp4, last_hp1: tai;
    GetNextInstruction_p, DoOptimisation, TempBool: Boolean;
    p_SourceReg, p_TargetReg, NewMMReg: TRegister;
{$ifdef x86_64}
    NewConst: TCGInt;
{$endif x86_64}
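  { Helper for folding "mov $const,%reg" with a following sign- or
    zero-extension: rewrites the constant to its signed representation for
    the signed variant, otherwise masks it to the unsigned range. }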
  procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
    begin
      if taicpu(hp1).opcode = signed_movop then
        begin
          if taicpu(p).oper[0]^.val > max_value shr 1 then
            taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
        end
      else
        taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
    end;
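  { Fetches the instruction for p to be paired with: at -O3 (on later
    iterations) the next instruction that uses the target register,
    otherwise simply the next instruction; True if it is an assembly
    instruction. }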
  function GetNextHp1(const in_p: tai): Boolean;
    begin
      if NotFirstIteration and (cs_opt_level3 in current_settings.optimizerswitches) then
        GetNextInstruction_p := GetNextInstructionUsingReg(in_p, hp1, p_TargetReg)
      else
        GetNextInstruction_p := GetNextInstruction(in_p, hp1);
      Result := GetNextInstruction_p and (hp1.typ = ait_instruction);
    end;
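  { Merges two adjacent constant writes to the stack into a single wider
    write, e.g. (illustrative; x86 is little-endian, so the second
    constant supplies the high-order byte):
      movb $0x12,-2(%rbp)              movw $0x3412,-2(%rbp)
      movb $0x34,-1(%rbp)        ->
    The first write must lie on a suitably aligned offset. }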
  function TryConstMerge(var p1, p2: tai): Boolean;
    var
      ThisRef: TReference;
    begin
      Result := False;
      ThisRef := taicpu(p2).oper[1]^.ref^;
      { Only permit writes to the stack, since we can guarantee alignment with that }
      if (ThisRef.index = NR_NO) and
        (
          (ThisRef.base = NR_STACK_POINTER_REG) or
          (ThisRef.base = current_procinfo.framepointer)
        ) then
        begin
          case taicpu(p).opsize of
            S_B:
              begin
                { Word writes must be on a 2-byte boundary }
                if (taicpu(p1).oper[1]^.ref^.offset mod 2) = 0 then
                  begin
                    { Reduce offset of second reference to see if it is sequential with the first }
                    Dec(ThisRef.offset, 1);
                    if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                      begin
                        { Make sure the constants aren't represented as a
                          negative number, as these won't merge properly }
                        taicpu(p1).opsize := S_W;
                        taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FF) or ((taicpu(p2).oper[0]^.val and $FF) shl 8);
                        DebugMsg(SPeepholeOptimization + 'Merged two byte-sized constant writes to stack (MovMov2Mov 2a)', p1);
                        RemoveInstruction(p2);
                        Result := True;
                      end;
                  end;
              end;
            S_W:
              begin
                { Longword writes must be on a 4-byte boundary }
                if (taicpu(p1).oper[1]^.ref^.offset mod 4) = 0 then
                  begin
                    { Reduce offset of second reference to see if it is sequential with the first }
                    Dec(ThisRef.offset, 2);
                    if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                      begin
                        { Make sure the constants aren't represented as a
                          negative number, as these won't merge properly }
                        taicpu(p1).opsize := S_L;
                        taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FFFF) or ((taicpu(p2).oper[0]^.val and $FFFF) shl 16);
                        DebugMsg(SPeepholeOptimization + 'Merged two word-sized constant writes to stack (MovMov2Mov 2b)', p1);
                        RemoveInstruction(p2);
                        Result := True;
                      end;
                  end;
              end;
{$ifdef x86_64}
            S_L:
              begin
                { Only sign-extended 32-bit constants can be written to 64-bit memory directly, so check to
                  see if the constants can be encoded this way. }
                NewConst := (taicpu(p1).oper[0]^.val and $FFFFFFFF) or (taicpu(p2).oper[0]^.val shl 32);
                if (NewConst >= -2147483648) and (NewConst <= 2147483647) and
                  { Quadword writes must be on an 8-byte boundary }
                  ((taicpu(p1).oper[1]^.ref^.offset mod 8) = 0) then
                  begin
                    { Reduce offset of second reference to see if it is sequential with the first }
                    Dec(ThisRef.offset, 4);
                    if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                      begin
                        { Make sure the constants aren't represented as a
                          negative number, as these won't merge properly }
                        taicpu(p1).opsize := S_Q;
                        { Force a typecast into a 32-bit signed integer (that will then be sign-extended to 64-bit) }
                        taicpu(p1).oper[0]^.val := NewConst;
                        DebugMsg(SPeepholeOptimization + 'Merged two longword-sized constant writes to stack (MovMov2Mov 2c)', p1);
                        RemoveInstruction(p2);
                        Result := True;
                      end;
                  end;
              end;
{$endif x86_64}
            else
              ;
          end;
        end;
    end;
  var
    TempRegUsed, CrossJump: Boolean;
    PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
    NewSize: topsize;
    NewOffset: asizeint;
    SourceRef, TargetRef: TReference;
    MovAligned, MovUnaligned: TAsmOp;
    JumpTracking: TLinkedList;
  begin
    Result:=false;
    { remove mov reg1,reg1? }
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
        { take care of the register (de)allocs following p }
        RemoveCurrentP(p);
        Result := True;
        exit;
      end;
    { Prevent compiler warnings }
    p_SourceReg := NR_NO;
    p_TargetReg := NR_NO;
    hp1 := nil;
    if taicpu(p).oper[1]^.typ = top_reg then
      begin
        { Saves on a large number of dereferences }
        p_TargetReg := taicpu(p).oper[1]^.reg;
        TransferUsedRegs(TmpUsedRegs);
        last_hp1 := p;
        if GetNextHp1(p) then
          while True do
            begin
              if (taicpu(hp1).opcode = A_AND) and
                (taicpu(hp1).oper[1]^.typ = top_reg) and
                SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[1]^.reg) then
                begin
                  UpdateUsedRegsBetween(TmpUsedRegs, last_hp1, hp1);
                  if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                    (taicpu(hp1).oper[0]^.typ = top_const) and
                    (taicpu(p).opsize = taicpu(hp1).opsize) then
                    begin
                      case taicpu(p).opsize of
                        S_L:
                          if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                            begin
                              { Optimize out:
                                  mov x, %reg
                                  and ffffffffh, %reg
                              }
                              DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
                              hp2 := tai(hp1.Previous);
                              RemoveInstruction(hp1);
                              //Include(OptsToCheck, aoc_ForceNewIteration);
                              if GetNextHp1(hp2) then
                                Continue
                              else
                                Exit;
                            end;
                        S_Q: { TODO: Confirm if this is even possible }
                          if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
                            begin
                              { Optimize out:
                                  mov x, %reg
                                  and ffffffffffffffffh, %reg
                              }
                              DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
                              hp2 := tai(hp1.Previous);
                              RemoveInstruction(hp1);
                              //Include(OptsToCheck, aoc_ForceNewIteration);
                              if GetNextHp1(hp2) then
                                Continue
                              else
                                Exit;
                            end;
                        else
                          ;
                      end;
                      if (
                          { Make sure that if a reference is used, its registers
                            are not modified in between }
                          (
                            (taicpu(p).oper[0]^.typ = top_reg) and
                            not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
                          ) or
                          (
                            (taicpu(p).oper[0]^.typ = top_ref) and
                            (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
                            not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1)
                          )
                        ) and
                        GetNextInstruction(hp1,hp2) and
                        MatchInstruction(hp2,A_TEST,[]) and
                        (
                          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) or
                          (
                            { If the register being tested is smaller than the one
                              that received a bitwise AND, permit it if the constant
                              fits into the smaller size }
                            (taicpu(hp1).oper[1]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
                            SuperRegistersEqual(taicpu(hp1).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) and
                            (taicpu(hp1).oper[0]^.typ = top_const) and (taicpu(hp1).oper[0]^.val >= 0) and
                            (GetSubReg(taicpu(hp2).oper[1]^.reg) < GetSubReg(taicpu(hp1).oper[1]^.reg)) and
                            (
                              (
                                (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBL) and
                                (taicpu(hp1).oper[0]^.val <= $FF)
                              ) or
                              (
                                (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBW) and
                                (taicpu(hp1).oper[0]^.val <= $FFFF)
{$ifdef x86_64}
                              ) or
                              (
                                (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBD) and
                                (taicpu(hp1).oper[0]^.val <= $FFFFFFFF)
{$endif x86_64}
                              )
                            )
                          )
                        ) and
                        (
                          MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) or
                          MatchOperand(taicpu(hp2).oper[0]^,-1)
                        ) and
                        GetNextInstruction(hp2,hp3) and
                        MatchInstruction(hp3,A_Jcc,A_Setcc,[]) and
                        (taicpu(hp3).condition in [C_E,C_NE]) then
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                          if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
                            begin
                              DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
                              taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
                              taicpu(hp1).opcode:=A_TEST;
                              { Shrink the TEST instruction down to the smallest possible size }
                              case taicpu(hp1).oper[0]^.val of
                                0..255:
                                  if (taicpu(hp1).opsize <> S_B)
{$ifndef x86_64}
                                    and (
                                      (taicpu(hp1).oper[1]^.typ <> top_reg) or
                                      { Cannot encode byte-sized ESI, EDI, EBP or ESP under i386 }
                                      (GetSupReg(taicpu(hp1).oper[1]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])
                                    )
{$endif x86_64}
                                  then
                                    begin
                                      if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                        { Only print debug message if the TEST instruction
                                          is a different size before and after }
                                        DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testb to reduce instruction size (Test2Test 1a)' , p);
                                      taicpu(hp1).opsize := S_B;
                                      if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                        setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBL);
                                    end;
                                256..65535:
                                  if (taicpu(hp1).opsize <> S_W) then
                                    begin
                                      if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                        { Only print debug message if the TEST instruction
                                          is a different size before and after }
                                        DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testw to reduce instruction size (Test2Test 1b)' , p);
                                      taicpu(hp1).opsize := S_W;
                                      if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                        setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBW);
                                    end;
{$ifdef x86_64}
                                65536..$7FFFFFFF:
                                  if (taicpu(hp1).opsize <> S_L) then
                                    begin
                                      if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                        { Only print debug message if the TEST instruction
                                          is a different size before and after }
                                        DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testl to reduce instruction size (Test2Test 1c)' , p);
                                      taicpu(hp1).opsize := S_L;
                                      if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                        setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                                    end;
{$endif x86_64}
                                else
                                  ;
                              end;
                              RemoveInstruction(hp2);
                              RemoveCurrentP(p);
                              Result:=true;
                              exit;
                            end;
                        end;
                    end;
                  if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                    (taicpu(p).opsize = taicpu(hp1).opsize) and
                    (taicpu(hp1).oper[0]^.typ <> top_ref) and
                    MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
                    MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) and
                    (
                      not (cs_opt_level3 in current_settings.optimizerswitches) or
                      (taicpu(hp1).oper[0]^.typ = top_const) or
                      not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)
                    ) then
                    begin
                      { With:
                          mov %reg1,%reg2
                          ...
                          and %reg1,%reg2
                        Or:
                          mov $x,%reg2
                          ...
                          and $x,%reg2
                        Remove the 'and' instruction
                      }
                      DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 4 done',hp1);
                      hp2 := tai(hp1.Previous);
                      RemoveInstruction(hp1);
                      //Include(OptsToCheck, aoc_ForceNewIteration);
                      if GetNextHp1(hp2) then
                        Continue
                      else
                        Exit;
                    end;
                  if IsMOVZXAcceptable and
                    (taicpu(p).oper[0]^.typ <> top_const) then { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
                    begin
                      InputVal := debug_operstr(taicpu(p).oper[0]^);
                      MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
                      case taicpu(p).opsize of
                        S_B:
                          if (taicpu(hp1).oper[0]^.val = $ff) then
                            begin
                              { Convert:
                                  movb x, %regl        movb x, %regl
                                  andw ffh, %regw      andl ffh, %regd
                                To:
                                  movzbw x, %regd      movzbl x, %regd
                                (Identical registers, just different sizes)
                              }
                              RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
                              RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
                              case taicpu(hp1).opsize of
                                S_W: NewSize := S_BW;
                                S_L: NewSize := S_BL;
{$ifdef x86_64}
                                S_Q: NewSize := S_BQ;
{$endif x86_64}
                                else
                                  InternalError(2018011510);
                              end;
                            end
                          else
                            NewSize := S_NO;
                        S_W:
                          if (taicpu(hp1).oper[0]^.val = $ffff) then
                            begin
                              { Convert:
                                  movw x, %regw
                                  andl ffffh, %regd
                                To:
                                  movzwl x, %regd
                                (Identical registers, just different sizes)
                              }
                              RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
                              RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
                              case taicpu(hp1).opsize of
                                S_L: NewSize := S_WL;
{$ifdef x86_64}
                                S_Q: NewSize := S_WQ;
{$endif x86_64}
                                else
                                  InternalError(2018011511);
                              end;
                            end
                          else
                            NewSize := S_NO;
                        else
                          NewSize := S_NO;
                      end;
                      if NewSize <> S_NO then
                        begin
                          PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
                          { The actual optimization }
                          taicpu(p).opcode := A_MOVZX;
                          taicpu(p).changeopsize(NewSize);
                          taicpu(p).loadoper(1, taicpu(hp1).oper[1]^);
                          { Make sure we deal with any reference counts that were increased }
                          if taicpu(hp1).oper[1]^.typ = top_ref then
                            begin
                              if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
                                taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
                              if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
                                taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
                            end;
                          { Safeguard if "and" is followed by a conditional command }
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
                          if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                            begin
                              { At this point, the "and" command is effectively equivalent to
                                "test %reg,%reg". This will be handled separately by the
                                Peephole Optimizer. [Kit] }
                              DebugMsg(SPeepholeOptimization + PreMessage +
                                ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                            end
                          else
                            begin
                              DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
                                ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                              RemoveInstruction(hp1);
                            end;
                          Result := True;
                          Exit;
                          { Go through DeepMOVOpt again (jump to "while True do") }
                          Continue;
                        end;
                    end;
                end;
              if taicpu(p).oper[0]^.typ = top_reg then
                begin
                  p_SourceReg := taicpu(p).oper[0]^.reg;
                  { Look for:
                      mov %reg1,%reg2
                      ??? %reg2,r/m
                    Change to:
                      mov %reg1,%reg2
                      ??? %reg1,r/m
                  }
                  if RegReadByInstruction(p_TargetReg, hp1) and
                    not RegModifiedBetween(p_SourceReg, p, hp1) and
                    DeepMOVOpt(taicpu(p), taicpu(hp1)) then
                    begin
                      { A change has occurred, just not in p }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                      if not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) and
                        { Just in case something didn't get modified (e.g. an
                          implicit register) }
                        not RegReadByInstruction(p_TargetReg, hp1) then
                        begin
                          { We can remove the original MOV }
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
                          RemoveCurrentP(p);
                          { UsedRegs got updated by RemoveCurrentp }
                          Result := True;
                          Exit;
                        end;
                      { If we know a MOV instruction has become a null operation, we might as well
                        get rid of it now to save time. }
                      if (taicpu(hp1).opcode = A_MOV) and
                        (taicpu(hp1).oper[1]^.typ = top_reg) and
                        SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
                        { Just being a register is enough to confirm it's a null operation }
                        (taicpu(hp1).oper[0]^.typ = top_reg) then
                        begin
                          Result := True;
                          { Speed-up to reduce a pipeline stall... if we had something like...
                              movl %eax,%edx
                              movw %dx,%ax
                            ... the second instruction would change to movw %ax,%ax, but
                            given that it is now %ax that's active rather than %eax,
                            penalties might occur due to a partial register write, so instead,
                            change it to a MOVZX instruction when optimising for speed.
                          }
                          if not (cs_opt_size in current_settings.optimizerswitches) and
                            IsMOVZXAcceptable and
                            (taicpu(hp1).opsize < taicpu(p).opsize)
{$ifdef x86_64}
                            { operations already implicitly set the upper 64 bits to zero }
                            and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
{$endif x86_64}
                            then
                            begin
                              DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
                              case taicpu(p).opsize of
                                S_W:
                                  if taicpu(hp1).opsize = S_B then
                                    taicpu(hp1).opsize := S_BL
                                  else
                                    InternalError(2020012911);
                                S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
                                  case taicpu(hp1).opsize of
                                    S_B:
                                      taicpu(hp1).opsize := S_BL;
                                    S_W:
                                      taicpu(hp1).opsize := S_WL;
                                    else
                                      InternalError(2020012912);
                                  end;
                                else
                                  InternalError(2020012910);
                              end;
                              taicpu(hp1).opcode := A_MOVZX;
                              setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                            end
                          else
                            begin
                              GetNextInstruction_p := GetNextInstruction(hp1, hp2);
                              DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
                              RemoveInstruction(hp1);
                              { The instruction after what was hp1 is now the immediate next instruction,
                                so we can continue to make optimisations if it's present }
                              if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
                                Exit;
                              hp1 := hp2;
                            end;
                        end;
                    end;
{$ifdef x86_64}
                  { Change:
                      movl %reg1l,%reg2l
                      movq %reg2q,%reg1q
                    To:
                      movl %reg1l,%reg2l
                      andl %reg1l,%reg1l
                  }
                  if (taicpu(p).opsize = S_L) and MatchInstruction(hp1,A_MOV,[S_Q]) and
                    not RegModifiedBetween(p_SourceReg, p, hp1) and
                    MatchOpType(taicpu(hp1), top_reg, top_reg) and
                    SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[0]^.reg) and
                    SuperRegistersEqual(p_SourceReg, taicpu(hp1).oper[1]^.reg) then
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                      taicpu(hp1).opsize := S_L;
                      taicpu(hp1).loadreg(0, p_SourceReg);
                      setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                      AllocRegBetween(p_SourceReg, p, hp1, UsedRegs);
                      DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlAndl 1)', hp1);
                      taicpu(hp1).opcode := A_AND;
                      { We may be able to do more and replace references
                        to %reg2q with %reg1q etc. }
                      if (cs_opt_level3 in current_settings.optimizerswitches) and
                        { p_TargetReg is not used between, otherwise the earlier
                          GetNextInstructionUsingReg would have stopped sooner }
                        DoZeroUpper32Opt(p,hp1) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    end;
                  {
                    If we have the following already in the code...
                      movl %reg1l,%reg2l
                      andl %reg1l,%reg1l
                    ...we may be able to do more and replace references to
                    %reg2q with %reg1q etc. (program flow won't reach this
                    point if the second instruction was originally a MOV
                    and just got changed to AND)
                  }
                  if (cs_opt_level3 in current_settings.optimizerswitches) and
                    (taicpu(p).opsize = S_L) and MatchInstruction(hp1,A_AND,[S_L]) and
                    not RegModifiedBetween(p_SourceReg, p, hp1) and
                    { p_TargetReg is not used between, otherwise the earlier
                      GetNextInstructionUsingReg would have stopped sooner }
                    MatchOperand(taicpu(hp1).oper[1]^, p_SourceReg) and
                    (
                      MatchOperand(taicpu(hp1).oper[0]^, p_SourceReg) or
                      MatchOperand(taicpu(hp1).oper[0]^, $ffffffff)
                    ) and
                    DoZeroUpper32Opt(p,hp1) then
                    begin
                      Result := True;
                      Exit;
                    end;
{$endif x86_64}
                end
              else if taicpu(p).oper[0]^.typ = top_const then
                begin
                  if (taicpu(hp1).opcode = A_OR) and
                    (taicpu(p).oper[1]^.typ = top_reg) and
                    MatchOperand(taicpu(p).oper[0]^, 0) and
                    MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
                    begin
                      { mov 0, %reg
                        or  ###,%reg
                        Change to (only if the flags are not used):
                        mov ###,%reg
                      }
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                      DoOptimisation := True;
                      { Even if the flags are used, we might be able to do the optimisation
                        if the conditions are predictable }
                      if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                        begin
                          { Only perform if ### = %reg (the same register) or equal to 0,
                            so %reg is guaranteed to still have a value of zero }
                          if MatchOperand(taicpu(hp1).oper[0]^, 0) or
                            MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) then
                            begin
                              hp2 := hp1;
                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                              while RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                                GetNextInstruction(hp2, hp3) do
                                begin
                                  { Don't continue modifying if the flags state is getting changed }
                                  if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp3) then
                                    Break;
                                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                  if MatchInstruction(hp3, A_Jcc, A_SETcc, A_CMOVcc, []) then
                                    begin
                                      if condition_in(C_E, taicpu(hp3).condition) or (taicpu(hp3).condition in [C_NC, C_NS, C_NO]) then
                                        begin
                                          { Condition is always true }
                                          case taicpu(hp3).opcode of
                                            A_Jcc:
                                              begin
                                                { Check for jump shortcuts before we destroy the condition }
                                                hp4 := hp3;
                                                DoJumpOptimizations(hp3, TempBool);
                                                { Make sure hp3 hasn't changed }
                                                if (hp4 = hp3) then
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always true (jump made unconditional)', hp3);
                                                    MakeUnconditional(taicpu(hp3));
                                                  end;
                                                Result := True;
                                              end;
                                            A_CMOVcc:
                                              begin
                                                DebugMsg(SPeepholeOptimization + 'Condition is always true (CMOVcc -> MOV)', hp3);
                                                taicpu(hp3).opcode := A_MOV;
                                                taicpu(hp3).condition := C_None;
                                                Result := True;
                                              end;
                                            A_SETcc:
                                              begin
                                                DebugMsg(SPeepholeOptimization + 'Condition is always true (changed to MOV 1)', hp3);
                                                { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                                                taicpu(hp3).opcode := A_MOV;
                                                taicpu(hp3).ops := 2;
                                                taicpu(hp3).condition := C_None;
                                                taicpu(hp3).opsize := S_B;
                                                taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
                                                taicpu(hp3).loadconst(0, 1);
                                                Result := True;
                                              end;
                                            else
                                              InternalError(2021090701);
                                          end;
                                        end
                                      else if (taicpu(hp3).condition in [C_A, C_B, C_C, C_G, C_L, C_NE, C_NZ, C_O, C_S]) then
                                        begin
                                          { Condition is always false }
                                          case taicpu(hp3).opcode of
                                            A_Jcc:
                                              begin
                                                DebugMsg(SPeepholeOptimization + 'Condition is always false (jump removed)', hp3);
                                                TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol).decrefs;
                                                RemoveInstruction(hp3);
                                                Result := True;
                                                { Since hp3 was deleted, hp2 must not be updated }
                                                Continue;
                                              end;
                                            A_CMOVcc:
                                              begin
                                                DebugMsg(SPeepholeOptimization + 'Condition is always false (conditional load removed)', hp3);
                                                RemoveInstruction(hp3);
                                                Result := True;
                                                { Since hp3 was deleted, hp2 must not be updated }
                                                Continue;
                                              end;
                                            A_SETcc:
                                              begin
                                                DebugMsg(SPeepholeOptimization + 'Condition is always false (changed to MOV 0)', hp3);
                                                { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                                                taicpu(hp3).opcode := A_MOV;
                                                taicpu(hp3).ops := 2;
                                                taicpu(hp3).condition := C_None;
                                                taicpu(hp3).opsize := S_B;
                                                taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
                                                taicpu(hp3).loadconst(0, 0);
                                                Result := True;
                                              end;
                                            else
                                              InternalError(2021090702);
                                          end;
                                        end
                                      else
                                        { Uncertain what to do - don't optimise (although optimise other conditional statements if present) }
                                        DoOptimisation := False;
                                    end;
                                  hp2 := hp3;
                                end;
                              if DoOptimisation then
                                begin
                                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                  if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                                    { Flags are still in use - don't optimise }
                                    DoOptimisation := False;
                                end;
                            end
                          else
                            DoOptimisation := False;
                        end;
                      if DoOptimisation then
                        begin
{$ifdef x86_64}
                          { OR only supports 32-bit sign-extended constants for 64-bit
                            instructions, so compensate for this if the constant is
                            encoded as a value greater than or equal to 2^31 }
                          if (taicpu(hp1).opsize = S_Q) and
                            (taicpu(hp1).oper[0]^.typ = top_const) and
                            (taicpu(hp1).oper[0]^.val >= $80000000) then
                            taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val or $FFFFFFFF00000000;
{$endif x86_64}
                          DebugMsg(SPeepholeOptimization + 'MOV 0 / OR -> MOV', p);
                          taicpu(hp1).opcode := A_MOV;
                          RemoveCurrentP(p);
                          Result := True;
                          Exit;
                        end;
                    end;
                end
              else if
                { oper[0] is a reference }
                (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) then
                begin
                  if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
                    begin
                      if ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
                           (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
                          ) or
                          (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
                           (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
                          )
                        ) and
                        not RegModifiedBetween(Taicpu(hp1).oper[1]^.reg, p, hp1) then
                        { mov ref,reg1
                          lea (reg1,reg2),reg2
                          to
                          add ref,reg2 }
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          { If the flags register is in use, don't change the instruction to an
                            ADD otherwise this will scramble the flags. [Kit] }
                          if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                            { reg1 may not be used afterwards }
                            not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
                            begin
                              Taicpu(hp1).opcode:=A_ADD;
                              Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
                              DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
                              RemoveCurrentp(p);
                              result:=true;
                              exit;
                            end;
                        end;
                      { If the LEA instruction can be converted into an arithmetic instruction,
                        it may be possible to then fold it in the next optimisation. }
                      if ConvertLEA(taicpu(hp1)) then
                        Include(OptsToCheck, aoc_ForceNewIteration);
                    end;
                  {
                    mov ref,reg0
                    <op> reg0,reg1
                    dealloc reg0
                    to
                    <op> ref,reg1
                  }
                  if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                    (taicpu(hp1).oper[0]^.reg = p_TargetReg) and
                    MatchInstruction(hp1, [A_AND, A_OR, A_XOR, A_ADD, A_SUB, A_CMP, A_TEST, A_CMOVcc, A_BSR, A_BSF, A_POPCNT, A_LZCNT], [taicpu(p).opsize]) and
                    not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, p_TargetReg) and
                    not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1) then
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                      if not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) then
                        begin
                          taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
                          { loadref increases the reference count, so decrement it again }
                          if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
                            taicpu(p).oper[0]^.ref^.symbol.decrefs;
                          if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
                            taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
                          DebugMsg(SPeepholeOptimization + 'MovOp2Op done',hp1);
                          { See if we can remove the allocation of reg0 }
                          if not RegInRef(p_TargetReg, taicpu(p).oper[0]^.ref^) then
                            TryRemoveRegAlloc(p_TargetReg, p, hp1);
                          RemoveCurrentp(p);
                          Result:=true;
                          exit;
                        end;
                    end;
                end;
              { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
                overwrites the original destination register, e.g.
                  movl   ###,%reg2d
                  movslq ###,%reg2q (### doesn't have to be the same as the first one)
                In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
              }
              if MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
                (taicpu(hp1).oper[1]^.typ = top_reg) and
                Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
                begin
                  if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
                    begin
                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
                        case taicpu(p).oper[0]^.typ of
                          top_const:
                            { We have something like:
                                movb   $x, %regb
                                movzbl %regb,%regd
                              Change to:
                                movl   $x, %regd
                            }
                            begin
                              case taicpu(hp1).opsize of
                                S_BW:
                                  begin
                                    convert_mov_value(A_MOVSX, $FF);
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
                                    taicpu(p).opsize := S_W;
                                  end;
                                S_BL:
                                  begin
                                    convert_mov_value(A_MOVSX, $FF);
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                                    taicpu(p).opsize := S_L;
                                  end;
                                S_WL:
                                  begin
                                    convert_mov_value(A_MOVSX, $FFFF);
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                                    taicpu(p).opsize := S_L;
                                  end;
{$ifdef x86_64}
                                S_BQ:
                                  begin
                                    convert_mov_value(A_MOVSX, $FF);
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                    taicpu(p).opsize := S_Q;
                                  end;
                                S_WQ:
                                  begin
                                    convert_mov_value(A_MOVSX, $FFFF);
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                    taicpu(p).opsize := S_Q;
                                  end;
                                S_LQ:
                                  begin
                                    convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
                                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                    taicpu(p).opsize := S_Q;
                                  end;
{$endif x86_64}
                                else
                                  { If hp1 was a MOV instruction, it should have been
                                    optimised already }
                                  InternalError(2020021001);
                              end;
                              DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
                              RemoveInstruction(hp1);
                              Result := True;
                              Exit;
                            end;
                          top_ref:
                            begin
                              { We have something like:
                                  movb   mem, %regb
                                  movzbl %regb,%regd
                                Change to:
                                  movzbl mem, %regd
                              }
                              if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
                                begin
                                  DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
                                  taicpu(p).opcode := taicpu(hp1).opcode;
                                  taicpu(p).opsize := taicpu(hp1).opsize;
                                  taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
                                  RemoveInstruction(hp1);
                                  Result := True;
                                  Exit;
                                end;
                            end;
                          else
                            if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
                              { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
                              Exit;
                        end;
                    end
                  { The RegInOp check makes sure that "movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
                    and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
                    optimised }
                  else
                    begin
                      DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
                      RemoveCurrentP(p);
                      Result := True;
                      Exit;
                    end;
                end;
  3745. if (taicpu(hp1).opcode = A_MOV) and
  3746. (
  3747. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
  3748. {$ifdef x86_64}
  3749. or (
  3750. { Permit zero extension from 32- to 64-bit when writing
  3751. a constant (it will be checked to see if it fits into
  3752. a signed 32-bit integer) }
  3753. (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
  3754. (
  3755. { Valid situations... writing an unsigned 32-bit
  3756. immediate, or the destination is a 64-bit register }
  3757. (taicpu(p).oper[0]^.typ = top_const) or
  3758. (taicpu(hp1).oper[1]^.typ = top_reg)
  3759. ) and
  3760. (taicpu(hp1).oper[0]^.typ = top_reg) and
  3761. SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[0]^.reg)
  3762. )
  3763. {$endif x86_64}
  3764. ) then
  3765. begin
  3766. { Remember that p_TargetReg contains taicpu(p).oper[1]^.reg }
  3767. TransferUsedRegs(TmpUsedRegs);
  3768. UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
  3769. { we have
  3770. mov x, %treg
  3771. mov %treg, y
  3772. }
  3773. if not(RegInOp(p_TargetReg, taicpu(hp1).oper[1]^)) then
  3774. if not(RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs)) then
  3775. begin
  3776. { we've got
  3777. mov x, %treg
  3778. mov %treg, y
  3779. with %treg is not used after }
  3780. case taicpu(p).oper[0]^.typ Of
  3781. { top_reg is covered by DeepMOVOpt }
  3782. top_const:
  3783. begin
  3784. { change
  3785. mov const, %treg
  3786. mov %treg, y
  3787. to
  3788. mov const, y
  3789. }
  3790. {$ifdef x86_64}
  3791. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  3792. (
  3793. { For 32-to-64-bit zero-extension, the immediate
  3794. must be between 0 and 2^31 - 1}
  3795. (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
  3796. ((taicpu(p).oper[0]^.val>=0) and (taicpu(p).oper[0]^.val<=high(longint)))
  3797. ) or
  3798. (
  3799. not ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q)) and
  3800. (
  3801. (taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))
  3802. )
  3803. ) then
  3804. {$endif x86_64}
  3805. begin
  3806. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  3807. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done', hp1);
  3808. RemoveCurrentP(p);
  3809. Result := True;
  3810. Exit;
  3811. end;
  3812. end;
  3813. top_ref:
  3814. case taicpu(hp1).oper[1]^.typ of
  3815. top_reg:
  3816. { change
  3817. mov mem, %treg
  3818. mov %treg, %reg
  3819. to
  3820. mov mem, %reg"
  3821. }
                                  if not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1) then
                                    begin
                                      {$ifdef x86_64}
                                      { If zero extending from 32-bit to 64-bit,
                                        we have to make sure the replaced
                                        register is the right size }
                                      taicpu(p).loadreg(1, newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),getsubreg(p_TargetReg)));
                                      {$else}
                                      taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
                                      {$endif x86_64}
                                      DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3a done', p);
                                      AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
                                      RemoveInstruction(hp1);
                                      Result := True;
                                      Exit;
                                    end
                                  else if
                                    { Make sure that if a reference is used, its
                                      registers are not modified in between }
                                    not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1) then
                                    begin
                                      if (taicpu(p).oper[0]^.ref^.base <> NR_NO){$ifdef x86_64} and (taicpu(p).oper[0]^.ref^.base <> NR_RIP){$endif x86_64} then
                                        AllocRegBetween(taicpu(p).oper[0]^.ref^.base, p, hp1, UsedRegs);
                                      if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and (taicpu(p).oper[0]^.ref^.index <> taicpu(p).oper[0]^.ref^.base) then
                                        AllocRegBetween(taicpu(p).oper[0]^.ref^.index, p, hp1, UsedRegs);
                                      taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
                                      if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
                                        taicpu(p).oper[0]^.ref^.symbol.decrefs;
                                      if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
                                        taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
                                      DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done', hp1);
                                      RemoveCurrentP(p);
                                      Result := True;
                                      Exit;
                                    end;
                                top_ref:
                                  if not RegInRef(p_TargetReg, taicpu(p).oper[0]^.ref^) then
                                    begin
                                      {$ifdef x86_64}
                                      { Look for the following to simplify:
                                          mov x(mem1), %reg
                                          mov %reg, y(mem2)
                                          mov x+8(mem1), %reg
                                          mov %reg, y+8(mem2)
                                        Change to:
                                          movdqu x(mem1), %xmmreg
                                          movdqu %xmmreg, y(mem2)
                                        ...but only as long as the memory blocks don't overlap
                                      }
                                      SourceRef := taicpu(p).oper[0]^.ref^;
                                      TargetRef := taicpu(hp1).oper[1]^.ref^;
                                      if (taicpu(p).opsize = S_Q) and
                                        not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) and
                                        GetNextInstruction(hp1, hp2) and
                                        MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
                                        MatchOpType(taicpu(hp2), top_ref, top_reg) then
                                        begin
                                          { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
                                          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                                          Inc(SourceRef.offset, 8);
                                          if UseAVX then
                                            begin
                                              MovAligned := A_VMOVDQA;
                                              MovUnaligned := A_VMOVDQU;
                                            end
                                          else
                                            begin
                                              MovAligned := A_MOVDQA;
                                              MovUnaligned := A_MOVDQU;
                                            end;
                                          if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) and
                                            not RefsMightOverlap(taicpu(p).oper[0]^.ref^, TargetRef, 16) then
                                            begin
                                              UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                              Inc(TargetRef.offset, 8);
                                              if GetNextInstruction(hp2, hp3) and
                                                MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
                                                MatchOpType(taicpu(hp3), top_reg, top_ref) and
                                                (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                                                RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                                                not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                                                begin
                                                  NewMMReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
                                                  if NewMMReg <> NR_NO then
                                                    begin
                                                      { Remember that the offsets are 8 ahead }
                                                      if ((SourceRef.offset mod 16) = 8) and
                                                        (
                                                          { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
                                                          (SourceRef.base = current_procinfo.framepointer) or
                                                          ((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
                                                        ) then
                                                        taicpu(p).opcode := MovAligned
                                                      else
                                                        taicpu(p).opcode := MovUnaligned;
                                                      taicpu(p).opsize := S_XMM;
                                                      taicpu(p).oper[1]^.reg := NewMMReg;
                                                      if ((TargetRef.offset mod 16) = 8) and
                                                        (
                                                          { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
                                                          (TargetRef.base = current_procinfo.framepointer) or
                                                          ((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
                                                        ) then
                                                        taicpu(hp1).opcode := MovAligned
                                                      else
                                                        taicpu(hp1).opcode := MovUnaligned;
                                                      taicpu(hp1).opsize := S_XMM;
                                                      taicpu(hp1).oper[0]^.reg := NewMMReg;
                                                      DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(NewMMReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)', p);
                                                      RemoveInstruction(hp2);
                                                      RemoveInstruction(hp3);
                                                      Result := True;
                                                      Exit;
                                                    end;
                                                end;
                                            end
                                          else
                                            begin
                                              { See if the next references are 8 less rather than 8 greater }
                                              Dec(SourceRef.offset, 16); { -8 the other way }
                                              if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
                                                begin
                                                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                                  Dec(TargetRef.offset, 8); { Only 8, not 16, as it wasn't incremented unlike SourceRef }
                                                  if not RefsMightOverlap(SourceRef, TargetRef, 16) and
                                                    GetNextInstruction(hp2, hp3) and
                                                    MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
                                                    MatchOpType(taicpu(hp3), top_reg, top_ref) and
                                                    (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                                                    RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                                                    not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                                                    begin
                                                      NewMMReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
                                                      if NewMMReg <> NR_NO then
                                                        begin
                                                          { hp2 and hp3 are the starting offsets, so mod = 0 this time }
                                                          if ((SourceRef.offset mod 16) = 0) and
                                                            (
                                                              { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
                                                              (SourceRef.base = current_procinfo.framepointer) or
                                                              ((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
                                                            ) then
                                                            taicpu(hp2).opcode := MovAligned
                                                          else
                                                            taicpu(hp2).opcode := MovUnaligned;
                                                          taicpu(hp2).opsize := S_XMM;
                                                          taicpu(hp2).oper[1]^.reg := NewMMReg;
                                                          if ((TargetRef.offset mod 16) = 0) and
                                                            (
                                                              { Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
                                                              (TargetRef.base = current_procinfo.framepointer) or
                                                              ((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
                                                            ) then
                                                            taicpu(hp3).opcode := MovAligned
                                                          else
                                                            taicpu(hp3).opcode := MovUnaligned;
                                                          taicpu(hp3).opsize := S_XMM;
                                                          taicpu(hp3).oper[0]^.reg := NewMMReg;
                                                          DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(NewMMReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 2)', p);
                                                          RemoveInstruction(hp1);
                                                          RemoveCurrentP(p);
                                                          Result := True;
                                                          Exit;
                                                        end;
                                                    end;
                                                end;
                                            end;
                                        end;
                                      {$endif x86_64}
                                    end;
                                else
                                  { The write target should be a reg or a ref }
                                  InternalError(2021091601);
                              end;
                            else
                              ;
                          end;
                        end
                      else if (taicpu(p).oper[0]^.typ = top_const) and
                        { %treg is used afterwards, but all eventualities other
                          than the first MOV instruction being a constant are
                          covered by DeepMOVOpt, so only check for that }
                        (
                          { For MOV operations, a size saving is only made if the register/const is byte-sized }
                          not (cs_opt_size in current_settings.optimizerswitches) or
                          (taicpu(hp1).opsize = S_B)
                        ) and
                        (
                          (taicpu(hp1).oper[1]^.typ=top_reg) or
                          (
                            { For 32-to-64-bit zero-extension, the immediate
                              must be between 0 and 2^31 - 1 }
                            (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
                            ((taicpu(p).oper[0]^.val>=0) and (taicpu(p).oper[0]^.val<=high(longint)))
                          ) or
                          (
                            not ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q)) and
                            (
                              (taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))
                            )
                          )
                        ) then
                        begin
                          DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
                          taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
                          Include(OptsToCheck, aoc_ForceNewIteration);
                        end;
                  end;
                Break;
              end;
          end;
      if taicpu(p).oper[0]^.typ = top_reg then
        begin
          { oper[1] is a reference }
          { Saves on a large number of dereferences }
          p_SourceReg := taicpu(p).oper[0]^.reg;
          if NotFirstIteration and (cs_opt_level3 in current_settings.optimizerswitches) then
            GetNextInstruction_p := GetNextInstructionUsingReg(p, hp1, p_SourceReg)
          else
            GetNextInstruction_p := GetNextInstruction(p, hp1);
          if GetNextInstruction_p and (hp1.typ = ait_instruction) then
            begin
              if taicpu(p).oper[1]^.typ = top_reg then
                begin
                  p_TargetReg := taicpu(p).oper[1]^.reg;
                  { Change:
                      movl %reg1,%reg2
                      ...
                      movl x(%reg1),%reg1 (If something other than %reg1 is written to, DeepMOVOpt would have caught it)
                      ...
                      movl x(%reg2),%regX (%regX can be %reg2 or something else)
                    To:
                      movl %reg1,%reg2 (if %regX = %reg2, then remove this instruction)
                      ...
                      movl x(%reg1),%reg1
                      ...
                      movl %reg1,%regX
                  }
                  if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
                    (taicpu(hp1).oper[0]^.typ = top_ref) { The other operand will be a register } and
                    (taicpu(hp1).oper[1]^.reg = p_SourceReg) and
                    RegInRef(p_SourceReg, taicpu(hp1).oper[0]^.ref^) and
                    not RegModifiedBetween(p_TargetReg, p, hp1) and
                    GetNextInstructionUsingReg(hp1, hp2, p_TargetReg) and
                    MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
                    (taicpu(hp2).oper[0]^.typ = top_ref) { The other operand will be a register } and
                    not RegModifiedBetween(p_SourceReg, hp1, hp2) then
                    begin
                      SourceRef := taicpu(hp2).oper[0]^.ref^;
                      if RegInRef(p_TargetReg, SourceRef) and
                        { If %reg1 also appears in the second reference, then it will
                          not refer to the same memory block as the first reference }
                        not RegInRef(p_SourceReg, SourceRef) then
                        begin
                          { Check to see if the references match if %reg2 is changed to %reg1 }
                          if SourceRef.base = p_TargetReg then
                            SourceRef.base := p_SourceReg;
                          if SourceRef.index = p_TargetReg then
                            SourceRef.index := p_SourceReg;
                          { RefsEqual also checks to ensure both references are non-volatile }
                          if RefsEqual(taicpu(hp1).oper[0]^.ref^, SourceRef) then
                            begin
                              taicpu(hp2).loadreg(0, p_SourceReg);
                              TransferUsedRegs(TmpUsedRegs);
                              UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                              { Make sure the register is allocated between these instructions
                                even though it doesn't change value, since it may cause
                                optimisations on a later pass to behave incorrectly. (Fixes #41155) }
                              AllocRegBetween(p_SourceReg, hp1, hp2, TmpUsedRegs);
                              DebugMsg(SPeepholeOptimization + 'Optimised register duplication and memory read (MovMovMov2MovMovMov)', p);
                              Result := True;
                              if taicpu(hp2).oper[1]^.reg = p_TargetReg then
                                begin
                                  DebugMsg(SPeepholeOptimization + 'Mov2Nop 5a done', p);
                                  RemoveCurrentP(p);
                                  Exit;
                                end
                              else
                                begin
                                  if not RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs) then
                                    begin
                                      DebugMsg(SPeepholeOptimization + 'Mov2Nop 5b done', p);
                                      RemoveCurrentP(p);
                                      Exit;
                                    end;
                                end;
                              { If we reach this point, p and hp1 weren't actually modified,
                                so we can do a bit more work on this pass }
                            end;
                        end;
                    end;
                end;
            end;
        end;
      GetNextInstruction_p:=GetNextInstruction(p, hp1);
      { All the next optimisations require a next instruction }
      if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
        Exit;
      { Change:
          movl/q (ref), %reg
          movd/q %reg, %xmm0
          (dealloc %reg)
        To:
          movd/q (ref), %xmm0
      }
      if MatchOpType(taicpu(p),top_ref,top_reg) and
        MatchInstruction(hp1,[A_MOVD,A_VMOVD{$ifdef x86_64},A_MOVQ,A_VMOVQ{$endif x86_64}],[]) and
        MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
        (taicpu(hp1).oper[1]^.typ=top_reg) and
        (GetRegType(taicpu(hp1).oper[1]^.reg)=R_MMREGISTER) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
            begin
              taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
              { loadref increases the reference count, so decrement it again }
              if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
                taicpu(p).oper[0]^.ref^.symbol.decrefs;
              if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
                taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
              DebugMsg(SPeepholeOptimization+'Merged MOV and (V)MOVD/(V)MOVQ to eliminate intermediate register (MovMovD/Q2MovD/Q)',p);
              RemoveCurrentP(p,hp1);
              Result:=True;
              Exit;
            end;
        end;
      { Next instruction is also a MOV ? }
      if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
        begin
          if MatchOpType(taicpu(p), top_const, top_ref) and
            MatchOpType(taicpu(hp1), top_const, top_ref) and
            TryConstMerge(p, hp1) then
            begin
              Result := True;
              { In case we have four byte writes in a row, check for 2 more
                right now so we don't have to wait for another iteration of
                pass 1 }
              { If two byte-writes were merged, the opsize is now S_W, not S_B }
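              { Added note (illustrative example, little-endian):
                  movb $0x12,(%eax)
                  movb $0x34,1(%eax)
                  movb $0x56,2(%eax)
                  movb $0x78,3(%eax)
                merges pairwise into two word writes and finally into a single
                  movl $0x78563412,(%eax)
                which is what the extra TryConstMerge calls below attempt to
                achieve in one pass. }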
              case taicpu(p).opsize of
                S_W:
                  begin
                    if GetNextInstruction(p, hp1) and
                      MatchInstruction(hp1, A_MOV, [S_B]) and
                      MatchOpType(taicpu(hp1), top_const, top_ref) and
                      GetNextInstruction(hp1, hp2) and
                      MatchInstruction(hp2, A_MOV, [S_B]) and
                      MatchOpType(taicpu(hp2), top_const, top_ref) and
                      { Try to merge the two bytes }
                      TryConstMerge(hp1, hp2) then
                      { Now try to merge the two words (hp2 will get deleted) }
                      TryConstMerge(p, hp1);
                  end;
                S_L:
                  begin
                    { Though this only really benefits x86_64 and not i386, it
                      gets a potential optimisation done faster and hence
                      reduces the number of times OptPass1MOV is entered }
                    if GetNextInstruction(p, hp1) and
                      MatchInstruction(hp1, A_MOV, [S_W]) and
                      MatchOpType(taicpu(hp1), top_const, top_ref) and
                      GetNextInstruction(hp1, hp2) and
                      MatchInstruction(hp2, A_MOV, [S_W]) and
                      MatchOpType(taicpu(hp2), top_const, top_ref) and
                      { Try to merge the two words }
                      TryConstMerge(hp1, hp2) then
                      { This will always fail on i386, so don't bother
                        calling it unless we're doing x86_64 }
                      {$ifdef x86_64}
                      { Now try to merge the two longwords (hp2 will get deleted) }
                      TryConstMerge(p, hp1)
                      {$endif x86_64}
                      ;
                  end;
                else
                  ;
              end;
              Exit;
            end;
          if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
            (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
            { mov reg1, mem1     or     mov mem1, reg1
              mov mem2, reg2            mov reg2, mem2 }
            begin
              if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
                { mov reg1, mem1     or     mov mem1, reg1
                  mov mem2, reg1            mov reg2, mem1 }
                begin
                  if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                    { Removes the second statement from
                        mov reg1, mem1/reg2
                        mov mem1/reg2, reg1 }
                    begin
                      if taicpu(p).oper[0]^.typ=top_reg then
                        AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                      DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
                      RemoveInstruction(hp1);
                      Result:=true;
                      if (taicpu(p).oper[1]^.typ = top_reg) then
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, p, TmpUsedRegs) then
                            begin
                              { reg2 is no longer in use }
                              DebugMsg(SPeepholeOptimization + 'Mov2Nop 6 done',p);
                              RemoveCurrentP(p);
                            end;
                        end;
                      exit;
                    end
                  else
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                      if (taicpu(p).oper[1]^.typ = top_ref) and
                        { mov reg1, mem1
                          mov mem2, reg1 }
                        (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
                        GetNextInstruction(hp1, hp2) and
                        MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
                        OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
                        OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
                        not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
                        { change to
                            mov reg1, mem1          mov reg1, mem1
                            mov mem2, reg1          cmp reg1, mem2
                            cmp mem1, reg1
                        }
                        begin
                          RemoveInstruction(hp2);
                          taicpu(hp1).opcode := A_CMP;
                          taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
                          taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
                          AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
                          DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
                        end;
                    end;
                end
              else if (taicpu(p).oper[1]^.typ=top_ref) and
                OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
                  taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
                  DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
                end
              else
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  if GetNextInstruction(hp1, hp2) and
                    MatchOpType(taicpu(p),top_ref,top_reg) and
                    MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
                    (taicpu(hp1).oper[1]^.typ = top_ref) and
                    MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
                    MatchOpType(taicpu(hp2),top_ref,top_reg) and
                    RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
                    if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
                      not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
                      { mov mem1, %reg1
                        mov %reg1, mem2
                        mov mem2, reg2
                        to:
                        mov mem1, reg2
                        mov reg2, mem2 }
                      begin
                        AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
                        DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
                        taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
                        taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
                        RemoveInstruction(hp2);
                        Result := True;
                      end
                    {$ifdef i386}
                    { this is enabled for i386 only, as the rules to create the reg sets below
                      are too complicated for x86-64, so this makes this code too error prone
                      on x86-64
                    }
                    else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
                      not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
                      not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
                      { mov mem1, reg1         mov mem1, reg1
                        mov reg1, mem2         mov reg1, mem2
                        mov mem2, reg2         mov mem2, reg1
                        to:                    to:
                        mov mem1, reg1         mov mem1, reg1
                        mov mem1, reg2         mov reg1, mem2
                        mov reg1, mem2
                        or (if mem1 depends on reg1
                        and/or if mem2 depends on reg2)
                        to:
                        mov mem1, reg1
                        mov reg1, mem2
                        mov reg1, reg2
                      }
                      begin
                        taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
                        taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
                        taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
                        taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
                        AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
                        if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
                          (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
                          AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
                        if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
                          (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
                          AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
                      end
                    else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
                      begin
                        taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
                        AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
                      end
                    else
                      begin
                        RemoveInstruction(hp2);
                      end
                    {$endif i386}
                    ;
                end;
            end
          { movl [mem1],reg1
            movl [mem1],reg2
            to
            movl [mem1],reg1
            movl reg1,reg2
          }
          else if not CheckMovMov2MovMov2(p, hp1) and
            { movl const1,[mem1]
              movl [mem1],reg1
              to
              movl const1,reg1
              movl reg1,[mem1]
            }
            MatchOpType(Taicpu(p),top_const,top_ref) and
            MatchOpType(Taicpu(hp1),top_ref,top_reg) and
            (taicpu(p).opsize = taicpu(hp1).opsize) and
            RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
            not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
            begin
              AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
              taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
              taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
              taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
              taicpu(hp1).fileinfo := taicpu(p).fileinfo;
              DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
              Result:=true;
              exit;
            end;
          { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
        end;
      { search further than the next instruction for a mov (as long as it's not a jump) }
      if not is_calljmpuncondret(taicpu(hp1).opcode) and
        { check as much as possible before the expensive GetNextInstructionUsingRegCond call }
        (taicpu(p).oper[1]^.typ = top_reg) and
        (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
        not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
        begin
          { we work with hp2 here, so hp1 can be still used later on when
            checking for GetNextInstruction_p }
          hp3 := hp1;
          { Initialise CrossJump (if it becomes True at any point, it will remain True) }
          CrossJump := (taicpu(hp1).opcode = A_Jcc);
          { Remember that p_TargetReg contains taicpu(p).oper[1]^.reg }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if NotFirstIteration then
            JumpTracking := TLinkedList.Create
          else
            JumpTracking := nil;
          while GetNextInstructionUsingRegCond(hp3,hp2,p_TargetReg,JumpTracking,CrossJump) and
            { GetNextInstructionUsingRegCond only searches one instruction ahead unless -O3 is specified }
            (hp2.typ=ait_instruction) do
            begin
              case taicpu(hp2).opcode of
                A_POP:
                  if MatchOperand(taicpu(hp2).oper[0]^,p_TargetReg) then
                    begin
                      if not CrossJump and
                        not RegUsedBetween(p_TargetReg, p, hp2) then
                        begin
                          { We can remove the original MOV since the register
                            wasn't used between it and its popping from the stack }
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 3c done',p);
                          RemoveCurrentp(p, hp1);
                          Result := True;
                          JumpTracking.Free;
                          Exit;
                        end;
                      { Can't go any further }
                      Break;
                    end;
                A_MOV:
                  if MatchOperand(taicpu(hp2).oper[0]^,p_TargetReg) and
                    ((taicpu(p).oper[0]^.typ=top_const) or
                      ((taicpu(p).oper[0]^.typ=top_reg) and
                        not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2))
                      )
                    ) then
                    begin
                      { we have
                          mov x, %treg
                          mov %treg, y
                      }
                      { We don't need to call UpdateUsedRegs for every instruction between
                        p and hp2 because the register we're concerned about will not
                        become deallocated (otherwise GetNextInstructionUsingReg would
                        have stopped at an earlier instruction). [Kit] }
                      TempRegUsed :=
                        CrossJump { Assume the register is in use if it crossed a conditional jump } or
                        RegReadByInstruction(p_TargetReg, hp3) or
                        RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs);
                      case taicpu(p).oper[0]^.typ Of
                        top_reg:
                          begin
                            { change
                                mov %reg, %treg
                                mov %treg, y
                              to
                                mov %reg, y
                            }
                            p_SourceReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
                            RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
                            if MatchOperand(taicpu(hp2).oper[1]^, p_SourceReg) then
                              begin
                                { %reg = y - remove hp2 completely (doing it here instead of relying on
                                  the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
                                if TempRegUsed then
                                  begin
                                    DebugMsg(SPeepholeOptimization + debug_regname(p_SourceReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b)',hp2);
                                    AllocRegBetween(p_SourceReg, p, hp2, UsedRegs);
                                    { Set the start of the next GetNextInstructionUsingRegCond search
                                      to start at the entry right before hp2 (which is about to be removed) }
                                    hp3 := tai(hp2.Previous);
                                    RemoveInstruction(hp2);
                                    Include(OptsToCheck, aoc_ForceNewIteration);
                                    { See if there's more we can optimise }
                                    Continue;
                                  end
                                else
                                  begin
                                    RemoveInstruction(hp2);
                                    { We can remove the original MOV too }
                                    DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
                                    RemoveCurrentP(p, hp1);
                                    Result:=true;
                                    JumpTracking.Free;
                                    Exit;
                                  end;
                              end
                            else
                              begin
                                AllocRegBetween(p_SourceReg, p, hp2, UsedRegs);
                                taicpu(hp2).loadReg(0, p_SourceReg);
                                DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(p_SourceReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a)',hp2);
                                { Check to see if the register also appears in the reference }
                                if (taicpu(hp2).oper[1]^.typ = top_ref) then
                                  ReplaceRegisterInRef(taicpu(hp2).oper[1]^.ref^, p_TargetReg, p_SourceReg);
                                { ReplaceRegisterInRef won't actually replace the register if it's a different size }
                                if not RegInOp(p_TargetReg, taicpu(hp2).oper[1]^) then
                                  begin
                                    { Don't remove the first instruction if the temporary register is in use }
                                    if not TempRegUsed then
                                      begin
                                        DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
                                        RemoveCurrentP(p, hp1);
                                        Result:=true;
                                        JumpTracking.Free;
                                        Exit;
                                      end;
                                    { No need to set Result to True here. If there's another instruction later
                                      on that can be optimised, it will be detected when the main Pass 1 loop
                                      reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
                                    hp3 := hp2;
                                    Continue;
                                  end;
                              end;
                          end;
                        top_const:
                          if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
                            begin
                              { change
                                  mov const, %treg
                                  mov %treg, y
                                to
                                  mov const, y
                              }
                              if (taicpu(hp2).oper[1]^.typ=top_reg) or
                                ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
                                begin
                                  RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
                                  taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
                                  if TempRegUsed then
                                    begin
                                      { Don't remove the first instruction if the temporary register is in use }
                                      DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
                                      { No need to set Result to True. If there's another instruction later on
                                        that can be optimised, it will be detected when the main Pass 1 loop
                                        reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
                                    end
                                  else
                                    begin
                                      DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
                                      RemoveCurrentP(p, hp1);
                                      Result:=true;
                                      Exit;
                                    end;
                                end;
                            end;
                        else
                          Internalerror(2019103001);
                      end;
                    end
                  else if MatchOperand(taicpu(hp2).oper[1]^, p_TargetReg) then
                    begin
                      if not CrossJump and
                        not RegUsedBetween(p_TargetReg, p, hp2) and
                        not RegReadByInstruction(p_TargetReg, hp2) then
                        begin
                          { Register is not used before it is overwritten }
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 3a done',p);
                          RemoveCurrentp(p, hp1);
                          Result := True;
                          Exit;
                        end;
                      if (taicpu(p).oper[0]^.typ = top_const) and
                        (taicpu(hp2).oper[0]^.typ = top_const) then
                        begin
                          if taicpu(p).oper[0]^.val = taicpu(hp2).oper[0]^.val then
                            begin
                              { Same value - register hasn't changed }
                              DebugMsg(SPeepholeOptimization + 'Mov2Nop 2 done', hp2);
                              RemoveInstruction(hp2);
                              Include(OptsToCheck, aoc_ForceNewIteration);
                              { See if there's more we can optimise }
                              Continue;
                            end;
                        end;
                      {$ifdef x86_64}
                    end
                  { Change:
                      movl %reg1l,%reg2l
                      ...
                      movq %reg2q,%reg3q (%reg1 <> %reg3)
                    To:
                      movl %reg1l,%reg2l
                      ...
                      movl %reg1l,%reg3l (Upper 32 bits of %reg3q will be zero)
                    If %reg1 = %reg3, convert to:
                      movl %reg1l,%reg2l
                      ...
                      andl %reg1l,%reg1l
                  }
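                  { Added note (an inferred rationale, not from the original
                    authors): for the %reg1 = %reg3 case, "andl %reg1l,%reg1l"
                    is used rather than "movl %reg1l,%reg1l" because a
                    same-register MOV risks being stripped as a no-op by later
                    passes, which would lose the implicit zeroing of the upper
                    32 bits. }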
                  else if (taicpu(p).opsize = S_L) and MatchInstruction(hp2,A_MOV,[S_Q]) and
                    (taicpu(p).oper[0]^.typ = top_reg) and
                    MatchOpType(taicpu(hp2), top_reg, top_reg) and
                    SuperRegistersEqual(p_TargetReg, taicpu(hp2).oper[0]^.reg) and
                    not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2) then
                    begin
                      TempRegUsed :=
                        CrossJump { Assume the register is in use if it crossed a conditional jump } or
                        RegReadByInstruction(p_TargetReg, hp3) or
                        RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs);
                      taicpu(hp2).opsize := S_L;
                      taicpu(hp2).loadreg(0, taicpu(p).oper[0]^.reg);
                      setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
                      AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp2, UsedRegs);
                      if (taicpu(p).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) then
                        begin
                          { %reg1 = %reg3 }
                          DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlAndl 2)', hp2);
                          taicpu(hp2).opcode := A_AND;
                        end
                      else
                        begin
                          { %reg1 <> %reg3 }
                          DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlMovl 2)', hp2);
                        end;
                      if not TempRegUsed then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 8a done', p);
                          RemoveCurrentP(p, hp1);
                          Result := True;
                          Exit;
                        end
                      else
                        begin
                          { Initial instruction wasn't actually changed }
                          Include(OptsToCheck, aoc_ForceNewIteration);
                          { if %reg1 = %reg3, don't do the long-distance lookahead that
                            appears below since %reg1 has technically changed }
                          if taicpu(hp2).opcode = A_AND then
                            Break;
                        end;
                      {$endif x86_64}
                    end
                  else if (taicpu(hp2).oper[0]^.typ = top_ref) and
                    GetNextInstruction(hp2, hp4) and
                    (hp4.typ = ait_instruction) and (taicpu(hp4).opcode = A_MOV) then
                    { Optimise the following first:
                        movl [mem1],reg1
                        movl [mem1],reg2
                      to
                        movl [mem1],reg1
                        movl reg1,reg2
                      If [mem1] contains the target register and reg1 is the
                      source register, this optimisation will get missed
                      and produce less efficient code later on.
                    }
                    if CheckMovMov2MovMov2(hp2, hp4) then
                      { Initial instruction wasn't actually changed }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
                  if MatchOpType(taicpu(hp2), top_reg, top_reg) and
                    MatchOperand(taicpu(hp2).oper[0]^, p_TargetReg) and
                    SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, p_TargetReg) then
                    begin
                      {
                        Change from:
                          mov    ###, %reg
                          ...
                          movs/z %reg,%reg (Same register, just different sizes)
                        To:
                          movs/z ###, %reg (Longer version)
                          ...
                          (remove)
                      }
                      DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
                      taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
                      { Keep the first instruction as mov if ### is a constant }
                      if taicpu(p).oper[0]^.typ = top_const then
                        taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
                      else
                        begin
                          taicpu(p).opcode := taicpu(hp2).opcode;
                          taicpu(p).opsize := taicpu(hp2).opsize;
                        end;
                      DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
                      AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
                      RemoveInstruction(hp2);
                      Result := True;
                      JumpTracking.Free;
                      Exit;
                    end;
                else
                  { Move down to the if-block below };
              end;
              { Also catches MOV/S/Z instructions that aren't modified }
              if taicpu(p).oper[0]^.typ = top_reg then
                begin
                  p_SourceReg := taicpu(p).oper[0]^.reg;
                  if
                    not RegModifiedByInstruction(p_SourceReg, hp3) and
                    not RegModifiedBetween(p_SourceReg, hp3, hp2) and
                    DeepMOVOpt(taicpu(p), taicpu(hp2)) then
                    begin
                      Result := True;
                      { Just in case something didn't get modified (e.g. an
                        implicit register). Also, if it does read from this
                        register, then there's no longer an advantage to
                        changing the register on subsequent instructions.}
                      if not RegReadByInstruction(p_TargetReg, hp2) then
                        begin
                          { If a conditional jump was crossed, do not delete
                            the original MOV no matter what }
                          if not CrossJump and
                            { RegEndOfLife returns True if the register is
                              deallocated before the next instruction or has
                              been loaded with a new value }
                            RegEndOfLife(p_TargetReg, taicpu(hp2)) then
                            begin
                              { We can remove the original MOV }
                              DebugMsg(SPeepholeOptimization + 'Mov2Nop 3b done',p);
                              RemoveCurrentp(p, hp1);
                              JumpTracking.Free;
                              Result := True;
                              Exit;
                            end;
                          if not RegModifiedByInstruction(p_TargetReg, hp2) then
                            begin
                              { See if there's more we can optimise }
                              hp3 := hp2;
                              Continue;
                            end;
                        end;
                    end;
                end;
              { Break out of the while loop under normal circumstances }
              Break;
            end;
          JumpTracking.Free;
        end;
      if (taicpu(p).oper[1]^.typ = top_reg) and
        (
          {$ifndef x86_64}
          (
            { See if we can catch:
                mov ###,%ecx (or any of the ecx family)
                ...
                shl %cl,### (or another shift or rotate instruction)
              And change to...
                mov ###,%cl (using only %cl)
                ...
                shl %cl,###
            }
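            { Added note: variable-count shifts and rotates on x86 can only
              take their count from %cl, which is why this transformation is
              tied specifically to the ECX register family. }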
            (taicpu(p).opsize <> S_B) and
            (getsupreg(taicpu(p).oper[1]^.reg) = RS_ECX) and
            (
              (taicpu(p).oper[0]^.typ <> top_reg) or
              (getsupreg(taicpu(p).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_EDX])
            )
          ) or
          {$endif not x86_64}
          (
            { Tends to be a very slow operation that is rarely successful,
              so only enable if it's definitely not impossible }
            (aoc_MovAnd2Mov_3 in OptsToCheck) and
            (taicpu(p).opsize = S_L)
          )
        ) and
        GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
        (hp2.typ = ait_instruction) then
        begin
          if (taicpu(hp2).opcode = A_AND) then
            begin
              if (MatchOpType(taicpu(hp2),top_const,top_reg) or
                (MatchOpType(taicpu(hp2),top_reg,top_reg) and
                  MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
                ) then
                begin
                  if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
                    begin
                      if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
                        ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
                        begin
                          { Optimize out:
                              mov x, %reg
                              and ffffffffh, %reg
                          }
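                          { Added note (assumption: the operand sizes here make
                            the AND a no-op on the value): ANDing a register with
                            all ones, or with itself, leaves its value unchanged,
                            so the instruction is redundant apart from its effect
                            on the flags. }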
                          DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
                          RemoveInstruction(hp2);
                          Result:=true;
                          exit;
                        end;
                    end;
                end;
              {$ifndef x86_64} { This is handled during the code generation stage under x86_64 }
            end
          else if
            { Need to check again in case we entered this block because aoc_MovAnd2Mov_3 was set }
            (getsupreg(taicpu(p).oper[1]^.reg) = RS_ECX) and
            MatchInstruction(hp2, [A_SHL, A_SHR, A_SHLD, A_SHRD, A_SAR, A_ROR, A_ROL, A_RCR, A_RCL], []) and
            (taicpu(hp2).oper[0]^.typ = top_reg) { Will be %cl } and
            (
              (
                (taicpu(hp2).oper[1]^.typ = top_reg) and
                (getsupreg(taicpu(hp2).oper[1]^.reg) <> RS_ECX) and
                (
                  (taicpu(hp2).ops = 2) or
                  (
                    { For SHLD/SHRD }
                    (
                      (taicpu(hp2).oper[2]^.typ = top_reg) and
                      (getsupreg(taicpu(hp2).oper[2]^.reg) <> RS_ECX)
                    ) or (
                      (taicpu(hp2).oper[2]^.typ = top_ref) and
                      not RegInRef(NR_ECX, taicpu(hp2).oper[2]^.ref^)
                    )
                  )
                )
              ) or (
                (taicpu(hp2).oper[1]^.typ = top_ref) and
                not RegInRef(NR_ECX, taicpu(hp2).oper[1]^.ref^)
              )
            ) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegsBetween(TmpUsedRegs, p, hp2);
              if not RegUsedAfterInstruction(NR_ECX, hp2, TmpUsedRegs) then
                begin
                  DebugMsg(SPeepholeOptimization + 'Resized mov' + debug_opsize2str(taicpu(p).opsize) + ' to 8-bit to match ' + debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize) + ' instruction (MovOp2MovOp 1)', p);
                  taicpu(p).opsize := S_B;
                  setsubreg(taicpu(p).oper[1]^.reg, R_SUBL);
                  case taicpu(p).oper[0]^.typ of
                    top_reg:
                      setsubreg(taicpu(p).oper[0]^.reg, R_SUBL);
                    top_const:
                      if (taicpu(p).oper[0]^.val < -128) or (taicpu(p).oper[0]^.val > 127) then
                        taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val and $FF;
                    else
                      { i.e. a reference, which doesn't change };
                  end;
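                  { Added note (illustrative): for example,
                      movl $300,%ecx ... shll %cl,%reg
                    becomes
                      movb $44,%cl ... shll %cl,%reg
                    since only %cl is ever consumed as the shift count
                    (300 and $FF = 44). }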
                  Result := True;
                  Exit;
                end;
              {$endif not x86_64}
            end;
        end;
      { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
        x >= RetOffset) as it doesn't do anything (it writes either to a
        parameter or to the temporary storage room for the function
        result)
      }
      if IsExitCode(hp1) and
        (taicpu(p).oper[1]^.typ = top_ref) and
        (taicpu(p).oper[1]^.ref^.index = NR_NO) and
        (
          (
            (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
            not (
              assigned(current_procinfo.procdef.funcretsym) and
              (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
            )
          ) or
          { Also discard writes to the stack that are below the base pointer,
            as this is temporary storage rather than a function result on the
            stack, say. }
          (
            (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
            (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
          )
        ) then
        begin
          RemoveCurrentp(p, hp1);
          DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
          RemoveLastDeallocForFuncRes(p);
          Result:=true;
          exit;
        end;
      if MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) then
        begin
          if MatchOpType(taicpu(p),top_reg,top_ref) and
            (taicpu(hp1).oper[1]^.typ = top_ref) and
            RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
            begin
              { change
                  mov      reg1, mem1
                  test/cmp x, mem1
                to
                  mov      reg1, mem1
                  test/cmp x, reg1
              }
              taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
              DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
              AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
              Result := True;
              Exit;
            end;
          if DoMovCmpMemOpt(p, hp1) then
            begin
              Result := True;
              Exit;
            end;
        end;
      if (taicpu(p).oper[1]^.typ = top_reg) and
        (hp1.typ = ait_instruction) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2,A_MOV,[]) and
        (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
        (topsize2memsize[taicpu(hp1).opsize]>=topsize2memsize[taicpu(hp2).opsize]) and
        (
          IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
          {$ifdef x86_64}
          or
          (
            (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
            IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
          )
          {$endif x86_64}
        ) then
        begin
          if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
            (taicpu(hp2).oper[0]^.typ=top_reg) then
            { change movsX/movzX reg/ref, reg2
                     add/sub/or/... reg3/$const, reg2
                     mov reg2, reg/ref
                     dealloc reg2
              to
                     add/sub/or/... reg3/$const, reg/ref }
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                begin
                  { by example:
                      movswl %si,%eax       movswl %si,%eax      p
                      decl   %eax           addl   %edx,%eax     hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                    ->
                      movswl %si,%eax       movswl %si,%eax      p
                      decw   %eax           addw   %edx,%eax     hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                  }
                  DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
                    debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
                    debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
                    debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
                  taicpu(hp1).changeopsize(taicpu(hp2).opsize);
                  {
                    ->
                      movswl %si,%eax       movswl %si,%eax      p
                      decw   %si            addw   %dx,%si       hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                  }
                  case taicpu(hp1).ops of
                    1:
                      begin
                        taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
                        if taicpu(hp1).oper[0]^.typ=top_reg then
                          setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                      end;
                    2:
                      begin
                        taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                        if (taicpu(hp1).oper[0]^.typ=top_reg) and
                          (taicpu(hp1).opcode<>A_SHL) and
                          (taicpu(hp1).opcode<>A_SHR) and
                          (taicpu(hp1).opcode<>A_SAR) then
                          setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                      end;
                    else
                      internalerror(2008042701);
                  end;
                  {
                    ->
                      decw   %si            addw   %dx,%si       p
                  }
                  RemoveInstruction(hp2);
                  RemoveCurrentP(p, hp1);
                  Result:=True;
                  Exit;
                end;
            end;
          if MatchOpType(taicpu(hp2),top_reg,top_reg) and
            not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
            ((topsize2memsize[taicpu(hp1).opsize]<=topsize2memsize[taicpu(hp2).opsize]) or
              { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
              ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
            ) and
            { if ref contains a symbol, we cannot change its size to a smaller size }
            ((taicpu(p).oper[0]^.typ<>top_ref) or (taicpu(p).oper[0]^.ref^.symbol=nil) or
              (topsize2memsize[taicpu(p).opsize]<=topsize2memsize[taicpu(hp2).opsize])
            )
            {$ifdef i386}
            { byte registers of esi, edi, ebp, esp are not available on i386 }
            and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
            and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
            {$endif i386}
            then
            { change movsX/movzX reg/ref, reg2
                     add/sub/or/... regX/$const, reg2
                     mov reg2, reg3
                     dealloc reg2
              to
                     movsX/movzX reg/ref, reg3
                     add/sub/or/... reg3/$const, reg3
            }
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                begin
                  { by example:
                      movswl %si,%eax       movswl %si,%eax      p
                      decl   %eax           addl   %edx,%eax     hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                    ->
                      movswl %si,%eax       movswl %si,%eax      p
                      decw   %eax           addw   %edx,%eax     hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                  }
                  DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
                    debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
                    debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
                    debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
                  { limit size of constants as well to avoid assembler errors, but
                    check opsize to avoid overflow when left shifting the 1 }
                  if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
                    taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
                  {$ifdef x86_64}
                  { Be careful of, for example:
                      movl %reg1,%reg2
                      addl %reg3,%reg2
                      movq %reg2,%reg4
                    This will cause problems if the upper 32-bits of %reg3 or %reg4 are non-zero
                  }
                  if (taicpu(hp1).opsize = S_L) and (taicpu(hp2).opsize = S_Q) then
                    begin
                      taicpu(hp2).changeopsize(S_L);
                      setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBD);
                      setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
                    end;
                  {$endif x86_64}
                  taicpu(hp1).changeopsize(taicpu(hp2).opsize);
                  taicpu(p).changeopsize(taicpu(hp2).opsize);
                  if taicpu(p).oper[0]^.typ=top_reg then
                    setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                  taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
                  {
                    ->
                      movswl %si,%eax       movswl %si,%eax      p
                      decw   %si            addw   %dx,%si       hp1
                      movw   %ax,%si        movw   %ax,%si       hp2
                  }
                  case taicpu(hp1).ops of
                    1:
                      begin
                        taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
                        if taicpu(hp1).oper[0]^.typ=top_reg then
                          setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                      end;
                    2:
                      begin
                        taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                        if (taicpu(hp1).oper[0]^.typ=top_reg) and
                          (taicpu(hp1).opcode<>A_SHL) and
                          (taicpu(hp1).opcode<>A_SHR) and
                          (taicpu(hp1).opcode<>A_SAR) then
                          setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
                      end;
                    else
                      internalerror(2018111801);
                  end;
                  {
                    ->
                      decw   %si            addw   %dx,%si       p
                  }
                  RemoveInstruction(hp2);
                end;
            end;
        end;
      if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
        MatchOperand(Taicpu(p).oper[0]^,0) and
        (Taicpu(p).oper[1]^.typ = top_reg) and
        MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
        MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
        { mov reg1,0
          bts reg1,operand1        -->      mov reg1,operand2
          or  reg1,operand2                 bts reg1,operand1 }
        begin
          Taicpu(hp2).opcode:=A_MOV;
          DebugMsg(SPeepholeOptimization + 'MovBtsOr2MovBts done',hp1);
          asml.remove(hp1);
          insertllitem(hp2,hp2.next,hp1);
          RemoveCurrentp(p, hp1);
          Result:=true;
          exit;
        end;
      if MatchInstruction(hp1,A_SUB,[Taicpu(p).opsize]) and
        MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2,A_CMP,[Taicpu(p).opsize]) and
        MatchOperand(Taicpu(p).oper[0]^,Taicpu(hp2).oper[1]^) and
        MatchOperand(Taicpu(hp1).oper[0]^,Taicpu(hp2).oper[0]^) then
        { change
            mov reg1,reg2
            sub reg3,reg2
            cmp reg3,reg1
          into
            mov reg1,reg2
            sub reg3,reg2
        }
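        { Added note: after "sub reg3,reg2" (reg2 having just been loaded
          from reg1), the flags already reflect reg1 - reg3, which is
          exactly what "cmp reg3,reg1" would recompute, so the CMP is
          redundant. }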
  5088. begin
  5089. DebugMsg(SPeepholeOptimization + 'MovSubCmp2MovSub done',p);
  5090. RemoveInstruction(hp2);
  5091. Result:=true;
  5092. exit;
  5093. end;
  5094. if (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
  5095. MatchInstruction(hp1, A_SHR, A_SAR, [taicpu(p).opsize]) and
  5096. MatchOpType(taicpu(hp1), top_const, top_reg) and
  5097. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  5098. begin
  5099. RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
  5100. {$ifdef x86_64}
  5101. { Convert:
  5102. movq x(ref),%reg64
  5103. shrq y,%reg64
  5104. To:
  5105. movl x+4(ref),%reg32
  5106. shrl y-32,%reg32 (Remove if y = 32)
  5107. }
  5108. if (taicpu(p).opsize = S_Q) and
  5109. (taicpu(hp1).opcode = A_SHR) and
  5110. (taicpu(hp1).oper[0]^.val >= 32) then
  5111. begin
  5112. PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
  5113. 'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
  5114. { Convert to 32-bit }
  5115. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5116. taicpu(p).opsize := S_L;
  5117. Inc(taicpu(p).oper[0]^.ref^.offset, 4);
  5118. PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
  5119. if (taicpu(hp1).oper[0]^.val = 32) then
  5120. begin
  5121. DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
  5122. RemoveInstruction(hp1);
  5123. end
  5124. else
  5125. begin
  5126. { This will potentially open up more arithmetic operations since
  5127. the peephole optimizer now has a big hint that only the lower
  5128. 32 bits are currently in use (and opcodes are smaller in size) }
  5129. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
  5130. taicpu(hp1).opsize := S_L;
  5131. Dec(taicpu(hp1).oper[0]^.val, 32);
  5132. DebugMsg(SPeepholeOptimization + PreMessage +
  5133. '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
  5134. end;
  5135. Result := True;
  5136. Exit;
  5137. end;
  5138. {$endif x86_64}
  5139. { Convert:
  5140. movl x(ref),%reg
  5141. shrl $24,%reg
  5142. To:
  5143. movzbl x+3(ref),%reg
  5144. Do similar things for movl; shrl $16 -> movzwl and movw; shrw $8 -> movzbw
  5145. Also accept sar instead of shr, but convert to movsx instead of movzx
  5146. }
  5147. if taicpu(hp1).opcode = A_SHR then
  5148. MovUnaligned := A_MOVZX
  5149. else
  5150. MovUnaligned := A_MOVSX;
  5151. NewSize := S_NO;
  5152. NewOffset := 0;
  5153. case taicpu(p).opsize of
  5154. S_B:
  5155. { No valid combinations };
  5156. S_W:
  5157. if (taicpu(hp1).oper[0]^.val = 8) then
  5158. begin
  5159. NewSize := S_BW;
  5160. NewOffset := 1;
  5161. end;
  5162. S_L:
  5163. case taicpu(hp1).oper[0]^.val of
  5164. 16:
  5165. begin
  5166. NewSize := S_WL;
  5167. NewOffset := 2;
  5168. end;
  5169. 24:
  5170. begin
  5171. NewSize := S_BL;
  5172. NewOffset := 3;
  5173. end;
  5174. else
  5175. ;
  5176. end;
  5177. {$ifdef x86_64}
  5178. S_Q:
  5179. case taicpu(hp1).oper[0]^.val of
  5180. 32:
  5181. begin
  5182. if taicpu(hp1).opcode = A_SAR then
  5183. begin
  5184. { 32-bit to 64-bit is a distinct instruction }
  5185. MovUnaligned := A_MOVSXD;
  5186. NewSize := S_LQ;
  5187. NewOffset := 4;
  5188. end
  5189. else
  5190. { Should have been handled by MovShr2Mov above }
  5191. InternalError(2022081811);
  5192. end;
  5193. 48:
  5194. begin
  5195. NewSize := S_WQ;
  5196. NewOffset := 6;
  5197. end;
  5198. 56:
  5199. begin
  5200. NewSize := S_BQ;
  5201. NewOffset := 7;
  5202. end;
  5203. else
  5204. ;
  5205. end;
  5206. {$endif x86_64}
  5207. else
  5208. InternalError(2022081810);
  5209. end;
  5210. if (NewSize <> S_NO) and
  5211. (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFF - NewOffset) then
  5212. begin
  5213. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
  5214. 'shr' + debug_opsize2str(taicpu(p).opsize) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> ' +
  5215. debug_op2str(MovUnaligned);
  5216. {$ifdef x86_64}
  5217. if MovUnaligned <> A_MOVSXD then
  5218. { Don't add size suffix for MOVSXD }
  5219. {$endif x86_64}
  5220. PreMessage := PreMessage + debug_opsize2str(NewSize);
  5221. Inc(taicpu(p).oper[0]^.ref^.offset, NewOffset);
  5222. taicpu(p).opcode := MovUnaligned;
  5223. taicpu(p).opsize := NewSize;
  5224. DebugMsg(SPeepholeOptimization + PreMessage + ' ' +
  5225. debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr/Sar2Movx)', p);
  5226. RemoveInstruction(hp1);
  5227. Result := True;
  5228. Exit;
  5229. end;
  5230. end;
  5231. { Backward optimisation shared with OptPass2MOV }
  5232. if FuncMov2Func(p, hp1) then
  5233. begin
  5234. Result := True;
  5235. Exit;
  5236. end;
  5237. end;
  5238. function TX86AsmOptimizer.OptPass1MOVD(var p : tai) : boolean;
  5239. { This function also handles the 64-bit version, MOVQ }
  5240. var
  5241. hp1: tai;
  5242. begin
  5243. Result:=false;
  5244. { Change:
  5245. movd/q %xmm0, %reg
  5246. ...
  5247. movl/q %reg, (ref)
  5248. (dealloc %reg)
  5249. To:
  5250. movd/q %xmm0, (ref)
  5251. }
  5252. if MatchOpType(taicpu(p),top_reg,top_reg) and
  5253. (GetRegType(taicpu(p).oper[0]^.reg)=R_MMREGISTER) and
  5254. (GetRegType(taicpu(p).oper[1]^.reg)=R_INTREGISTER) and
  5255. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
  5256. MatchInstruction(hp1, A_MOV, []) and
  5257. MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
  5258. (taicpu(hp1).oper[1]^.typ=top_ref) and
  5259. not RegInRef(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.ref^) then
  5260. begin
  5261. TransferUsedRegs(TmpUsedRegs);
  5262. UpdateUsedRegsBetween(TmpUsedRegs,p,hp1);
  5263. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
  5264. begin
  5265. if (
  5266. { Instructions are always adjacent under -O2 and under }
  5267. not(cs_opt_level3 in current_settings.optimizerswitches) or
  5268. (
  5269. (
  5270. (taicpu(hp1).oper[1]^.ref^.base=NR_NO) or
  5271. not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base,p,hp1)
  5272. ) and
  5273. (
  5274. (taicpu(hp1).oper[1]^.ref^.index=NR_NO) or
  5275. not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index,p,hp1)
  5276. )
  5277. )
  5278. ) then
  5279. begin
  5280. DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1a)',p);
  5281. taicpu(p).loadref(1,taicpu(hp1).oper[1]^.ref^);
  5282. { loadref increases the reference count, so decrement it again }
  5283. if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
  5284. taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
  5285. if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
  5286. taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
  5287. RemoveInstruction(hp1);
  5288. Include(OptsToCheck, aoc_ForceNewIteration);
  5289. end
  5290. else if not RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) then
  5291. begin
  5292. { Still possible to optimise if hp1 is converted instead }
  5293. DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1b)',hp1);
  5294. { Decrement the reference prior to replacing it }
  5295. if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
  5296. taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
  5297. if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
  5298. taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
  5299. taicpu(hp1).opcode:=taicpu(p).opcode;
  5300. taicpu(hp1).opsize:=taicpu(p).opsize;
  5301. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  5302. TransferUsedRegs(TmpUsedRegs);
  5303. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,TmpUsedRegs);
  5304. RemoveCurrentP(p);
  5305. Result:=True;
  5306. Exit;
  5307. end;
  5308. end;
  5309. end;
  5310. end;

function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
var
  hp1 : tai;
begin
  Result:=false;
  if taicpu(p).ops <> 2 then
    exit;
  if (MatchOpType(taicpu(p),top_reg,top_reg) and GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg)) or
    GetNextInstruction(p,hp1) then
    begin
      if MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
        (taicpu(hp1).ops = 2) then
        begin
          if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
            (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
            { movXX reg1, mem1     or     movXX mem1, reg1
              movXX mem2, reg2            movXX reg2, mem2 }
            begin
              if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
                { movXX reg1, mem1     or     movXX mem1, reg1
                  movXX mem2, reg1            movXX reg2, mem1 }
                begin
                  if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                    begin
                      { Removes the second statement from
                          movXX reg1, mem1/reg2
                          movXX mem1/reg2, reg1
                      }
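                      { e.g. "movaps %xmm1,(%rax) ; movaps (%rax),%xmm1" -
                        the second copy is redundant (illustrative) }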
                      if taicpu(p).oper[0]^.typ=top_reg then
                        AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                      { Removes the second statement from
                          movXX mem1/reg1, reg2
                          movXX reg2, mem1/reg1
                      }
                      if (taicpu(p).oper[1]^.typ=top_reg) and
                        not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
                          RemoveInstruction(hp1);
                          RemoveCurrentp(p); { p will now be equal to the instruction that follows what was hp1 }
                          Result:=true;
                          exit;
                        end
                      else if (taicpu(hp1).oper[1]^.typ<>top_ref) or (not(vol_write in taicpu(hp1).oper[1]^.ref^.volatility)) and
                        (taicpu(hp1).oper[0]^.typ<>top_ref) or (not(vol_read in taicpu(hp1).oper[0]^.ref^.volatility)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
                          RemoveInstruction(hp1);
                          Result:=true;
                          exit;
                        end;
                    end
                end;
            end;
        end;
    end;
end;

{$ifndef i8086}
function TX86AsmOptimizer.OptPass1NOT(var p: tai): Boolean;
var
  hp1, p_next: tai;
  flags_used: Boolean;

  procedure Do_NotAnd2Andn1;
  var
    tempoper: poper;
  begin
    { Change "and %reg1,%reg2" to "andn %reg2,%reg1,%reg2" }
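    { In AT&T operand order, "andn src2,src1,dest" computes
      dest := (not src1) and src2, so the rewritten instruction yields
      reg2 := reg2 and (not reg1) - the value the NOT/AND pair produced }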
    taicpu(hp1).allocate_oper(3);
    taicpu(hp1).ops:=3;
    { Swap the 1st and 2nd operands by swapping their pointers }
    tempoper:=taicpu(hp1).oper[1];
    taicpu(hp1).oper[1]:=taicpu(hp1).oper[0];
    taicpu(hp1).oper[0]:=tempoper;
    taicpu(hp1).loadreg(2, tempoper^.reg);
    taicpu(hp1).opcode:=A_ANDN;
  end;

begin
  Result:=False;
  { Don't optimise this for size as ANDN is bigger than NOT and AND combined }
  if not (cs_opt_size in current_settings.optimizerswitches) and
    (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.optimizecputype]) then
    begin
      { Convert:                    To:
          not  %reg1                  andn %reg2,%reg1,%reg2
          and  %reg1,%reg2            not  %reg1
        Or remove "not %reg1" completely if %reg1 is deallocated.
        This breaks the dependency chain.
      }
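      { Illustrative instance: "notl %eax ; andl %eax,%ebx" becomes
        "andnl %ebx,%eax,%ebx" followed by "notl %eax" (or by nothing at
        all if %eax is dead) - the AND no longer waits on the NOT }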
      if (taicpu(p).oper[0]^.typ=top_reg) and
        { ANDN only supports 32-bit and 64-bit }
        (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
        MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
        MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^.reg) and
        (taicpu(hp1).oper[1]^.typ=top_reg) and
        (taicpu(hp1).oper[1]^.reg<>taicpu(p).oper[0]^.reg) and
        (
          { p and hp1 are adjacent on -O2 and below }
          not(cs_opt_level3 in current_settings.optimizerswitches) or
          not RegModifiedBetween(taicpu(hp1).oper[1]^.reg,p,hp1)
        ) then
        begin
          p_next:=tai(p.Next);
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegsBetween(TmpUsedRegs, p_next, hp1);
          { Make a note as to whether the flags are in use because
            RegUsedAfterInstruction might change the state }
          flags_used:=RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs);
          if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
            begin
              DebugMsg(SPeepholeOptimization + 'NotAnd2Andn 1 done', p);
              Do_NotAnd2Andn1;
              RemoveCurrentP(p, p_next);
              Result:=True;
              Exit;
            end
          else if not flags_used then
            begin
              DebugMsg(SPeepholeOptimization + 'NotAnd2AndnNot 1 done', p);
              Do_NotAnd2Andn1;
              asml.Remove(p);
              asml.InsertAfter(p, hp1);
              AllocRegBetween(taicpu(p).oper[0]^.reg, hp1, p, TmpUsedRegs);
              { Make sure the pass 2 iteration continues from the
                correct place, right after p }
              p:=p_next;
              Result:=True;
              Exit;
            end;
        end;
    end;
end;
{$endif not i8086}

function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
      <Op>X   %mreg1,%mreg2   // Op in [ADD,MUL]
      MovX    %mreg2,%mreg1
      dealloc %mreg2
    by
      <Op>X   %mreg2,%mreg1
    ?
  }
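  { Illustrative: "addsd %xmm1,%xmm0 ; movapd %xmm0,%xmm1" with %xmm0
    dead afterwards becomes "addsd %xmm0,%xmm1" - valid because the
    handled operations are commutative }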
  if GetNextInstruction(p,hp1) and
    { we mix single and double operations here because we assume that the compiler
      generates vmovapd only after double operations and vmovaps only after single operations }
    MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
    MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
    (taicpu(p).oper[0]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
          taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;

function TX86AsmOptimizer.OptPass1Test(var p: tai) : boolean;
var
  hp1, p_label, p_dist, hp1_dist, hp1_last: tai;
  JumpLabel, JumpLabel_dist: TAsmLabel;
  FirstValue, SecondValue: TCGInt;

  function OptimizeJump(var InputP: tai): Boolean;
  var
    TempBool: Boolean;
  begin
    Result := False;
    TempBool := True;
    if DoJumpOptimizations(InputP, TempBool) or
      not TempBool then
      begin
        Result := True;
        if Assigned(InputP) then
          begin
            { If CollapseZeroDistJump optimised the jump, InputP will now
              point to the label, or to an align before it, whether the
              label is live or dead }
            if (InputP.typ = ait_align) or
              (
                (InputP.typ = ait_label) and
                not (tai_label(InputP).labsym.is_used)
              ) then
              GetNextInstruction(InputP, InputP);
          end;
        Exit;
      end;
  end;

begin
  Result := False;
  if (taicpu(p).oper[0]^.typ = top_const) and
    (taicpu(p).oper[0]^.val <> -1) then
    begin
      { Convert unsigned maximum constants to -1 to aid optimisation }
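      { e.g. "test $255,%al" becomes "test $-1,%al", which then matches
        the same patterns as "test %al,%al" (illustrative) }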
      case taicpu(p).opsize of
        S_B:
          if (taicpu(p).oper[0]^.val and $FF) = $FF then
            begin
              taicpu(p).oper[0]^.val := -1;
              Result := True;
              Exit;
            end;
        S_W:
          if (taicpu(p).oper[0]^.val and $FFFF) = $FFFF then
            begin
              taicpu(p).oper[0]^.val := -1;
              Result := True;
              Exit;
            end;
        S_L:
          if (taicpu(p).oper[0]^.val and $FFFFFFFF) = $FFFFFFFF then
            begin
              taicpu(p).oper[0]^.val := -1;
              Result := True;
              Exit;
            end;
{$ifdef x86_64}
        S_Q:
          { Storing anything greater than $7FFFFFFF is not possible so do
            nothing };
{$endif x86_64}
        else
          InternalError(2021121001);
      end;
    end;
  if GetNextInstruction(p, hp1) and
    TrySwapMovCmp(p, hp1) then
    begin
      Result := True;
      Exit;
    end;
  p_label := nil;
  JumpLabel := nil;
  if MatchInstruction(hp1, A_Jcc, []) then
    begin
      if OptimizeJump(hp1) then
        begin
          Result := True;
          if Assigned(hp1) then
            begin
              { If CollapseZeroDistJump optimised the jump, hp1 will now
                point to the label, or to an align before it, whether the
                label is live or dead }
              if (hp1.typ = ait_align) or
                (
                  (hp1.typ = ait_label) and
                  not (tai_label(hp1).labsym.is_used)
                ) then
                GetNextInstruction(hp1, hp1);
            end;
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if not Assigned(hp1) or
            (
              not MatchInstruction(hp1, A_Jcc, A_SETcc, A_CMOVcc, []) and
              not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
            ) then
            begin
              { No more conditional jumps; conditional statement is no longer required }
              DebugMsg(SPeepholeOptimization + 'Removed unnecessary condition (Test2Nop)', p);
              RemoveCurrentP(p);
            end;
          Exit;
        end;
      if IsJumpToLabel(taicpu(hp1)) then
        begin
          JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
          if Assigned(JumpLabel) then
            p_label := getlabelwithsym(JumpLabel);
        end;
    end;
  { Search for:
      test $x,(reg/ref)
      jne  @lbl1
      test $y,(reg/ref)  (same register or reference)
      jne  @lbl1
    Change to:
      test $(x or y),(reg/ref)
      jne  @lbl1
    (Note, this doesn't work with je instead of jne)
    Also catch cases where "cmp $0,(reg/ref)" and "test %reg,%reg" are used.

    Also search for:
      test $x,(reg/ref)
      je   @lbl1
      ...
      test $y,(reg/ref)
      je/jne @lbl2
    If (x or y) = x, then the second jump is deterministic
  }
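  { Illustrative: "testb $1,%al ; jne @lbl ; testb $2,%al ; jne @lbl"
    merges into "testb $3,%al ; jne @lbl" - either bit being set takes
    the same branch }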
  if (
      (
        (taicpu(p).oper[0]^.typ = top_const) or
        (
          { test %reg,%reg can be considered equivalent to test $-1,%reg }
          (taicpu(p).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg)
        )
      ) and
      MatchInstruction(hp1, A_JCC, [])
    ) then
    begin
      if (taicpu(p).oper[0]^.typ = top_reg) and
        MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg) then
        FirstValue := -1
      else
        FirstValue := taicpu(p).oper[0]^.val;
      { If we have several test/jne's in a row, it might be the case that
        the second label doesn't go to the same location, but the one
        after it might (e.g. test; jne @lbl1; test; jne @lbl2; test @lbl1),
        so accommodate this with a while loop.
      }
      hp1_last := hp1;
      while (
          (
            (taicpu(p).oper[1]^.typ = top_reg) and
            GetNextInstructionUsingReg(hp1_last, p_dist, taicpu(p).oper[1]^.reg)
          ) or GetNextInstruction(hp1_last, p_dist)
        ) and (p_dist.typ = ait_instruction) do
        begin
          if (
              (
                (taicpu(p_dist).opcode = A_TEST) and
                (
                  (taicpu(p_dist).oper[0]^.typ = top_const) or
                  { test %reg,%reg can be considered equivalent to test $-1,%reg }
                  MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^)
                )
              ) or
              (
                { cmp 0,%reg = test %reg,%reg }
                (taicpu(p_dist).opcode = A_CMP) and
                MatchOperand(taicpu(p_dist).oper[0]^, 0)
              )
            ) and
            { Make sure the destination operands are actually the same }
            MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
            GetNextInstruction(p_dist, hp1_dist) and
            MatchInstruction(hp1_dist, A_JCC, []) then
            begin
              if OptimizeJump(hp1_dist) then
                begin
                  Result := True;
                  Exit;
                end;
              if
                (taicpu(p_dist).opcode = A_CMP) { constant will be zero } or
                (
                  (taicpu(p_dist).oper[0]^.typ = top_reg) and
                  MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^.reg)
                ) then
                SecondValue := -1
              else
                SecondValue := taicpu(p_dist).oper[0]^.val;
              { If both of the TEST constants are identical, delete the
                second TEST that is unnecessary (be careful though, just
                in case the flags are modified in between) }
              if (FirstValue = SecondValue) then
                begin
                  if condition_in(taicpu(hp1_dist).condition, taicpu(hp1).condition) then
                    begin
                      { Since the second jump's condition is a subset of the first, we
                        know it will never branch because the first jump dominates it.
                        Get it out of the way now rather than wait for the jump
                        optimisations for a speed boost. }
                      if IsJumpToLabel(taicpu(hp1_dist)) then
                        TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                      DebugMsg(SPeepholeOptimization + 'Removed dominated jump (via TEST/Jcc/TEST)', hp1_dist);
                      RemoveInstruction(hp1_dist);
                      Result := True;
                    end
                  else if condition_in(inverse_cond(taicpu(hp1).condition), taicpu(hp1_dist).condition) then
                    begin
                      { If the inverse of the first condition is a subset of the second,
                        the second one will definitely branch if the first one doesn't }
                      DebugMsg(SPeepholeOptimization + 'Conditional jump will always branch (via TEST/Jcc/TEST)', hp1_dist);
                      { We can remove the TEST instruction too }
                      DebugMsg(SPeepholeOptimization + 'TEST/Jcc/TEST; removed superfluous TEST', p_dist);
                      RemoveInstruction(p_dist);
                      MakeUnconditional(taicpu(hp1_dist));
                      RemoveDeadCodeAfterJump(hp1_dist);
                      { Since the jump is now unconditional, we can't
                        continue any further with this particular
                        optimisation. The original TEST is still intact
                        though, so there might be something else we can
                        do }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      Break;
                    end;
                  if Result or
                    { If a jump wasn't removed or made unconditional, only
                      remove the identical TEST instruction if the flags
                      weren't modified }
                    not RegModifiedBetween(NR_DEFAULTFLAGS, hp1, p_dist) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'TEST/Jcc/TEST; removed superfluous TEST', p_dist);
                      RemoveInstruction(p_dist);
                      { If the jump was removed or made unconditional, we
                        don't need to allocate NR_DEFAULTFLAGS over the
                        entire range }
                      if not Result then
                        begin
                          { Mark the flags as 'in use' over the entire range }
                          AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp1_dist, UsedRegs);
                          { Speed gain - continue search from the Jcc instruction }
                          hp1_last := hp1_dist;
                          { Only the TEST instruction was removed, and the
                            original was unchanged, so we can safely do
                            another iteration of the while loop }
                          Include(OptsToCheck, aoc_ForceNewIteration);
                          Continue;
                        end;
                      Exit;
                    end;
                end;
              hp1_last := nil;
              if (taicpu(hp1).condition in [C_NE, C_NZ]) and
                (
                  { In this situation, the TEST/JNE pairs must be adjacent (fixes #40366) }
                  { Always adjacent under -O2 and below }
                  not(cs_opt_level3 in current_settings.optimizerswitches) or
                  (
                    GetNextInstruction(hp1, hp1_last) and
                    (hp1_last = p_dist)
                  )
                ) and
                (
                  (
                    { Test the following variant:
                        test $x,(reg/ref)
                        jne  @lbl1
                        test $y,(reg/ref)
                        je   @lbl2
                      @lbl1:
                      Becomes:
                        test $(x or y),(reg/ref)
                        je   @lbl2
                      @lbl1: (may become a dead label)
                    }
                    (taicpu(hp1_dist).condition in [C_E, C_Z]) and
                    GetNextInstruction(hp1_dist, hp1_last) and
                    (hp1_last = p_label)
                  ) or
                  (
                    (taicpu(hp1_dist).condition in [C_NE, C_NZ]) and
                    { If the first instruction is test %reg,%reg or test $-1,%reg,
                      then the second jump will never branch, so it can also be
                      removed regardless of where it goes }
                    (
                      (FirstValue = -1) or
                      (SecondValue = -1) or
                      MatchOperand(taicpu(hp1_dist).oper[0]^, taicpu(hp1).oper[0]^)
                    )
                  )
                ) then
                begin
                  { Same jump location... can be a register since nothing's changed }
                  { If any of the entries are equivalent to test %reg,%reg, then the
                    merged $(x or y) is also test %reg,%reg / test $-1,%reg }
                  taicpu(p).loadconst(0, FirstValue or SecondValue);
                  if (hp1_last = p_label) then
                    begin
                      { Variant }
                      DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JE/@Lbl merged', p);
                      RemoveInstruction(p_dist);
                      if Assigned(JumpLabel) then
                        JumpLabel.decrefs;
                      RemoveInstruction(hp1);
                    end
                  else
                    begin
                      { Only remove the second test if no jumps or other conditional instructions follow }
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                      UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                      UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
                      if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1_dist, TmpUsedRegs) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JNE merged', p);
                          RemoveInstruction(p_dist);
                          { Remove the first jump, not the second, to keep
                            any register deallocations between the second
                            TEST/JNE pair in the same place. Aids future
                            optimisation. }
                          if Assigned(JumpLabel) then
                            JumpLabel.decrefs;
                          RemoveInstruction(hp1);
                        end
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JNE merged (second TEST preserved)', p);
                          if IsJumpToLabel(taicpu(hp1_dist)) then
                            TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                          { Remove second jump in this instance }
                          RemoveInstruction(hp1_dist);
                        end;
                    end;
                  Result := True;
                  Exit;
                end;
            end;
          if { If -O2 or below, the search may stop on any old instruction }
            (cs_opt_level3 in current_settings.optimizerswitches) and
            (taicpu(p).oper[1]^.typ = top_reg) and
            not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, p_dist) then
            begin
              hp1_last := p_dist;
              Continue;
            end;
          Break;
        end;
    end;
  { Search for:
      test  %reg,%reg
      j(c1) @lbl1
      ...
    @lbl1:
      test  %reg,%reg  (same register)
      j(c2) @lbl2
    If c2 is a subset of c1, change to:
      test  %reg,%reg
      j(c1) @lbl2
    (@lbl1 may become a dead label as a result)
  }
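  { Illustrative: if c1 = c2 = "e", the first "je @lbl1" can jump
    straight to @lbl2, because the repeated TEST at @lbl1 would set the
    flags identically }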
  if (taicpu(p).oper[1]^.typ = top_reg) and
    (taicpu(p).oper[0]^.typ = top_reg) and
    (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
    { p_label <> nil is a marker that hp1 is a Jcc to a label }
    Assigned(p_label) and
    GetNextInstruction(p_label, p_dist) and
    MatchInstruction(p_dist, A_TEST, []) and
    { It's fine if the second test uses smaller sub-registers }
    (taicpu(p_dist).opsize <= taicpu(p).opsize) and
    MatchOpType(taicpu(p_dist), top_reg, top_reg) and
    SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
    SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and
    GetNextInstruction(p_dist, hp1_dist) and
    MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
    begin
      JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
      if JumpLabel = JumpLabel_dist then
        { This is an infinite loop }
        Exit;
      { Best optimisation when the first condition is a subset (or equal) of the second }
      if condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) then
        begin
          { Any registers used here will already be allocated }
          if Assigned(JumpLabel) then
            JumpLabel.DecRefs;
          DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1);
          taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); { This also increases the reference count }
          Result := True;
          Exit;
        end;
    end;
end;

function TX86AsmOptimizer.OptPass1Add(var p : tai) : boolean;
var
  hp1, hp2: tai;
  ActiveReg: TRegister;
  OldOffset: asizeint;
  ThisConst: TCGInt;

  function RegDeallocated: Boolean;
  begin
    TransferUsedRegs(TmpUsedRegs);
    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
    Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
  end;

begin
  result:=false;
  hp1 := nil;
  { replace
      addX const,%reg1
      leaX (%reg1,%reg1,Y),%reg2   // Base or index might not be equal to reg1
      dealloc %reg1
    by
      leaX const+const*Y(%reg1,%reg1,Y),%reg2
  }
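  { Worked example (illustrative): "addl $4,%eax ; leal 8(%eax,%eax,2),%edx"
    with %eax dead afterwards becomes "leal 20(%eax,%eax,2),%edx" - the
    base adds 4 and the scaled index adds 4*2 to the offset }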
  if MatchOpType(taicpu(p),top_const,top_reg) then
    begin
      ActiveReg := taicpu(p).oper[1]^.reg;
      { Ensures the entire register was updated }
      if (taicpu(p).opsize >= S_L) and
        GetNextInstructionUsingReg(p,hp1, ActiveReg) and
        MatchInstruction(hp1,A_LEA,[]) and
        (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
         SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
        (
          { Cover the case where the register in the reference is also the destination register }
          Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
          (
            { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
            not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
            RegDeallocated
          )
        ) then
        begin
          OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
{$push}
{$R-}{$Q-}
          { Explicitly disable overflow checking for these offset calculations,
            as they do not matter for the final result }
          if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
            inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
          if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
            inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{$pop}
{$ifdef x86_64}
          if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
            begin
              { Overflow; abort }
              taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
            end
          else
{$endif x86_64}
            begin
              DebugMsg(SPeepholeOptimization + 'AddLea2Lea done',p);
              if not (cs_opt_level3 in current_settings.optimizerswitches) then
                { hp1 is the immediate next instruction for sure - good for a quick speed boost }
                RemoveCurrentP(p, hp1)
              else
                RemoveCurrentP(p);
              result:=true;
              Exit;
            end;
        end;
      if (
          { Save calling GetNextInstructionUsingReg again }
          Assigned(hp1) or
          GetNextInstructionUsingReg(p,hp1, ActiveReg)
        ) and
        MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
        (taicpu(hp1).oper[1]^.reg = ActiveReg) then
        begin
          { Make sure the flags aren't in use by the second operation }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
          if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
            begin
              if taicpu(hp1).oper[0]^.typ = top_const then
                begin
                  { Merge add const1,%reg; add/sub const2,%reg to add const1+/-const2,%reg }
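                  { e.g. "addl $5,%eax ; addl $3,%eax" merges into
                    "addl $8,%eax" (illustrative) }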
                  if taicpu(hp1).opcode = A_ADD then
                    ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
                  else
                    ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;
                  Result := True;
                  { Handle any overflows }
                  case taicpu(p).opsize of
                    S_B:
                      taicpu(p).oper[0]^.val := ThisConst and $FF;
                    S_W:
                      taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                    S_L:
                      taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                    S_Q:
                      if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                        { Overflow; abort }
                        Result := False
                      else
                        taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                    else
                      InternalError(2021102610);
                  end;
                  { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                  if Result then
                    begin
                      if (taicpu(p).oper[0]^.val < 0) and
                        (
                          ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                          ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                          ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                        ) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> SUB',p);
                          taicpu(p).opcode := A_SUB;
                          taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                        end
                      else
                        DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> ADD',p);
                      RemoveInstruction(hp1);
                    end;
                end
              else
                begin
                  { Move the constant addition to after the reg/ref addition to improve optimisation }
                  DebugMsg(SPeepholeOptimization + 'Add/sub swap 1a done',p);
                  Asml.Remove(p);
                  Asml.InsertAfter(p, hp1);
                  p := hp1;
                  Result := True;
                  Exit;
                end;
            end;
        end;
      if DoArithCombineOpt(p) then
        Result:=true;
    end;
end;

function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
var
  hp1, hp2: tai;
  ref: Integer;
  saveref: treference;
  offsetcalc: Int64;
  TempReg: TRegister;
  Multiple: TCGInt;
  Adjacent, IntermediateRegDiscarded: Boolean;
begin
  Result:=false;
  { play safe and throw an error if LEA uses a seg register prefix,
    as this is most likely an error somewhere else }
  if taicpu(p).oper[0]^.ref^.Segment<>NR_NO then
    internalerror(2022022001);
  { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
  if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
    (taicpu(p).oper[0]^.ref^.index = NR_NO) and
    (
      { do not mess with leas accessing the stack pointer
        unless it's a null operation }
      (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) or
      (
        (taicpu(p).oper[0]^.ref^.base = NR_STACK_POINTER_REG) and
        (taicpu(p).oper[0]^.ref^.offset = 0)
      )
    ) and
    (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
    begin
      if (taicpu(p).oper[0]^.ref^.offset = 0) then
        begin
          if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
            begin
              taicpu(p).opcode := A_MOV;
              taicpu(p).loadreg(0, taicpu(p).oper[0]^.ref^.base);
              DebugMsg(SPeepholeOptimization + 'Lea2Mov done',p);
            end
          else
            begin
              DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
              RemoveCurrentP(p);
            end;
          Result:=true;
          exit;
        end
      else if (
          { continue to use lea to adjust the stack pointer,
            it is the recommended way, but only if not optimizing for size }
          (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
          (cs_opt_size in current_settings.optimizerswitches)
        ) and
        { If the flags register is in use, don't change the instruction
          to an ADD otherwise this will scramble the flags. [Kit] }
        not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
        ConvertLEA(taicpu(p)) then
        begin
          Result:=true;
          exit;
        end;
    end;
  { Don't optimise if the stack or frame pointer is the destination register }
  if (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) or (taicpu(p).oper[1]^.reg=current_procinfo.framepointer) then
    Exit;
  if GetNextInstruction(p,hp1) and
    (hp1.typ=ait_instruction) then
    begin
      if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
        MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
        MatchOpType(Taicpu(hp1),top_reg,top_reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
          if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
            begin
              taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
              DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
              RemoveInstruction(hp1);
              result:=true;
              exit;
            end;
        end;
      { changes
          lea <ref1>, reg1
          <op> ...,<ref. with reg1>,...
        to
          <op> ...,<ref1>,... }
      { find a reference which uses reg1 }
      if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
        ref:=0
      else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
        ref:=1
      else
        ref:=-1;
      if (ref<>-1) and
        { reg1 must be either the base or the index }
        ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
        begin
          { reg1 can be removed from the reference }
          saveref:=taicpu(hp1).oper[ref]^.ref^;
          if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
            taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
          else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
            taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
          else
            Internalerror(2019111201);
          { check if we can insert all the data of the lea into the second instruction }
          if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
            ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
            ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
            ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
            ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
            ((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
            (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
            and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
            and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                 ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                )
{$endif x86_64}
            then
            begin
              { reg1 might not be used by the second instruction after it is removed from the reference }
              if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                  { reg1 is not updated so it might not be used afterwards }
                  if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                      if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                        taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                      if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                        taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                      if taicpu(p).oper[0]^.ref^.symbol<>nil then
                        taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                      if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                        taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                      if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
                        taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                      inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                      RemoveCurrentP(p, hp1);
                      result:=true;
                      exit;
                    end
                end;
            end;
          { recover }
          taicpu(hp1).oper[ref]^.ref^:=saveref;
        end;
      Adjacent := RegInInstruction(taicpu(p).oper[1]^.reg, hp1);
      if Adjacent or
        { Check further ahead (up to 2 instructions ahead for -O2) }
        GetNextInstructionUsingReg(hp1,hp1,taicpu(p).oper[1]^.reg) then
        begin
          { Check common LEA/LEA conditions }
          if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
            (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
            (taicpu(p).oper[0]^.ref^.segment = NR_NO) and
            (taicpu(p).oper[0]^.ref^.symbol = nil) and
            (taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
            (taicpu(hp1).oper[0]^.ref^.segment = NR_NO) and
            (taicpu(hp1).oper[0]^.ref^.symbol = nil) and
            (
              { If p and hp1 are adjacent, RegModifiedBetween always returns False, so avoid
                calling it (since it calls GetNextInstruction) }
              Adjacent or
              (
                (
                  (taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
                  not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
                ) and (
                  (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
                  (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                  not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
                )
              )
            ) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              hp2 := p;
              repeat
                UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
              until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
              IntermediateRegDiscarded :=
                (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) or
                not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
              { changes
                  lea offset1(regX,scale), reg1
                  lea offset2(reg1,reg1), reg2
                to
                  lea (offset1*scale*2)+offset2(regX,scale*2), reg2
                and
                  lea offset1(regX,scale1), reg1
                  lea offset2(reg1,scale2), reg2
                to
                  lea (offset1*scale1*2)+offset2(regX,scale1*scale2), reg2
                and
                  lea offset1(regX,scale1), reg1
                  lea offset2(reg3,reg1,scale2), reg2
                to
                  lea (offset1*scale*2)+offset2(reg3,regX,scale1*scale2), reg2
                ... so long as the final scale does not exceed 8
                (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
              }
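              { Worked example (illustrative):
                  leal 4(,%eax,2),%edx ; leal 8(%edx,%edx),%ecx
                becomes
                  leal 16(,%eax,4),%ecx  since 2*(2*%eax+4)+8 = 4*%eax+16 }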
              if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
                (
                  { Don't optimise if size is a concern and the intermediate register remains in use }
                  IntermediateRegDiscarded or
                  (
                    not (cs_opt_size in current_settings.optimizerswitches) and
                    { If the intermediate register is not discarded, it must not
                      appear in the first LEA's reference. (Fixes #41166) }
                    not RegInRef(taicpu(p).oper[1]^.reg, taicpu(p).oper[0]^.ref^)
                  )
                ) and
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (
                  (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[0]^.ref^.index) or
                  (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
                ) and (
                  (
                    { lea (reg1,scale2), reg2 variant }
                    (taicpu(hp1).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
                    (
                      Adjacent or
                      not RegModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, p, hp1)
                    ) and
                    (
                      (
                        (taicpu(p).oper[0]^.ref^.base = NR_NO) and
                        (taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
                      ) or (
                        { lea (regX,regX), reg1 variant }
                        (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
                        (taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
                      )
                    )
                  ) or (
                    { lea (reg1,reg1), reg1 variant }
                    (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                    (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
                  )
                ) then
                begin
                  { Make everything homogeneous to make calculations easier }
                  if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
                    begin
                      if taicpu(p).oper[0]^.ref^.index <> NR_NO then
                        { Convert lea (regX,regX),reg1 to lea (regX,2),reg1 }
                        taicpu(p).oper[0]^.ref^.scalefactor := 2
                      else
                        taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
                      taicpu(p).oper[0]^.ref^.base := NR_NO;
                    end;
                  { Make sure the offset doesn't go out of range (use 64-bit arithmetic) }
                  offsetcalc := taicpu(hp1).oper[0]^.ref^.offset;
                  Inc(offsetcalc, Int64(taicpu(p).oper[0]^.ref^.offset) * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                  if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                    begin
                      if (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                        (taicpu(hp1).oper[0]^.ref^.index <> taicpu(p).oper[1]^.reg) then
                        begin
                          { Put the register to change in the index register }
                          TempReg := taicpu(hp1).oper[0]^.ref^.index;
                          taicpu(hp1).oper[0]^.ref^.index := taicpu(hp1).oper[0]^.ref^.base;
                          taicpu(hp1).oper[0]^.ref^.base := TempReg;
                        end;
                      { Change lea (reg,reg) to lea(,reg,2) }
                      if (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) then
                        begin
                          taicpu(hp1).oper[0]^.ref^.base := NR_NO;
                          taicpu(hp1).oper[0]^.ref^.scalefactor := 2;
                        end;
                      if (taicpu(p).oper[0]^.ref^.offset <> 0) then
                        Inc(taicpu(hp1).oper[0]^.ref^.offset, taicpu(p).oper[0]^.ref^.offset * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                      taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
                      { Just to prevent miscalculations }
                      if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
                        taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
                      else
                        taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * max(taicpu(p).oper[0]^.ref^.scalefactor, 1);
                      if (taicpu(p).oper[0]^.ref^.base<>NR_NO) and
                        (not RegInUsedRegs(taicpu(p).oper[0]^.ref^.base, TmpUsedRegs)) then
                        AllocRegBetween(taicpu(p).oper[0]^.ref^.base, p, hp1, TmpUsedRegs);
                      if (taicpu(p).oper[0]^.ref^.index<>NR_NO) and
                        (not RegInUsedRegs(taicpu(p).oper[0]^.ref^.index, TmpUsedRegs)) then
                        AllocRegBetween(taicpu(p).oper[0]^.ref^.index, p, hp1, TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
                      { Only remove the first LEA if we don't need the intermediate register's value as is }
                      if IntermediateRegDiscarded then
                        begin
                          DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
                          RemoveCurrentP(p);
                        end
                      else
                        DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 2 done (intermediate register still in use)',p);
                      result:=true;
                      exit;
                    end;
                end;
              { changes
                  lea offset1(regX), reg1
                  lea offset2(reg1), reg2
                to
                  lea offset1+offset2(regX), reg2 }
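              { e.g. "leal 4(%ebx),%eax ; leal 12(%eax),%ecx" becomes
                "leal 16(%ebx),%ecx" (illustrative) }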
              if (
                  { Don't optimise if size is a concern and the intermediate register remains in use }
                  IntermediateRegDiscarded or
                  (
                    not (cs_opt_size in current_settings.optimizerswitches) and
                    { If the intermediate register is not discarded, it must not
                      appear in the first LEA's reference. (Fixes #41166) }
                    not RegInRef(taicpu(p).oper[1]^.reg, taicpu(p).oper[0]^.ref^)
                  )
                ) and
                (
                  (
                    (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                    (getsupreg(taicpu(p).oper[0]^.ref^.base)<>RS_ESP) and
                    (taicpu(p).oper[0]^.ref^.index = NR_NO)
                  ) or (
                    (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                    (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                    (
                      (
                        (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                        (taicpu(p).oper[0]^.ref^.base = NR_NO)
                      ) or (
                        (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                        (
                          (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                          (
                            (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                            (
                              (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
                              (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
                            )
                          )
                        )
                      )
                    )
                  )
                ) then
                begin
                  { Make sure the offset doesn't go out of range (use 64-bit arithmetic) }
                  offsetcalc := taicpu(hp1).oper[0]^.ref^.offset;
                  Inc(offsetcalc, Int64(taicpu(p).oper[0]^.ref^.offset) * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                  if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                    begin
                      if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
                        begin
                          taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                          inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                          { if the register is used as index and base, we have to increase for base as well
                            and adapt base }
                          if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                            begin
                              taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                              inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                            end;
                        end
                      else
                        begin
                          inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                          taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                        end;
                      if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                        begin
                          taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                          taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                          if (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) then
                            { Catch the situation where the base = index
                              and treat this as *2. The scalefactor of
                              p will be 0 or 1 due to the conditional
                              checks above. Fixes i40647 }
                            taicpu(hp1).oper[0]^.ref^.scalefactor := 2
                          else
                            taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor;
                        end;
                      if (taicpu(p).oper[0]^.ref^.base<>NR_NO) and
                        (not RegInUsedRegs(taicpu(p).oper[0]^.ref^.base, TmpUsedRegs)) then
                        AllocRegBetween(taicpu(p).oper[0]^.ref^.base, p, hp1, TmpUsedRegs);
                      if (taicpu(p).oper[0]^.ref^.index<>NR_NO) and
                        (not RegInUsedRegs(taicpu(p).oper[0]^.ref^.index, TmpUsedRegs)) then
                        AllocRegBetween(taicpu(p).oper[0]^.ref^.index, p, hp1, TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
                      { Only remove the first LEA if we don't need the intermediate register's value as is }
                      if IntermediateRegDiscarded then
                        begin
                          DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
                          RemoveCurrentP(p);
                        end
                      else
                        DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 1 done (intermediate register still in use)',p);
                      result:=true;
                      exit;
                    end;
                end;
            end;
          { Change:
              leal/q $x(%reg1),%reg2
              ...
              shll/q $y,%reg2
            To:
              leal/q $(x*2^y)(,%reg1,2^y),%reg2 (if y <= 3)
          }
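          { Worked example (illustrative): "leal 4(%eax),%edx ; shll $2,%edx"
            becomes "leal 16(,%eax,4),%edx", since 4*(%eax+4) = 4*%eax+16 }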
          if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
            MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
            MatchOpType(taicpu(hp1), top_const, top_reg) and
            (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
            (taicpu(hp1).oper[0]^.val <= 3) then
            begin
              Multiple := 1 shl taicpu(hp1).oper[0]^.val;
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
              if
                { This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
                  (this works even if scalefactor is zero) }
                ((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
                { Ensure offset doesn't go out of bounds }
                (abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
                not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
                (
                  (
                    not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[1]^.reg) and
                    (
                      (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                      (taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
                      (
                        { Check for lea $x(%reg1,%reg1),%reg2 and treat it as if it were lea $x(%reg1,2),%reg2 }
                        (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                        (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
                      )
                    )
                  ) or (
                    (
                      (taicpu(p).oper[0]^.ref^.base = NR_NO) or
                      (taicpu(p).oper[0]^.ref^.base = NR_INVALID)
                    ) and
                    not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, taicpu(p).oper[1]^.reg)
                  )
                ) then
                begin
                  repeat
                    with taicpu(p).oper[0]^.ref^ do
                      begin
                        { Convert lea $x(%reg1,%reg1),%reg2 to lea $x(%reg1,2),%reg2 }
                        if index = base then
                          begin
                            if Multiple > 4 then
                              { Optimisation will no longer work because resultant
                                scale factor will exceed 8 }
                              Break;
                            base := NR_NO;
                            scalefactor := 2;
                            DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
                          end
                        else if (base <> NR_NO) and (base <> NR_INVALID) then
                          begin
                            { Scale factor only works on the index register }
                            index := base;
                            base := NR_NO;
                          end;
                        { For safety }
                        if scalefactor <= 1 then
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
                            scalefactor := Multiple;
                          end
                        else
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
                            scalefactor := scalefactor * Multiple;
                          end;
                        offset := offset * Multiple;
                      end;
                    RemoveInstruction(hp1);
                    Result := True;
                    Exit;
                  { This repeat..until loop exists for the benefit of Break }
                  until True;
                end;
            end;
        end;
    end;
end;

function TX86AsmOptimizer.DoArithCombineOpt(var p: tai): Boolean;
var
  hp1 : tai;
  SubInstr: Boolean;
  ThisConst: TCGInt;
const
  OverflowMin: array[S_B..S_Q] of TCGInt = (-128, -32768, -2147483648, -2147483648);
  { Note: 64-bit-sized arithmetic instructions can only take signed 32-bit immediates }
  OverflowMax: array[S_B..S_Q] of TCGInt = ( 255,  65535, $FFFFFFFF,   2147483647);
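  { Illustrative: "incl %eax ; addl $3,%eax" combines into "addl $4,%eax",
    while "addl $3,%eax ; subl $3,%eax" cancels out completely }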
begin
  Result := False;
  if taicpu(p).oper[0]^.typ <> top_const then
    { Should have been confirmed before calling }
    InternalError(2021102601);
  SubInstr := (taicpu(p).opcode = A_SUB);
  if not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
    GetLastInstruction(p, hp1) and
    (hp1.typ = ait_instruction) and
    (taicpu(hp1).opsize = taicpu(p).opsize) then
    begin
      if not (taicpu(p).opsize in [S_B, S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
        { Bad size }
        InternalError(2022042001);
      case taicpu(hp1).opcode Of
        A_INC:
          if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
            begin
              if SubInstr then
                ThisConst := taicpu(p).oper[0]^.val - 1
              else
                ThisConst := taicpu(p).oper[0]^.val + 1;
            end
          else
            Exit;
        A_DEC:
          if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
            begin
              if SubInstr then
                ThisConst := taicpu(p).oper[0]^.val + 1
              else
                ThisConst := taicpu(p).oper[0]^.val - 1;
            end
          else
            Exit;
        A_SUB:
          if (taicpu(hp1).oper[0]^.typ = top_const) and
            MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
            begin
              if SubInstr then
                ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
              else
                ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;
            end
          else
            Exit;
        A_ADD:
          if (taicpu(hp1).oper[0]^.typ = top_const) and
            MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
            begin
              if SubInstr then
                ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val
              else
                ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
            end
          else
            Exit;
        else
          Exit;
      end;
      { Check that the values are in range }
      if (ThisConst < OverflowMin[taicpu(p).opsize]) or (ThisConst > OverflowMax[taicpu(p).opsize]) then
        { Overflow; abort }
        Exit;
      if (ThisConst = 0) then
        begin
          DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
            debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_operstr(taicpu(hp1).oper[1]^) + '; ' +
            debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' cancel out (NOP)', p);
          RemoveInstruction(hp1);
          hp1 := tai(p.next);
          RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
          if not GetLastInstruction(hp1, p) then
            p := hp1;
        end
      else
        begin
          if taicpu(hp1).opercnt=1 then
            DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
              debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + '; ' +
              debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' -> ' +
              debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(ThisConst) + ' ' + debug_operstr(taicpu(p).oper[1]^), p)
          else
            DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
              debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_operstr(taicpu(hp1).oper[1]^) + '; ' +
              debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' -> ' +
              debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(ThisConst) + ' ' + debug_operstr(taicpu(p).oper[1]^), p);
          RemoveInstruction(hp1);
          taicpu(p).loadconst(0, ThisConst);
        end;
      Result := True;
    end;
end;

function TX86AsmOptimizer.DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
begin
  Result := False;
  if MatchOpType(taicpu(p),top_ref,top_reg) and
    { The x86 assemblers have difficulty comparing values against absolute addresses }
    (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
    (taicpu(hp1).oper[0]^.typ <> top_ref) and
    MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
    (
      (
        (taicpu(hp1).opcode = A_TEST)
      ) or (
        (taicpu(hp1).opcode = A_CMP) and
        { A sanity check more than anything }
        not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
      )
    ) then
    begin
      { change
          mov      mem, %reg
          ...
          cmp/test x, %reg / test %reg,%reg
          (reg deallocated)
        to
          cmp/test x, mem / cmp 0, mem
      }
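      { Illustrative: "movl (%rbx),%eax ; testl %eax,%eax" with %eax dead
        afterwards becomes "cmpl $0,(%rbx)" }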
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
      if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
        begin
          { Convert test %reg,%reg or test $-1,%reg to cmp $0,mem }
          if (taicpu(hp1).opcode = A_TEST) and
            (
              MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
              MatchOperand(taicpu(hp1).oper[0]^, -1)
            ) then
            begin
              taicpu(hp1).opcode := A_CMP;
              taicpu(hp1).loadconst(0, 0);
            end;
          taicpu(hp1).loadref(1, taicpu(p).oper[0]^.ref^);
          DebugMsg(SPeepholeOptimization + 'MOV/CMP -> CMP (memory check)', p);
          RemoveCurrentP(p);
          if (p <> hp1) then
            { Correctly update TmpUsedRegs if p and hp1 aren't adjacent }
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
          { Make sure the flags are allocated across the CMP instruction }
          if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
            AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp1, TmpUsedRegs);
          Result := True;
          Exit;
        end;
    end;
end;

function TX86AsmOptimizer.DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
var
  hp_allocstart, hp_pos, hp2, hp3, hp4, hp5, hp6: tai;
  ThisReg, SecondReg: TRegister;
  JumpLoc: TAsmLabel;
  NewSize: TOpSize;
begin
  Result := False;
  {
    Convert:
      j<c>  .L1
    .L2:
      mov   1,reg
      jmp   .L3  (or ret, although it might not be a RET yet)
    .L1:
      mov   0,reg
      jmp   .L3  (or ret)
    (As long as .L3 <> .L1 or .L2)
    To:
      mov   0,reg
      set<not(c)> reg
      jmp   .L3  (or ret)
    .L2:
      mov   1,reg
      jmp   .L3  (or ret)
    .L1:
      mov   0,reg
      jmp   .L3  (or ret)
  }
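  { Illustrative: for "jne .L1" this emits "movb $0,reg ; sete reg" ahead
    of the jmp/ret, i.e. reg := ord(inverted condition), removing the
    conditional branch }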
  6691. if JumpTargetOp(taicpu(p))^.ref^.refaddr<>addr_full then
  6692. Exit;
  6693. JumpLoc := TAsmLabel(JumpTargetOp(taicpu(p))^.ref^.symbol);
  6694. if GetNextInstruction(hp_label, hp2) and
  6695. MatchInstruction(hp2,A_MOV,[]) and
  6696. (taicpu(hp2).oper[0]^.typ = top_const) and
  6697. (
  6698. (
  6699. (taicpu(hp2).oper[1]^.typ = top_reg)
  6700. {$ifdef i386}
  6701. { Under i386, ESI, EDI, EBP and ESP
  6702. don't have an 8-bit representation }
  6703. and not (getsupreg(taicpu(hp2).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
  6704. {$endif i386}
  6705. ) or (
  6706. {$ifdef i386}
  6707. (taicpu(hp2).oper[1]^.typ <> top_reg) and
  6708. {$endif i386}
  6709. (taicpu(hp2).opsize = S_B)
  6710. )
  6711. ) and
  6712. GetNextInstruction(hp2, hp3) and
  6713. MatchInstruction(hp3, A_JMP, A_RET, []) and
  6714. (
  6715. (taicpu(hp3).opcode=A_RET) or
  6716. (
  6717. (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and
  6718. (tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol)<>tai_label(hp_label).labsym)
  6719. )
  6720. ) and
  6721. GetNextInstruction(hp3, hp4) and
  6722. FindLabel(JumpLoc, hp4) and
  6723. (
  6724. not (cs_opt_size in current_settings.optimizerswitches) or
  6725. { If the initial jump is the label's only reference, then it will
  6726. become a dead label if the other conditions are met and hence
  6727. remove at least 2 instructions, including a jump }
  6728. (JumpLoc.getrefs = 1)
  6729. ) and
  6730. { Don't check if hp3 jumps to hp4 because this is a zero-distance jump
  6731. that will be optimised out }
  6732. GetNextInstruction(hp4, hp5) and
  6733. MatchInstruction(hp5,A_MOV,[taicpu(hp2).opsize]) and
  6734. (taicpu(hp5).oper[0]^.typ = top_const) and
  6735. (
  6736. ((taicpu(hp2).oper[0]^.val = 0) and (taicpu(hp5).oper[0]^.val = 1)) or
  6737. ((taicpu(hp2).oper[0]^.val = 1) and (taicpu(hp5).oper[0]^.val = 0))
  6738. ) and
  6739. MatchOperand(taicpu(hp2).oper[1]^,taicpu(hp5).oper[1]^) and
  6740. GetNextInstruction(hp5,hp6) and
  6741. (
  6742. not (hp6.typ in [ait_align, ait_label]) or
  6743. SkipLabels(hp6, hp6)
  6744. ) and
  6745. (hp6.typ=ait_instruction) then
  6746. begin
  6747. { First, let's look at the two jumps that are hp3 and hp6 }
  6748. if not
  6749. (
  6750. (taicpu(hp6).opcode=taicpu(hp3).opcode) and { Both RET or both JMP to the same label }
  6751. (
  6752. (taicpu(hp6).opcode=A_RET) or
  6753. MatchOperand(taicpu(hp6).oper[0]^, taicpu(hp3).oper[0]^)
  6754. )
  6755. ) then
  6756. { If condition is False, then the JMP/RET instructions matched conventionally }
          begin
            { See if one of the jumps can be instantly converted into a RET }
            if (taicpu(hp3).opcode=A_JMP) then
              begin
                { Reuse hp5 }
                hp5 := getlabelwithsym(TAsmLabel(JumpTargetOp(taicpu(hp3))^.ref^.symbol));
                { Make sure hp5 doesn't jump back to .L1 (zero distance jump) or .L2 (infinite loop) }
                if not Assigned(hp5) or (hp5 = hp_label) or (hp5 = hp4) or not GetNextInstruction(hp5, hp5) then
                  Exit;
                if MatchInstruction(hp5, A_RET, []) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'Converted JMP to RET as part of SETcc optimisation (1st jump)', hp3);
                    ConvertJumpToRET(hp3, hp5);
                    Result := True;
                  end
                else
                  Exit;
              end;
            if (taicpu(hp6).opcode=A_JMP) then
              begin
                { Reuse hp5 }
                hp5 := getlabelwithsym(TAsmLabel(JumpTargetOp(taicpu(hp6))^.ref^.symbol));
                if not Assigned(hp5) or not GetNextInstruction(hp5, hp5) then
                  Exit;
                if MatchInstruction(hp5, A_RET, []) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'Converted JMP to RET as part of SETcc optimisation (2nd jump)', hp6);
                    ConvertJumpToRET(hp6, hp5);
                    Result := True;
                  end
                else
                  Exit;
              end;
            if not
              (
                (taicpu(hp6).opcode=taicpu(hp3).opcode) and { Both RET or both JMP to the same label }
                (
                  (taicpu(hp6).opcode=A_RET) or
                  MatchOperand(taicpu(hp6).oper[0]^, taicpu(hp3).oper[0]^)
                )
              ) then
              { Still doesn't match }
              Exit;
          end;
        if (taicpu(hp2).oper[0]^.val = 1) then
          begin
            taicpu(p).condition := inverse_cond(taicpu(p).condition);
            DebugMsg(SPeepholeOptimization + 'J(c)Mov1Jmp/RetMov0Jmp/Ret -> Set(~c)Jmp/Ret',p)
          end
        else
          DebugMsg(SPeepholeOptimization + 'J(c)Mov0Jmp/RetMov1Jmp/Ret -> Set(c)Jmp/Ret',p);
        if taicpu(hp2).opsize=S_B then
          begin
            if taicpu(hp2).oper[1]^.typ = top_reg then
              begin
                SecondReg := taicpu(hp2).oper[1]^.reg;
                hp4:=taicpu.op_reg(A_SETcc, S_B, SecondReg);
              end
            else
              begin
                hp4:=taicpu.op_ref(A_SETcc, S_B, taicpu(hp2).oper[1]^.ref^);
                SecondReg := NR_NO;
              end;
            hp_pos := p;
            hp_allocstart := hp4;
          end
        else
          begin
            { Will be a register because the size can't be S_B otherwise }
            SecondReg:=taicpu(hp2).oper[1]^.reg;
            ThisReg:=newreg(R_INTREGISTER,getsupreg(SecondReg), R_SUBL);
            hp4:=taicpu.op_reg(A_SETcc, S_B, ThisReg);
            if (cs_opt_size in current_settings.optimizerswitches) then
              begin
                { Favour using MOVZX when optimising for size }
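                { Illustrative size note (assumed encodings, not measured here):
                  "set<c> %al; movzbl %al,%eax" is typically 3+3 bytes, while
                  "movl $0,%eax; set<c> %al" is 5+3, so MOVZX wins under -Os }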
                case taicpu(hp2).opsize of
                  S_W:
                    NewSize := S_BW;
                  S_L:
                    NewSize := S_BL;
{$ifdef x86_64}
                  S_Q:
                    begin
                      NewSize := S_BL;
                      { Will implicitly zero-extend to 64-bit }
                      setsubreg(SecondReg, R_SUBD);
                    end;
{$endif x86_64}
                  else
                    InternalError(2022101301);
                end;
                hp5:=taicpu.op_reg_reg(A_MOVZX, NewSize, ThisReg, SecondReg);
                { Inserting it right before p will guarantee that the flags are also tracked }
                Asml.InsertBefore(hp5, p);
                { Make sure the SET instruction gets inserted before the MOVZX instruction }
                hp_pos := hp5;
                hp_allocstart := hp4;
              end
            else
              begin
                hp5:=taicpu.op_const_reg(A_MOV, taicpu(hp2).opsize, 0, SecondReg);
                { Inserting it right before p will guarantee that the flags are also tracked }
                Asml.InsertBefore(hp5, p);
                hp_pos := p;
                hp_allocstart := hp5;
              end;
            taicpu(hp5).fileinfo:=taicpu(p).fileinfo;
          end;
        taicpu(hp4).fileinfo := taicpu(p).fileinfo;
        taicpu(hp4).condition := taicpu(p).condition;
        asml.InsertBefore(hp4, hp_pos);
        if taicpu(hp3).is_jmp then
          begin
            JumpLoc.decrefs;
            MakeUnconditional(taicpu(p));
            { This also increases the reference count }
            taicpu(p).loadref(0, JumpTargetOp(taicpu(hp3))^.ref^);
          end
        else
          ConvertJumpToRET(p, hp3);
        if SecondReg <> NR_NO then
          { Ensure the destination register is allocated over this region }
          AllocRegBetween(SecondReg, hp_allocstart, p, UsedRegs);
        if (JumpLoc.getrefs = 0) then
          RemoveDeadCodeAfterJump(hp3);
        Result:=true;
        exit;
      end;
  end;

function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  var
    hp1, hp2: tai;
    ActiveReg: TRegister;
    OldOffset: asizeint;
    ThisConst: TCGInt;

  function RegDeallocated: Boolean;
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
    end;

  begin
    Result:=false;
    hp1 := nil;
    { replace
        subX    const,%reg1
        leaX    (%reg1,%reg1,Y),%reg2   // Base or index might not be equal to reg1
        dealloc %reg1
      by
        leaX    -const-const*Y(%reg1,%reg1,Y),%reg2
    }
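    { Illustrative instance (sketch): "subl $4,%eax; leal (%eax,%eax,2),%edx"
      becomes "leal -12(%eax,%eax,2),%edx" - the constant is folded into the
      offset once for the base (-4) and once more scaled for the index (-4*2) }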
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        ActiveReg := taicpu(p).oper[1]^.reg;
        { Ensures the entire register was updated }
        if (taicpu(p).opsize >= S_L) and
          GetNextInstructionUsingReg(p,hp1, ActiveReg) and
          MatchInstruction(hp1,A_LEA,[]) and
          (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
           SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
          (
            { Cover the case where the register in the reference is also the destination register }
            Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
            (
              { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
              not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
              RegDeallocated
            )
          ) then
          begin
            OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
            if SuperRegistersEqual(ActiveReg,taicpu(hp1).oper[0]^.ref^.base) then
              Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
            if SuperRegistersEqual(ActiveReg,taicpu(hp1).oper[0]^.ref^.index) then
              Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{$ifdef x86_64}
            if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
              begin
                { Overflow; abort }
                taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
              end
            else
{$endif x86_64}
              begin
                DebugMsg(SPeepholeOptimization + 'SubLea2Lea done',p);
                if not (cs_opt_level3 in current_settings.optimizerswitches) then
                  { hp1 is the immediate next instruction for sure - good for a quick speed boost }
                  RemoveCurrentP(p, hp1)
                else
                  RemoveCurrentP(p);
                result:=true;
                Exit;
              end;
          end;

        if (
            { Save calling GetNextInstructionUsingReg again }
            Assigned(hp1) or
            GetNextInstructionUsingReg(p,hp1, ActiveReg)
          ) and
          MatchInstruction(hp1,A_SUB,[taicpu(p).opsize]) and
          (taicpu(hp1).oper[1]^.reg = ActiveReg) then
          begin
            { Make sure the flags aren't in use by the second operation }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
            if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
              begin
                if (taicpu(hp1).oper[0]^.typ = top_const) then
                  begin
                    { Merge "sub const1,%reg; sub const2,%reg" into "sub const1+const2,%reg" }
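                    { e.g. "subl $4,%eax; subl $8,%eax" collapses into
                      "subl $12,%eax" (illustrative) }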
                    ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
                    Result := True;

                    { Handle any overflows }
                    case taicpu(p).opsize of
                      S_B:
                        taicpu(p).oper[0]^.val := ThisConst and $FF;
                      S_W:
                        taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                      S_L:
                        taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                      S_Q:
                        if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                          { Overflow; abort }
                          Result := False
                        else
                          taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                      else
                        InternalError(2021102611);
                    end;

                    { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                    if Result then
                      begin
                        if (taicpu(p).oper[0]^.val < 0) and
                          (
                            ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                            ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                            ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                          ) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> ADD',p);
                            taicpu(p).opcode := A_ADD;
                            taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                          end
                        else
                          DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> SUB',p);
                        RemoveInstruction(hp1);
                      end;
                  end
                else
                  begin
                    { Move the constant subtraction to after the reg/ref subtraction to improve optimisation }
                    DebugMsg(SPeepholeOptimization + 'Add/sub swap 1b done',p);
                    Asml.Remove(p);
                    Asml.InsertAfter(p, hp1);
                    p := hp1;
                    Result := True;
                    Exit;
                  end;
              end;
          end;
        { * change "subl $2, %esp; pushw x" to "pushl x" }
        { * change "sub/add const1, reg" or "dec reg" followed by
            "sub const2, reg" to one "sub ..., reg" }
{$ifdef i386}
        if (taicpu(p).oper[0]^.val = 2) and
          (ActiveReg = NR_ESP) and
          { Don't do the sub/push optimization if the sub }
          { comes from setting up the stack frame (JM)    }
          (not(GetLastInstruction(p,hp1)) or
           not(MatchInstruction(hp1,A_MOV,[S_L]) and
               MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
               MatchOperand(taicpu(hp1).oper[1]^,NR_EBP))) then
          begin
            hp1 := tai(p.next);
            while Assigned(hp1) and
              (tai(hp1).typ in [ait_instruction]+SkipInstr) and
              not RegReadByInstruction(NR_ESP,hp1) and
              not RegModifiedByInstruction(NR_ESP,hp1) do
              hp1 := tai(hp1.next);
            if Assigned(hp1) and
              MatchInstruction(hp1,A_PUSH,[S_W]) then
              begin
                taicpu(hp1).changeopsize(S_L);
                if taicpu(hp1).oper[0]^.typ=top_reg then
                  setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
                hp1 := tai(p.next);
                RemoveCurrentp(p, hp1);
                Result:=true;
                exit;
              end;
          end;
{$endif i386}

        if DoArithCombineOpt(p) then
          Result:=true;
      end;
  end;

function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
  var
    TmpBool1,TmpBool2 : Boolean;
    tmpref : treference;
    hp1,hp2: tai;
    mask, shiftval: tcgint;
  begin
    Result:=false;

    { All these optimisations work on "shl/sal const,%reg" }
    if not MatchOpType(taicpu(p),top_const,top_reg) then
      Exit;

    if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
      (taicpu(p).oper[0]^.val <= 3) then
      { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
      begin
        { should we check the next instruction? }
        TmpBool1 := True;
        { have we found an add/sub which could be
          integrated in the lea? }
        TmpBool2 := False;
        reference_reset(tmpref,2,[]);
        TmpRef.index := taicpu(p).oper[1]^.reg;
        TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
        while TmpBool1 and
          GetNextInstruction(p, hp1) and
          (tai(hp1).typ = ait_instruction) and
          ((((taicpu(hp1).opcode = A_ADD) or
             (taicpu(hp1).opcode = A_SUB)) and
            (taicpu(hp1).oper[1]^.typ = Top_Reg) and
            (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
           (((taicpu(hp1).opcode = A_INC) or
             (taicpu(hp1).opcode = A_DEC)) and
            (taicpu(hp1).oper[0]^.typ = Top_Reg) and
            (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
           ((taicpu(hp1).opcode = A_LEA) and
            (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
            (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
          (not GetNextInstruction(hp1,hp2) or
           not instrReadsFlags(hp2)) Do
          begin
            TmpBool1 := False;
            if taicpu(hp1).opcode=A_LEA then
              begin
                if (TmpRef.base = NR_NO) and
                  (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                  (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                  { Segment register isn't a concern here }
                  ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                   (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                  begin
                    TmpBool1 := True;
                    TmpBool2 := True;
                    inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                    if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                      tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                    TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                    RemoveInstruction(hp1);
                  end
              end
            else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
              begin
                TmpBool1 := True;
                TmpBool2 := True;
                case taicpu(hp1).opcode of
                  A_ADD:
                    inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  A_SUB:
                    dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  else
                    internalerror(2019050536);
                end;
                RemoveInstruction(hp1);
              end
            else
              if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                (((taicpu(hp1).opcode = A_ADD) and
                  (TmpRef.base = NR_NO)) or
                 (taicpu(hp1).opcode = A_INC) or
                 (taicpu(hp1).opcode = A_DEC)) then
                begin
                  TmpBool1 := True;
                  TmpBool2 := True;
                  case taicpu(hp1).opcode of
                    A_ADD:
                      TmpRef.base := taicpu(hp1).oper[0]^.reg;
                    A_INC:
                      inc(TmpRef.offset);
                    A_DEC:
                      dec(TmpRef.offset);
                    else
                      internalerror(2019050535);
                  end;
                  RemoveInstruction(hp1);
                end;
          end;
        if TmpBool2
{$ifndef x86_64}
          or
          ((current_settings.optimizecputype < cpu_Pentium2) and
           (taicpu(p).oper[0]^.val <= 3) and
           not(cs_opt_size in current_settings.optimizerswitches))
{$endif x86_64}
          then
          begin
            if not(TmpBool2) and
              (taicpu(p).oper[0]^.val=1) then
              begin
                taicpu(p).opcode := A_ADD;
                taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
              end
            else
              begin
                taicpu(p).opcode := A_LEA;
                taicpu(p).loadref(0, TmpRef);
              end;
            DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
            Result := True;
          end;
      end
{$ifndef x86_64}
    else if (current_settings.optimizecputype < cpu_Pentium2) then
      begin
        { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
          but faster on a 486, and pairable in both U and V pipes on the Pentium
          (unlike shl, which is only pairable in the U pipe) }
        if taicpu(p).oper[0]^.val=1 then
          begin
            taicpu(p).opcode := A_ADD;
            taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
            Result := True;
          end
        { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
          and     "shl $3, %reg" to "lea (,%reg,8), %reg" }
        else if (taicpu(p).opsize = S_L) and
          (taicpu(p).oper[0]^.val<= 3) then
          begin
            reference_reset(tmpref,2,[]);
            TmpRef.index := taicpu(p).oper[1]^.reg;
            TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
            taicpu(p).opcode := A_LEA;
            taicpu(p).loadref(0, TmpRef);
            Result := True;
          end;
      end
{$endif x86_64}
    else if
      GetNextInstruction(p, hp1) and
      (hp1.typ = ait_instruction) and
      MatchOpType(taicpu(hp1), top_const, top_reg) and
      (
        (
          MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
          SetAndTest(hp1, hp2)
{$ifdef x86_64}
        ) or
        (
          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
          MatchOpType(taicpu(hp2), top_reg, top_reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
{$endif x86_64}
        )
      ) and
      (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
      begin
        { Change:
            shl x, %reg1
            mov -(1<<x), %reg2
            and %reg2, %reg1
          Or:
            shl x, %reg1
            and -(1<<x), %reg1
          To just:
            shl x, %reg1
          Since the and operation only zeroes bits that are already zero from the shl operation
        }
        case taicpu(p).oper[0]^.val of
          8:
            mask:=$FFFFFFFFFFFFFF00;
          16:
            mask:=$FFFFFFFFFFFF0000;
          32:
            mask:=$FFFFFFFF00000000;
          63:
            { Constant pre-calculated to prevent overflow errors with Int64 }
            mask:=$8000000000000000;
          else
            begin
              if taicpu(p).oper[0]^.val >= 64 then
                { Shouldn't happen realistically, since the register
                  is guaranteed to be set to zero at this point }
                mask := 0
              else
                mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
            end;
        end;

        if taicpu(hp1).oper[0]^.val = mask then
          begin
            { Everything checks out, perform the optimisation, as long as
              the FLAGS register isn't being used }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
{$ifdef x86_64}
            if (hp1 <> hp2) then
              begin
                { "shl/mov/and" version }
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                { Don't do the optimisation if the FLAGS register is in use }
                if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
                    { Don't remove the 'mov' instruction if its register is used elsewhere }
                    if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
                      begin
                        RemoveInstruction(hp1);
                        Result := True;
                      end;
                    { Only set Result to True if the 'mov' instruction was removed }
                    RemoveInstruction(hp2);
                  end;
              end
            else
{$endif x86_64}
              begin
                { "shl/and" version }
                { Don't do the optimisation if the FLAGS register is in use }
                if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
                    RemoveInstruction(hp1);
                    Result := True;
                  end;
              end;
            Exit;
          end
        else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
          begin
            { Even if the mask doesn't allow for its removal, we might be
              able to optimise the mask for the "shl/and" version, which
              may permit other peephole optimisations }
{$ifdef DEBUG_AOPTCPU}
            mask := taicpu(hp1).oper[0]^.val and mask;
            if taicpu(hp1).oper[0]^.val <> mask then
              begin
                DebugMsg(
                  SPeepholeOptimization +
                  'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
                  ' to $' + debug_tostr(mask) +
                  ' based on previous instruction (ShlAnd2ShlAnd)', hp1);
                taicpu(hp1).oper[0]^.val := mask;
              end;
{$else DEBUG_AOPTCPU}
            { If debugging is off, just set the operand even if it's the same }
            taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
{$endif DEBUG_AOPTCPU}
          end;
      end;

    {
      change
        shl/sal const,reg
        <op>    ...(...,reg,1),...
      into
        <op>    ...(...,reg,1 shl const),...
      if const in 1..3
    }
    if MatchOpType(taicpu(p), top_const, top_reg) and
      (taicpu(p).oper[0]^.val in [1..3]) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
      ((MatchInstruction(hp1,A_MOV,A_LEA,[]) and
        MatchOpType(taicpu(hp1),top_ref,top_reg)) or
       (MatchInstruction(hp1,A_FST,A_FSTP,A_FLD,[]) and
        MatchOpType(taicpu(hp1),top_ref))
      ) and
      (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[0]^.ref^.index) and
      (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^.ref^.base) and
      (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
          begin
            taicpu(hp1).oper[0]^.ref^.scalefactor:=1 shl taicpu(p).oper[0]^.val;
            DebugMsg(SPeepholeOptimization + 'ShlOp2Op', p);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end;
      end;

    if MatchOpType(taicpu(p), top_const, top_reg) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
      MatchInstruction(hp1,A_SHL,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp1),top_const,top_reg) and
      (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[1]^.reg) then
      begin
        shiftval:=taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val;
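        { e.g. "shlb $5,%al; shlb $4,%al" would shift out every bit, so the
          pair degenerates to "movb $0,%al" below; otherwise the two counts
          simply add up into a single shift (illustrative) }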
        if ((taicpu(p).opsize=S_B) and (shiftval>7)) or
          ((taicpu(p).opsize=S_W) and (shiftval>15)) or
{$ifdef x86_64}
          ((taicpu(p).opsize=S_Q) and (shiftval>63)) or
{$endif x86_64}
          ((taicpu(p).opsize=S_L) and (shiftval>31)) then
          begin
            DebugMsg(SPeepholeOptimization + 'ShlShl2Mov', p);
            taicpu(hp1).opcode:=A_MOV;
            taicpu(hp1).oper[0]^.val:=0;
          end
        else
          begin
            DebugMsg(SPeepholeOptimization + 'ShlShl2Shl', p);
            taicpu(hp1).oper[0]^.val:=shiftval;
          end;
        RemoveCurrentP(p);
        Result:=true;
        exit;
      end;
  end;

class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
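  { e.g. after "shrl $24,%reg", only bits 0..7 can still be set, so a
    following zero-extension from the low byte or word no longer changes
    the value; every accepted (shr_size, movz_size, Shift) combination
    below follows the same reasoning (sketch) }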
  begin
    case shr_size of
      S_B:
        { No valid combinations }
        Result := False;
      S_W:
        Result := (Shift >= 8) and (movz_size = S_BW);
      S_L:
        Result :=
          (Shift >= 24) { Any opsize is valid for this shift } or
          ((Shift >= 16) and (movz_size = S_WL));
{$ifdef x86_64}
      S_Q:
        Result :=
          (Shift >= 56) { Any opsize is valid for this shift } or
          ((Shift >= 48) and (movz_size = S_WL));
{$endif x86_64}
      else
        InternalError(2022081510);
    end;
  end;

function TX86AsmOptimizer.HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
  var
    hp1, hp2: tai;
    IdentityMask, Shift: TCGInt;
    LimitSize: Topsize;
    DoNotMerge: Boolean;
  begin
    if not MatchInstruction(p, A_SHR, []) then
      InternalError(2025040301);

    Result := False;
    DoNotMerge := False;
    Shift := taicpu(p).oper[0]^.val;
    LimitSize := taicpu(p).opsize;

    hp1 := p;

    repeat
      if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
        Exit;

      case taicpu(hp1).opcode of
        A_AND:
          { Detect:
              shr x, %reg
              and y, %reg
            If "and y, %reg" doesn't actually change the value of %reg
            (e.g. with "shrl $24,%reg; andl $255,%reg"), remove the AND
            instruction. (Post-peephole only)
          }
          if PostPeephole and
            (taicpu(hp1).opsize = taicpu(p).opsize) and
            MatchOpType(taicpu(hp1), top_const, top_reg) and
            (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
            begin
              { Make sure the FLAGS register isn't in use }
              TransferUsedRegs(TmpUsedRegs);
              hp2 := p;
              repeat
                UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
              until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
              if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
                begin
                  { Generate the identity mask }
                  case taicpu(p).opsize of
                    S_B:
                      IdentityMask := $FF shr Shift;
                    S_W:
                      IdentityMask := $FFFF shr Shift;
                    S_L:
                      IdentityMask := $FFFFFFFF shr Shift;
{$ifdef x86_64}
                    S_Q:
                      { We need to force the operands to be unsigned 64-bit
                        integers otherwise the wrong value is generated }
                      IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
{$endif x86_64}
                    else
                      InternalError(2022081501);
                  end;
                  if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
                    begin
                      DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
                      { All the possible 1 bits are covered, so we can remove the AND }
                      hp2 := tai(hp1.Previous);
                      RemoveInstruction(hp1);
                      { p wasn't actually changed, so don't set Result to True,
                        but a change was nonetheless made elsewhere }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      { Do another pass in case other AND or MOVZX instructions
                        follow }
                      hp1 := hp2;
                      Continue;
                    end;
                end;
            end;
        A_TEST, A_CMP:
          { Skip over relevant comparisons, but shift instructions must no
            longer be merged past this point, since the original value is
            being read }
          begin
            DoNotMerge := True;
            Continue;
          end;
        A_Jcc:
          { Skip over conditional jumps and relevant comparisons }
          Continue;
        A_MOVZX:
          if MatchOpType(taicpu(hp1), top_reg, top_reg) and
            SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
            begin
              { Since the original register is being read as is, subsequent
                SHRs must not be merged at this point }
              DoNotMerge := True;
              if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
                begin
                  if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
                    begin
                      { If the MOVZX instruction reads and writes the same register,
                        defer this to the post-peephole optimisation stage }
                      if PostPeephole then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
                          { All the possible 1 bits are covered, so we can remove the MOVZX }
                          hp2 := tai(hp1.Previous);
                          RemoveInstruction(hp1);
                          hp1 := hp2;
                        end;
                    end
                  else { Different register target }
                    begin
                      DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
                      taicpu(hp1).opcode := A_MOV;
                      setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
                      case taicpu(hp1).opsize of
                        S_BW:
                          taicpu(hp1).opsize := S_W;
                        S_BL, S_WL:
                          taicpu(hp1).opsize := S_L;
                        else
                          InternalError(2022081503);
                      end;
                      { p itself hasn't changed, so no need to set Result to True }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      { See if there's anything afterwards that can be
                        optimised, since the input register hasn't changed }
                      Continue;
                    end;
                  Exit;
                end
              else if PostPeephole and
                (Shift > 0) and
                (taicpu(p).opsize = S_W) and
                (taicpu(hp1).opsize = S_WL) and
                (taicpu(hp1).oper[0]^.reg = NR_AX) and
                (taicpu(hp1).oper[1]^.reg = NR_EAX) then
                begin
                  { Detect:
                      shr    x, %ax    (x > 0)
                      ...
                      movzwl %ax,%eax

                    Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
                    But first, check to see if movzwl %ax,%eax can be removed... }
                  hp2 := tai(hp1.Previous);
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegsBetween(UsedRegs, p, hp1);
                  if PostPeepholeOptMovZX(hp1) then
                    hp1 := hp2
                  else
                    begin
                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
                      taicpu(hp1).opcode := A_CWDE;
                      taicpu(hp1).clearop(0);
                      taicpu(hp1).clearop(1);
                      taicpu(hp1).ops := 0;
                    end;
                  RestoreUsedRegs(TmpUsedRegs);
                  { Don't need to set aoc_ForceNewIteration if
                    PostPeepholeOptMovZX returned True because it's the
                    post-peephole stage }
                end;
              { Move onto the next instruction }
              Continue;
            end;
        A_SHL, A_SAL, A_SHR:
          if (taicpu(hp1).opsize <= LimitSize) and
            MatchOpType(taicpu(hp1), top_const, top_reg) and
            SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
            begin
              { Make sure the sizes don't exceed the register size limit
                (measured by the shift value falling below the limit) }
              if taicpu(hp1).opsize < LimitSize then
                LimitSize := taicpu(hp1).opsize;
              if taicpu(hp1).opcode = A_SHR then
                Inc(Shift, taicpu(hp1).oper[0]^.val)
              else
                begin
                  Dec(Shift, taicpu(hp1).oper[0]^.val);
                  DoNotMerge := True;
                end;

              if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
                Exit;

              { Since we've established that the combined shift is within
                limits, we can actually combine the adjacent SHR
                instructions even if they're different sizes }
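              { e.g. "shrl $3,%reg" followed later by "shrl $2,%reg" collapses
                into a single "shrl $5,%reg" (illustrative) }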
              if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
                begin
                  hp2 := tai(hp1.Previous);
                  DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
                  Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
                  RemoveInstruction(hp1);
                  hp1 := hp2;
                  { Though p has changed, only the constant has, and its
                    effects can still be detected on the next iteration of
                    the repeat..until loop }
                  Include(OptsToCheck, aoc_ForceNewIteration);
                end;
              { Move onto the next instruction }
              Continue;
            end;
        else
          ;
      end;

      { If the register isn't actually modified, move onto the next instruction,
        but set DoNotMerge to True since the register is being read }
      if (
          { Under -O2 and below, GetNextInstructionUsingReg only returns
            the next instruction, whether or not it contains the register }
          (cs_opt_level3 in current_settings.optimizerswitches) or
          RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
        ) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
        begin
          DoNotMerge := True;
          Continue;
        end;

      Break;
    until False;
  end;

function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
  begin
    Result := False;

    { All these optimisations work on "shr const,%reg" }
    if not MatchOpType(taicpu(p), top_const, top_reg) then
      Exit;

    Result := HandleSHRMerge(p, False);
  end;

function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
  var
    CurrentRef: TReference;
    FullReg: TRegister;
    hp1, hp2: tai;
  begin
    Result := False;

    if (first_mov.opsize <> S_B) or (second_mov.opsize <> S_B) then
      Exit;
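    { Illustrative pattern (a sketch of the intent): four byte-wide writes
      to consecutive, 4-byte-aligned addresses, such as
          movb %al,(ref)          movb $x,(ref)
          movb $0,1(ref)    or    movb $0,1(ref)
          movb $0,2(ref)          movb $0,2(ref)
          movb $0,3(ref)          movb $0,3(ref)
      which the code below merges into "movzbl %al,%reg32; movl %reg32,(ref)"
      or a single "movl $x,(ref)"; a byte pair followed by a word-sized zero
      write is handled the same way via the S_W case }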
    { We assume you've checked if the operand is actually a reference by
      this point. If it isn't, you'll most likely get an access violation }
    CurrentRef := first_mov.oper[1]^.ref^;

    { Memory must be aligned }
    if (CurrentRef.offset mod 4) <> 0 then
      Exit;

    Inc(CurrentRef.offset);
    CurrentRef.alignment := 1; { Otherwise references_equal will return False }

    if MatchOperand(second_mov.oper[0]^, 0) and
      references_equal(second_mov.oper[1]^.ref^, CurrentRef) and
      GetNextInstruction(second_mov, hp1) and
      (hp1.typ = ait_instruction) and
      (taicpu(hp1).opcode = A_MOV) and
      MatchOpType(taicpu(hp1), top_const, top_ref) and
      (taicpu(hp1).oper[0]^.val = 0) then
      begin
        Inc(CurrentRef.offset);
        CurrentRef.alignment := taicpu(hp1).oper[1]^.ref^.alignment; { Otherwise references_equal might return False }
        FullReg := newreg(R_INTREGISTER,getsupreg(first_mov.oper[0]^.reg), R_SUBD);
        if references_equal(taicpu(hp1).oper[1]^.ref^, CurrentRef) then
          begin
            case taicpu(hp1).opsize of
              S_B:
                if GetNextInstruction(hp1, hp2) and
                  MatchInstruction(taicpu(hp2), A_MOV, [S_B]) and
                  MatchOpType(taicpu(hp2), top_const, top_ref) and
                  (taicpu(hp2).oper[0]^.val = 0) then
                  begin
                    Inc(CurrentRef.offset);
                    CurrentRef.alignment := 1; { Otherwise references_equal will return False }
                    if references_equal(taicpu(hp2).oper[1]^.ref^, CurrentRef) and
                      (taicpu(hp2).opsize = S_B) then
                      begin
                        RemoveInstruction(hp1);
                        RemoveInstruction(hp2);
                        first_mov.opsize := S_L;
                        if first_mov.oper[0]^.typ = top_reg then
                          begin
                            DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVZX/MOVl', first_mov);
                            { Reuse second_mov as a MOVZX instruction }
                            second_mov.opcode := A_MOVZX;
                            second_mov.opsize := S_BL;
                            second_mov.loadreg(0, first_mov.oper[0]^.reg);
                            second_mov.loadreg(1, FullReg);
                            first_mov.oper[0]^.reg := FullReg;
                            asml.Remove(second_mov);
                            asml.InsertBefore(second_mov, first_mov);
                          end
                        else
                          { It's a value }
                          begin
                            DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVl', first_mov);
                            RemoveInstruction(second_mov);
                          end;
                        Result := True;
                        Exit;
                      end;
                  end;
              S_W:
                begin
                  RemoveInstruction(hp1);
                  first_mov.opsize := S_L;
                  if first_mov.oper[0]^.typ = top_reg then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVZX/MOVl', first_mov);
                      { Reuse second_mov as a MOVZX instruction }
                      second_mov.opcode := A_MOVZX;
                      second_mov.opsize := S_BL;
                      second_mov.loadreg(0, first_mov.oper[0]^.reg);
                      second_mov.loadreg(1, FullReg);
                      first_mov.oper[0]^.reg := FullReg;
                      asml.Remove(second_mov);
                      asml.InsertBefore(second_mov, first_mov);
                    end
                  else
                    { It's a value }
                    begin
                      DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVl', first_mov);
                      RemoveInstruction(second_mov);
                    end;
                  Result := True;
                  Exit;
                end;
              else
                ;
            end;
          end;
      end;
  end;

function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
  { returns true if a "continue" should be done after this optimization }
  var
    hp1, hp2, hp3: tai;
  begin
    Result := false;
    hp3 := nil;

    if MatchOpType(taicpu(p),top_ref) and
      GetNextInstruction(p, hp1) and
      (hp1.typ = ait_instruction) and
      (((taicpu(hp1).opcode = A_FLD) and
        (taicpu(p).opcode = A_FSTP)) or
       ((taicpu(p).opcode = A_FISTP) and
        (taicpu(hp1).opcode = A_FILD))) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).opsize = taicpu(p).opsize) and
      RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
      begin
        { replacing fstp f;fld f by fst f is only valid for extended because of rounding or if fastmath is on }
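        { e.g. with a double: "fstpl x; fldl x" rounds the 80-bit st0 down to
          64 bits, so the reloaded value can differ from the unstored one;
          only an extended (S_FX) spill round-trips losslessly (illustrative) }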
        if ((taicpu(p).opsize=S_FX) or (cs_opt_fastmath in current_settings.optimizerswitches)) and
          GetNextInstruction(hp1, hp2) and
          (((hp2.typ = ait_instruction) and
            IsExitCode(hp2) and
            (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
            not(assigned(current_procinfo.procdef.funcretsym) and
                (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
            (taicpu(p).oper[0]^.ref^.index = NR_NO)) or
           { fstp <temp>
             fld  <temp>
             <dealloc> <temp>
           }
           ((taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
            (taicpu(p).oper[0]^.ref^.index = NR_NO) and
            SetAndTest(FindTempDeAlloc(taicpu(p).oper[0]^.ref^.offset,tai(hp1.next)),hp2) and
            (tai_tempalloc(hp2).temppos=taicpu(p).oper[0]^.ref^.offset) and
            (((taicpu(p).opsize=S_FX) and (tai_tempalloc(hp2).tempsize=16)) or
             ((taicpu(p).opsize in [S_IQ,S_FL]) and (tai_tempalloc(hp2).tempsize=8)) or
             ((taicpu(p).opsize=S_FS) and (tai_tempalloc(hp2).tempsize=4))
            )
           )
          ) then
          begin
            DebugMsg(SPeepholeOptimization + 'FstpFld2<Nop>',p);
            RemoveInstruction(hp1);
            RemoveCurrentP(p, hp2);
            { first case: exit code }
            if hp2.typ = ait_instruction then
              RemoveLastDeallocForFuncRes(p);
            Result := true;
          end
        else
          { we can do this only in fast math mode as fstp is rounding ...
            ... still disabled as it breaks the compiler and/or rtl }
          if { (cs_opt_fastmath in current_settings.optimizerswitches) or }
            { ... or if another fstp equal to the first one follows }
            GetNextInstruction(hp1,hp2) and
            (hp2.typ = ait_instruction) and
            (taicpu(p).opcode=taicpu(hp2).opcode) and
            (taicpu(p).opsize=taicpu(hp2).opsize) then
            begin
              if (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
                (taicpu(p).oper[0]^.ref^.index = NR_NO) and
                SetAndTest(FindTempDeAlloc(taicpu(p).oper[0]^.ref^.offset,tai(hp2.next)),hp3) and
                MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
                (tai_tempalloc(hp3).temppos=taicpu(p).oper[0]^.ref^.offset) and
                (((taicpu(p).opsize=S_FX) and (tai_tempalloc(hp3).tempsize=16)) or
                 ((taicpu(p).opsize in [S_IQ,S_FL]) and (tai_tempalloc(hp3).tempsize=8)) or
                 ((taicpu(p).opsize=S_FS) and (tai_tempalloc(hp3).tempsize=4))
                ) then
                begin
                  DebugMsg(SPeepholeOptimization + 'FstpFldFstp2Fstp',p);
                  RemoveCurrentP(p,hp2);
                  RemoveInstruction(hp1);
                  Result := true;
                end
              else if { fst can't store an extended/comp value }
                (taicpu(p).opsize <> S_FX) and
                (taicpu(p).opsize <> S_IQ) then
                begin
                  if (taicpu(p).opcode = A_FSTP) then
                    taicpu(p).opcode := A_FST
                  else
                    taicpu(p).opcode := A_FIST;
                  DebugMsg(SPeepholeOptimization + 'FstpFld2Fst',p);
                  RemoveInstruction(hp1);
                  Result := true;
                end;
            end;
      end;
  end;

function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
  var
    hp1, hp2, hp3: tai;
  begin
    result:=false;

    if MatchOpType(taicpu(p),top_reg) and
      GetNextInstruction(p, hp1) and
      (hp1.typ = Ait_Instruction) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[0]^.reg = NR_ST) and
      (taicpu(hp1).oper[1]^.reg = NR_ST1) then
      { change                    to
          fld   reg                 fxxx reg,st
          fxxxp st, st1 (hp1)
        Remark: non commutative operations must be reversed!
      }
      begin
        case taicpu(hp1).opcode Of
          A_FMULP,A_FADDP,
          A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
            begin
              case taicpu(hp1).opcode Of
                A_FADDP: taicpu(hp1).opcode := A_FADD;
                A_FMULP: taicpu(hp1).opcode := A_FMUL;
                A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
                A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
                A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
                A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
                else
                  internalerror(2019050534);
              end;
              taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
              taicpu(hp1).oper[1]^.reg := NR_ST;
              DebugMsg(SPeepholeOptimization + 'FldF*p2F*',hp1);
              RemoveCurrentP(p, hp1);
              Result:=true;
              exit;
            end;
          else
            ;
        end;
      end
    else if MatchOpType(taicpu(p),top_ref) and
      GetNextInstruction(p, hp2) and
      (hp2.typ = Ait_Instruction) and
      MatchOpType(taicpu(hp2),top_reg,top_reg) and
      (taicpu(p).opsize in [S_FS, S_FL]) and
      (taicpu(hp2).oper[0]^.reg = NR_ST) and
      (taicpu(hp2).oper[1]^.reg = NR_ST1) then
      if GetLastInstruction(p, hp1) and
        MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp1),top_ref) and
        RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
        if ((taicpu(hp2).opcode = A_FMULP) or
            (taicpu(hp2).opcode = A_FADDP)) then
          { change                         to
              fld/fst     mem1 (hp1)         fld/fst   mem1
              fld         mem1 (p)           fadd/fmul st, st
              faddp/fmulp st, st1 (hp2)
          }
          begin
            DebugMsg(SPeepholeOptimization + 'Fld/FstFldFaddp/Fmulp2Fld/FstFadd/Fmul',hp1);
            RemoveCurrentP(p, hp1);
            if (taicpu(hp2).opcode = A_FADDP) then
              taicpu(hp2).opcode := A_FADD
            else
              taicpu(hp2).opcode := A_FMUL;
            taicpu(hp2).oper[1]^.reg := NR_ST;
          end
        else
          { change                to
              fld/fst mem1 (hp1)    fld/fst mem1
              fld     mem1 (p)      fld     st
          }
          begin
            DebugMsg(SPeepholeOptimization + 'Fld/Fst<mem>Fld<mem>2Fld/Fst<mem>Fld<reg>',hp1);
            taicpu(p).changeopsize(S_FL);
            taicpu(p).loadreg(0,NR_ST);
          end
      else
        begin
          case taicpu(hp2).opcode Of
            A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
              { change                  to
                  fld/fst mem1 (hp1)      fld/fst mem1
                  fld     mem2 (p)        fxxx    mem2
                  fxxxp   st, st1 (hp2)
              }
              begin
                case taicpu(hp2).opcode Of
                  A_FADDP: taicpu(p).opcode := A_FADD;
                  A_FMULP: taicpu(p).opcode := A_FMUL;
                  A_FSUBP: taicpu(p).opcode := A_FSUBR;
                  A_FSUBRP: taicpu(p).opcode := A_FSUB;
                  A_FDIVP: taicpu(p).opcode := A_FDIVR;
                  A_FDIVRP: taicpu(p).opcode := A_FDIV;
                  else
                    internalerror(2019050533);
                end;
                DebugMsg(SPeepholeOptimization + 'Fld/FstFldF*2Fld/FstF*',p);
                RemoveInstruction(hp2);
              end
            else
              ;
          end
        end
  end;

function IsCmpSubset(cond1, cond2: TAsmCond): Boolean; inline;
  begin
    Result := condition_in(cond1, cond2) or
      { Not strictly subsets due to the actual flags checked, but because we're
        comparing integers, E is a subset of AE and GE and their aliases }
      ((cond1 in [C_E, C_Z]) and (cond2 in [C_AE, C_NB, C_NC, C_GE, C_NL]));
  end;
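
{ e.g. after "cmp $5,%eax", a branch taken under E (eax = 5) would also be
  taken under GE (eax >= 5), which is why E is treated as a subset of AE/GE
  and their aliases above (illustrative) }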

function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
  var
    v: TCGInt;
    true_hp1, hp1, hp2, p_dist, p_jump, hp1_dist, p_label, hp1_label: tai;
    FirstMatch, TempBool: Boolean;
    NewReg: TRegister;
    JumpLabel, JumpLabel_dist, JumpLabel_far: TAsmLabel;
  begin
    Result:=false;

    { All these optimisations need a next instruction }
    if not GetNextInstruction(p, hp1) then
      Exit;

    true_hp1 := hp1;

    { Search for:
        cmp   ###,###
        j(c1) @lbl1
        ...
      @lbl:
        cmp   ###,### (same comparison as above)
        j(c2) @lbl2

      If c1 is a subset of c2, change to:
        cmp   ###,###
        j(c1) @lbl2

      (@lbl1 may become a dead label as a result)
    }

    { Also handle cases where there are multiple jumps in a row }
    p_jump := hp1;
    while Assigned(p_jump) and MatchInstruction(p_jump, A_JCC, []) do
      begin
        Prefetch(p_jump.Next);
        if IsJumpToLabel(taicpu(p_jump)) then
          begin
            { Do jump optimisations first in case the condition becomes
              unnecessary }
            TempBool := True;
            if DoJumpOptimizations(p_jump, TempBool) or
              not TempBool then
              begin
                if Assigned(p_jump) then
                  begin
                    { If CollapseZeroDistJump optimised, p_jump will now be
                      the label (or an align before it) that followed the
                      jump, whether that label is live or dead }
                    if (p_jump.typ = ait_align) or
                      (
                        (p_jump.typ = ait_label) and
                        not (tai_label(p_jump).labsym.is_used)
                      ) then
                      GetNextInstruction(p_jump, p_jump);
                  end;
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                if not Assigned(p_jump) or
                  (
                    not MatchInstruction(p_jump, A_Jcc, A_SETcc, A_CMOVcc, []) and
                    not RegUsedAfterInstruction(NR_DEFAULTFLAGS, p_jump, TmpUsedRegs)
                  ) then
                  begin
                    { No more conditional jumps; conditional statement is no longer required }
                    DebugMsg(SPeepholeOptimization + 'Removed unnecessary condition (Cmp2Nop)', p);
                    RemoveCurrentP(p);
                    Result := True;
                    Exit;
                  end;
                hp1 := p_jump;
                Include(OptsToCheck, aoc_ForceNewIteration);
                Continue;
              end;
            JumpLabel := TAsmLabel(taicpu(p_jump).oper[0]^.ref^.symbol);
            if GetNextInstruction(p_jump, hp2) and
              (
                OptimizeConditionalJump(JumpLabel, p_jump, hp2, TempBool) or
                not TempBool
              ) then
              begin
                hp1 := p_jump;
                Include(OptsToCheck, aoc_ForceNewIteration);
                Continue;
              end;

            p_label := nil;
            if Assigned(JumpLabel) then
              p_label := getlabelwithsym(JumpLabel);

            if Assigned(p_label) and
              GetNextInstruction(p_label, p_dist) and
              MatchInstruction(p_dist, A_CMP, []) and
              MatchOperand(taicpu(p_dist).oper[0]^, taicpu(p).oper[0]^) and
              MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
              GetNextInstruction(p_dist, hp1_dist) and
              MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
              begin
                JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
                if JumpLabel = JumpLabel_dist then
                  { This is an infinite loop }
                  Exit;

                { Best optimisation when the first condition is a subset (or equal) of the second }
                if IsCmpSubset(taicpu(p_jump).condition, taicpu(hp1_dist).condition) then
                  begin
                    { Any registers used here will already be allocated }
                    if Assigned(JumpLabel) then
                      JumpLabel.DecRefs;
                    DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc -> CMP/Jcc, redirecting first jump', p_jump);
                    taicpu(p_jump).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); { This also increases the reference count }
                    Include(OptsToCheck, aoc_ForceNewIteration);
                    { Don't exit yet. Since p and p_jump haven't actually been
                      removed, we can check for more on this iteration }
                  end
                else if IsCmpSubset(taicpu(hp1_dist).condition, inverse_cond(taicpu(p_jump).condition)) and
                  GetNextInstruction(hp1_dist, hp1_label) and
                  (hp1_label.typ = ait_label) then
                  begin
                    JumpLabel_far := tai_label(hp1_label).labsym;
                    if (JumpLabel_far = JumpLabel_dist) or (JumpLabel_far = JumpLabel) then
                      { This is an infinite loop }
                      Exit;
                    if Assigned(JumpLabel_far) then
                      begin
                        { In this situation, if the first jump branches, the
                          second one will never branch, so change the
                          destination label to after the second jump }
                        DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc/@Lbl -> CMP/Jcc, redirecting first jump to 2nd label', p_jump);
                        if Assigned(JumpLabel) then
                          JumpLabel.DecRefs;
                        JumpLabel_far.IncRefs;
                        taicpu(p_jump).oper[0]^.ref^.symbol := JumpLabel_far;
                        Result := True;
                        { Don't exit yet. Since p and p_jump haven't actually been
                          removed, we can check for more on this iteration }
                        Continue;
                      end;
                  end;
              end;
          end;

        { Search for:
            cmp   ###,###
            j(c1) @lbl1
            cmp   ###,### (same as first)
          Remove second cmp
        }
        if GetNextInstruction(p_jump, hp2) and
          (
            (
              MatchInstruction(hp2, A_CMP, [taicpu(p).opsize]) and
              (
                (
                  MatchOpType(taicpu(p), top_const, top_reg) and
                  MatchOpType(taicpu(hp2), top_const, top_reg) and
                  (taicpu(hp2).oper[0]^.val = taicpu(p).oper[0]^.val) and
                  Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
                ) or (
                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
                  MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^)
                )
              )
            ) or (
              { Also match cmp $0,%reg; jcc @lbl; test %reg,%reg }
              MatchOperand(taicpu(p).oper[0]^, 0) and
              (taicpu(p).oper[1]^.typ = top_reg) and
              MatchInstruction(hp2, A_TEST, []) and
              MatchOpType(taicpu(hp2), top_reg, top_reg) and
              (taicpu(hp2).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) and
              Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
            )
          ) then
          begin
            DebugMsg(SPeepholeOptimization + 'CMP/Jcc/CMP; removed superfluous CMP', hp2);
            TransferUsedRegs(TmpUsedRegs);
            AllocRegBetween(NR_DEFAULTFLAGS, p, hp2, TmpUsedRegs);
            RemoveInstruction(hp2);
            Result := True;
            { Continue the while loop in case "Jcc/CMP" follows the second CMP that was just removed }
          end
        else
          begin
            { hp2 is the next instruction, so save time and just set p_jump
              to it instead of calling GetNextInstruction below }
            p_jump := hp2;
            Continue;
          end;

        GetNextInstruction(p_jump, p_jump);
      end;

    if (
        { Don't call GetNextInstruction again if we already have it }
        (true_hp1 = p_jump) or
        GetNextInstruction(p, hp1)
      ) and
      MatchInstruction(hp1, A_Jcc, []) and
      IsJumpToLabel(taicpu(hp1)) and
      (taicpu(hp1).condition in [C_E, C_Z, C_NE, C_NZ]) and
      GetNextInstruction(hp1, hp2) then
      begin
        {
          cmp x, y (or "cmp y, x")
          je  @lbl
          mov x, y
        @lbl:
          (x and y can be constants, registers or references)

          Change to:
          mov x, y (x and y will always be equal in the end)
        @lbl:    (may become a dead label)

          Also:
          cmp x, y (or "cmp y, x")
          jne @lbl
          mov x, y
        @lbl:
          (x and y can be constants, registers or references)

          Change to:
          Absolutely nothing! (Except @lbl if it's still live)
        }
        if MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
          (
            (
              MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[0]^) and
              MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^)
            ) or (
              MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[1]^) and
              MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[0]^)
            )
          ) and
          GetNextInstruction(hp2, hp1_label) and
          (hp1_label.typ = ait_label) and
          (tai_label(hp1_label).labsym = taicpu(hp1).oper[0]^.ref^.symbol) then
          begin
            tai_label(hp1_label).labsym.DecRefs;
            if (taicpu(hp1).condition in [C_NE, C_NZ]) then
              begin
                DebugMsg(SPeepholeOptimization + 'CMP/JNE/MOV/@Lbl -> NOP, since the MOV is only executed if the operands are equal (CmpJneMov2Nop)', p);
                RemoveInstruction(hp2);
                hp2 := hp1_label; { So the RemoveCurrentp call below receives a valid instruction }
              end
            else
              DebugMsg(SPeepholeOptimization + 'CMP/JE/MOV/@Lbl -> MOV, since the MOV is only executed if the operands aren''t equal (CmpJeMov2Mov)', p);
            RemoveInstruction(hp1);
            RemoveCurrentp(p, hp2);
            Result := True;
            Exit;
          end;

        {
          Try to optimise the following:
            cmp  $x,### ($x and $y can be registers or constants)
            je   @lbl1  (only reference)
            cmp  $y,### (### are identical)
          @Lbl:
            sete %reg1

          Change to:
            cmp  $x,###
            sete %reg2  (allocate new %reg2)
            cmp  $y,###
            sete %reg1
            orb  %reg2,%reg1
            (dealloc %reg2)

          This adds an instruction (so don't perform under -Os), but it removes
          a conditional branch.
        }
        if not (cs_opt_size in current_settings.optimizerswitches) and
          MatchInstruction(hp2, A_CMP, A_TEST, [taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^) and
          { The first operand of CMP instructions can only be a register or
            immediate anyway, so no need to check }
          GetNextInstruction(hp2, p_label) and
          (p_label.typ = ait_label) and
          (tai_label(p_label).labsym.getrefs = 1) and
          (JumpTargetOp(taicpu(hp1))^.ref^.symbol = tai_label(p_label).labsym) and
          GetNextInstruction(p_label, p_dist) and
          MatchInstruction(p_dist, A_SETcc, []) and
          (taicpu(p_dist).condition in [C_E, C_Z]) and
          (taicpu(p_dist).oper[0]^.typ = top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
            UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
            UpdateUsedRegs(TmpUsedRegs, tai(p_label.Next));
            UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
            if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
              { Get the instruction after the SETcc instruction so we can
                allocate a new register over the entire range }
              GetNextInstruction(p_dist, hp1_dist) then
              begin
                { Register can appear in p if it's not used afterwards, so only
                  allocate between hp1 and hp1_dist }
                NewReg := GetIntRegisterBetween(R_SUBL, TmpUsedRegs, hp1, hp1_dist);
                if NewReg <> NR_NO then
                  begin
                    DebugMsg(SPeepholeOptimization + 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR, removing conditional branch', p);
                    { Change the jump instruction into a SETcc instruction }
                    taicpu(hp1).opcode := A_SETcc;
                    taicpu(hp1).opsize := S_B;
                    taicpu(hp1).loadreg(0, NewReg);
                    { This is now a dead label }
                    tai_label(p_label).labsym.decrefs;
                    { Prefer adding before the next instruction so the FLAGS
                      register is deallocated first }
                    AsmL.InsertBefore(
                      taicpu.op_reg_reg(A_OR, S_B, NewReg, taicpu(p_dist).oper[0]^.reg),
                      hp1_dist
                    );
                    Result := True;
                    { Don't exit yet, as p wasn't changed and hp1, while
                      modified, is still intact and might be optimised by the
                      SETcc optimisation below }
                  end;
              end;
          end;
      end;

    if (taicpu(p).oper[0]^.typ = top_const) and
      MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
      begin
        if (taicpu(p).oper[0]^.val = 0) and
          (taicpu(p).oper[1]^.typ = top_reg) then
          begin
            hp2 := p;
            FirstMatch := True;
            { When dealing with "cmp $0,%reg", only ZF and SF contain
              anything meaningful once it's converted to "test %reg,%reg";
              additionally, some jumps will always (or never) branch, so
              evaluate every jump immediately following the
              comparison, optimising the conditions if possible.
              Similarly with SETcc... those that are always set to 0 or 1
              are changed to MOV instructions }
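            { e.g. "cmpl $0,%eax; jb @lbl" can never branch, since an unsigned
              value is never below zero, while "jae @lbl" always branches;
              whatever survives is then driven by the shorter
              "testl %eax,%eax" (illustrative) }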
            while FirstMatch or { Saves calling GetNextInstruction unnecessarily }
              (
                GetNextInstruction(hp2, hp1) and
                MatchInstruction(hp1,A_Jcc,A_SETcc,[])
              ) do
              begin
                Prefetch(hp1.Next);
                FirstMatch := False;
                case taicpu(hp1).condition of
                  C_B, C_C, C_NAE, C_O:
                    { For B/NAE:
                        Will never branch since an unsigned integer can never be below zero
                      For C/O:
                        Result cannot overflow because 0 is being subtracted
                    }
                    begin
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                          TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                          RemoveInstruction(hp1);
                          { Since hp1 was deleted, hp2 must not be updated }
                          Continue;
                        end
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                          { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).ops := 2;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 0);
                        end;
                    end;
                  C_BE, C_NA:
                    begin
                      { Will only branch if equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                      taicpu(hp1).condition := C_E;
                    end;
                  C_A, C_NBE:
                    begin
                      { Will only branch if not equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                      taicpu(hp1).condition := C_NE;
                    end;
                  C_AE, C_NB, C_NC, C_NO:
                    begin
                      { Will always branch }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          MakeUnconditional(taicpu(hp1));
                          { Any jumps/set that follow will now be dead code }
                          RemoveDeadCodeAfterJump(taicpu(hp1));
                          Break;
                        end
                      else
                        begin
                          { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).ops := 2;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 1);
                        end;
                    end;
                  C_None:
                    InternalError(2020012201);
  8306. C_P, C_PE, C_NP, C_PO:
  8307. { We can't handle parity checks and they should never be generated
  8308. after a general-purpose CMP (it's used in some floating-point
  8309. comparisons that don't use CMP) }
  8310. InternalError(2020012202);
  8311. else
  8312. { Zero/Equality, Sign, their complements and all of the
  8313. signed comparisons do not need to be converted };
  8314. end;
  8315. hp2 := hp1;
  8316. end;
  8317. { Convert the instruction to a TEST }
  8318. taicpu(p).opcode := A_TEST;
  8319. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  8320. Result := True;
  8321. Exit;
  8322. end
  8323. else
  8324. begin
  8325. TransferUsedRegs(TmpUsedRegs);
  8326. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  8327. if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
  8328. begin
  8329. if (taicpu(p).oper[0]^.val = 1) and
  8330. (taicpu(hp1).condition in [C_L, C_NL, C_NGE, C_GE]) then
  8331. begin
  8332. { Convert; To:
  8333. cmp $1,r/m cmp $0,r/m
  8334. jl @lbl jle @lbl
  8335. (Also do inverted conditions)
  8336. }
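                        { This is safe because, for signed integers, "x < 1"
                          is exactly "x <= 0" (and "x >= 1" is "x > 0") }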
                        DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
                        taicpu(p).oper[0]^.val := 0;
                        if taicpu(hp1).condition in [C_L, C_NGE] then
                          taicpu(hp1).condition := C_LE
                        else
                          taicpu(hp1).condition := C_NLE;
                        { If the instruction is now "cmp $0,%reg", convert it to a
                          TEST (and effectively do the work of the "cmp $0,%reg" in
                          the block above)
                        }
                        if (taicpu(p).oper[1]^.typ = top_reg) then
                          begin
                            taicpu(p).opcode := A_TEST;
                            taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                          end;
                        Result := True;
                        Exit;
                      end
                    else if (taicpu(p).oper[1]^.typ = top_reg)
{$ifdef x86_64}
                      and (taicpu(p).opsize <> S_Q) { S_Q will never happen: cmp with 64 bit constants is not possible }
{$endif x86_64}
                      then
                      begin
                        { cmp register,$8000            neg register
                          je  target              -->   jo  target
                          .... only if register is deallocated before jump. }
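                        { The equivalence holds because NEG only sets the overflow
                          flag when its operand is the minimum signed value ($80,
                          $8000 or $80000000 depending on size), which is precisely
                          the case in which the original CMP would set ZF }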
                        case Taicpu(p).opsize of
                          S_B: v:=$80;
                          S_W: v:=$8000;
                          S_L: v:=qword($80000000);
                          else
                            internalerror(2013112905);
                        end;
                        if (taicpu(p).oper[0]^.val=v) and
                          (Taicpu(hp1).condition in [C_E,C_NE]) then
                          begin
                            TransferUsedRegs(TmpUsedRegs);
                            UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                            if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                              begin
                                DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                                Taicpu(p).opcode:=A_NEG;
                                Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                                Taicpu(p).clearop(1);
                                Taicpu(p).ops:=1;
                                if Taicpu(hp1).condition=C_E then
                                  Taicpu(hp1).condition:=C_O
                                else
                                  Taicpu(hp1).condition:=C_NO;
                                Result:=true;
                                exit;
                              end;
                          end;
                      end;
                  end;
              end;
        end;
      if TrySwapMovCmp(p, hp1) then
        begin
          Result := True;
          Exit;
        end;
    end;


    function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
      var
        hp1: tai;
      begin
        {
          remove the second (v)pxor from
            pxor reg,reg
            ...
            pxor reg,reg
        }
        Result:=false;
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOpType(taicpu(p),top_reg,top_reg) and
          GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
          MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
          MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
            RemoveInstruction(hp1);
            Result:=true;
            Exit;
          end
        {
          replace
            pxor      reg1,reg1
            movapd/s  reg1,reg2
            dealloc   reg1
          by
            pxor      reg2,reg2
        }
        else if GetNextInstruction(p,hp1) and
          { we mix single and double operations here because we assume that the compiler
            generates vmovapd only after double operations and vmovaps only after single operations }
          MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
          (taicpu(p).oper[0]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(0,taicpu(hp1).oper[1]^);
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'PXorMovapd2PXor done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;


    function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
      var
        hp1: tai;
      begin
        {
          remove the second (v)pxor from
            (v)pxor reg,reg
            ...
            (v)pxor reg,reg
        }
        Result:=false;
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
          MatchOpType(taicpu(p),top_reg,top_reg,top_reg) then
          begin
            if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
              MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
              MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
              begin
                DebugMsg(SPeepholeOptimization + 'VPXorVPXor2VPXor done',hp1);
                RemoveInstruction(hp1);
                Result:=true;
                Exit;
              end;
{$ifdef x86_64}
            {
              replace
                vpxor reg1,reg1,reg1
                vmov  reg,mem
              by
                movq  $0,mem
            }
            if GetNextInstruction(p,hp1) and
              MatchInstruction(hp1,A_VMOVSD,[]) and
              MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
              MatchOpType(taicpu(hp1),top_reg,top_ref) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadconst(0,0);
                    taicpu(hp1).opcode:=A_MOV;
                    taicpu(hp1).opsize:=S_Q;
                    DebugMsg(SPeepholeOptimization + 'VPXorVMov2Mov done',p);
                    RemoveCurrentP(p);
                    result:=true;
                    Exit;
                  end;
              end;
{$endif x86_64}
          end
        {
          replace
            vpxor reg1,reg1,reg2
          by
            vpxor reg2,reg2,reg2
          to avoid unnecessary data dependencies
        }
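        { Rationale: after the change the instruction no longer reads reg1 at
          all, and "vpxor reg2,reg2,reg2" is the classic zeroing idiom that
          most modern CPUs treat as dependency-breaking }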
        else if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOpType(taicpu(p),top_reg,top_reg,top_reg) then
          begin
            DebugMsg(SPeepholeOptimization + 'VPXor2VPXor done',p);
            { avoid unnecessary data dependency }
            taicpu(p).loadreg(0,taicpu(p).oper[2]^.reg);
            taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
            result:=true;
            exit;
          end;
        Result:=OptPass1VOP(p);
      end;


    function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            IMul    const,%mreg1,%mreg2
            Mov     %mreg2,%mreg3
            dealloc %mreg2
          by
            Imul    const,%mreg1,%mreg3
        }
        if (taicpu(p).ops=3) and
          GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
          (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'ImulMov2Imul done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;


    function TX86AsmOptimizer.OptPass1SHXX(var p: tai): boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            SHXX    %reg0,%reg1,%reg2
            Mov     %reg2,%reg3
            dealloc %reg2
          by
            SHXX    %reg0,%reg1,%reg3
        }
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
          (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'SHXXMov2SHXX done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;


    function TX86AsmOptimizer.OptPass1_V_Cvtss2sd(var p: tai): boolean;
      var
        hp1: tai;
      begin
        Result:=false;
        { get rid of
            (v)cvtss2sd reg0,<reg1,>reg2
            (v)cvtsd2ss reg2,<reg2,>reg0
        }
        if GetNextInstruction(p,hp1) and
          (((taicpu(p).opcode=A_CVTSS2SD) and MatchInstruction(hp1,A_CVTSD2SS,[taicpu(p).opsize]) and
            MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)) or
           ((taicpu(p).opcode=A_VCVTSS2SD) and MatchInstruction(hp1,A_VCVTSD2SS,[taicpu(p).opsize]) and
            MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
            MatchOpType(taicpu(hp1),top_reg,top_reg,top_reg) and
            (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
            (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
            (getsupreg(taicpu(p).oper[2]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg))
           )
          ) then
          begin
            if ((taicpu(p).opcode=A_CVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
              ((taicpu(p).opcode=A_VCVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg))) then
              begin
                DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Nop done',p);
                RemoveCurrentP(p);
                RemoveInstruction(hp1);
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Vmovaps done',p);
                if taicpu(hp1).opcode=A_CVTSD2SS then
                  begin
                    taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
                    taicpu(p).opcode:=A_MOVAPS;
                  end
                else
                  begin
                    taicpu(p).loadreg(1,taicpu(hp1).oper[2]^.reg);
                    taicpu(p).opcode:=A_VMOVAPS;
                  end;
                taicpu(p).ops:=2;
                RemoveInstruction(hp1);
              end;
            Result:=true;
            Exit;
          end;
      end;


    function TX86AsmOptimizer.OptPass1Jcc(var p : tai) : boolean;
      var
        hp1, hp2, hp3, hp4, hp5: tai;
        ThisReg: TRegister;
      begin
        Result := False;
        if not GetNextInstruction(p,hp1) then
          Exit;
        {
          convert
            j<c>  .L1
            mov   1,reg
            jmp   .L2
          .L1
            mov   0,reg
          .L2
          into
            mov   0,reg
            set<not(c)> reg

          Take care of alignment, and note that the "mov 0,reg" must not be
          converted into a XOR, as this would destroy the flag contents.
          If size is preferred, use MOVZX instead ("mov 0,reg" is bigger, but
          it can be executed at the same time as a previous comparison):
            set<not(c)> reg
            movzx reg, reg
        }
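        { An illustrative instance with hypothetical registers and labels,
          showing the MOVZX form that is used when optimising for size:
              jne   .L1
              movl  $1,%eax               sete   %al
              jmp   .L2            -->    movzbl %al,%eax
          .L1:
              movl  $0,%eax
          .L2:
        }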
        if MatchInstruction(hp1,A_MOV,[]) and
          (taicpu(hp1).oper[0]^.typ = top_const) and
          (
            (
              (taicpu(hp1).oper[1]^.typ = top_reg)
{$ifdef i386}
              { Under i386, ESI, EDI, EBP and ESP
                don't have an 8-bit representation }
              and not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
{$endif i386}
            ) or (
{$ifdef i386}
              (taicpu(hp1).oper[1]^.typ <> top_reg) and
{$endif i386}
              (taicpu(hp1).opsize = S_B)
            )
          ) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
          GetNextInstruction(hp2,hp3) and
          FindLabel(tasmlabel(taicpu(p).oper[0]^.ref^.symbol), hp3) and
          GetNextInstruction(hp3,hp4) and
          MatchInstruction(hp4,A_MOV,[taicpu(hp1).opsize]) and
          (taicpu(hp4).oper[0]^.typ = top_const) and
          (
            ((taicpu(hp1).oper[0]^.val = 0) and (taicpu(hp4).oper[0]^.val = 1)) or
            ((taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0))
          ) and
          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
          GetNextInstruction(hp4,hp5) and
          FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol), hp5) then
          begin
            if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
              taicpu(p).condition := inverse_cond(taicpu(p).condition);
            tai_label(hp3).labsym.DecRefs;
            { If this isn't the only reference to the middle label, we can
              still make a saving - only that the first jump and everything
              that follows will remain. }
            if (tai_label(hp3).labsym.getrefs = 0) then
              begin
                if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c)',p)
                else
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c)',p);
                { remove jump, first label and second MOV (also catching any aligns) }
                repeat
                  if not GetNextInstruction(hp2, hp3) then
                    InternalError(2021040810);
                  RemoveInstruction(hp2);
                  hp2 := hp3;
                until hp2 = hp5;
                { Don't decrement reference count before the removal loop
                  above, otherwise GetNextInstruction won't stop on the
                  label }
                tai_label(hp5).labsym.DecRefs;
              end
            else
              begin
                if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c) (partial)',p)
                else
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c) (partial)',p);
              end;
            taicpu(p).opcode:=A_SETcc;
            taicpu(p).opsize:=S_B;
            taicpu(p).is_jmp:=False;
            if taicpu(hp1).opsize=S_B then
              begin
                taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                if taicpu(hp1).oper[1]^.typ = top_reg then
                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp2, UsedRegs);
                RemoveInstruction(hp1);
              end
            else
              begin
                { Will be a register because the size can't be S_B otherwise }
                ThisReg := newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBL);
                taicpu(p).loadreg(0, ThisReg);
                AllocRegBetween(ThisReg, p, hp2, UsedRegs);
                if (cs_opt_size in current_settings.optimizerswitches) and IsMOVZXAcceptable then
                  begin
                    case taicpu(hp1).opsize of
                      S_W:
                        taicpu(hp1).opsize := S_BW;
                      S_L:
                        taicpu(hp1).opsize := S_BL;
{$ifdef x86_64}
                      S_Q:
                        begin
                          taicpu(hp1).opsize := S_BL;
                          { Change the destination register to 32-bit }
                          taicpu(hp1).loadreg(1, newreg(R_INTREGISTER,getsupreg(ThisReg), R_SUBD));
                        end;
{$endif x86_64}
                      else
                        InternalError(2021040820);
                    end;
                    taicpu(hp1).opcode := A_MOVZX;
                    taicpu(hp1).loadreg(0, ThisReg);
                  end
                else
                  begin
                    AllocRegBetween(NR_FLAGS,p,hp1,UsedRegs);
                    { hp1 is already a MOV instruction with the correct register }
                    taicpu(hp1).loadconst(0, 0);
                    { Inserting it right before p will guarantee that the flags are also tracked }
                    asml.Remove(hp1);
                    asml.InsertBefore(hp1, p);
                  end;
              end;
            Result:=true;
            exit;
          end
        else if MatchInstruction(hp1, A_CLC, A_STC, []) then
          Result := TryJccStcClcOpt(p, hp1)
        else if (hp1.typ = ait_label) then
          Result := DoSETccLblRETOpt(p, tai_label(hp1));
      end;


    function TX86AsmOptimizer.OptPass1VMOVDQ(var p: tai): Boolean;
      var
        hp1, hp2, hp3: tai;
        SourceRef, TargetRef: TReference;
        CurrentReg: TRegister;
      begin
        { VMOVDQU/VMOVDQA shouldn't have even been generated }
        if not UseAVX then
          InternalError(2021100501);
        Result := False;
        { Look for the following to simplify:
            vmovdqa/u x(mem1),    %xmmreg
            vmovdqa/u %xmmreg,    y(mem2)
            vmovdqa/u x+16(mem1), %xmmreg
            vmovdqa/u %xmmreg,    y+16(mem2)
          Change to:
            vmovdqa/u x(mem1), %ymmreg
            vmovdqa/u %ymmreg, y(mem2)
            vpxor     %ymmreg, %ymmreg, %ymmreg
          ( The VPXOR instruction is to zero the upper half, thus removing the
            need to call the potentially expensive VZEROUPPER instruction. Other
            peephole optimisations can remove VPXOR if it's unnecessary )
        }
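        { Using the XMM form of VPXOR for the zeroing works because VEX-encoded
          instructions that write an XMM register implicitly zero bits 128-255
          of the corresponding YMM register, while the YMM form of VPXOR would
          require AVX2 rather than plain AVX }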
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        { NOTE: In the optimisations below, if the references dictate that an
          aligned move is possible (i.e. VMOVDQA), the existing instructions
          should already be VMOVDQA because if (x mod 32) = 0, then (x mod 16) = 0 }
        if (taicpu(p).opsize = S_XMM) and
          MatchOpType(taicpu(p), top_ref, top_reg) and
          GetNextInstruction(p, hp1) and
          MatchInstruction(hp1, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
          MatchOpType(taicpu(hp1), top_reg, top_ref) and
          not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
          begin
            SourceRef := taicpu(p).oper[0]^.ref^;
            TargetRef := taicpu(hp1).oper[1]^.ref^;
            if GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
              MatchOpType(taicpu(hp2), top_ref, top_reg) then
              begin
                { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                Inc(SourceRef.offset, 16);
                { Reuse the register in the first block move }
                CurrentReg := newreg(R_MMREGISTER, getsupreg(taicpu(p).oper[1]^.reg), R_SUBMMY);
                if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) and
                  not RefsMightOverlap(taicpu(p).oper[0]^.ref^, TargetRef, 32) then
                  begin
                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                    Inc(TargetRef.offset, 16);
                    if GetNextInstruction(hp2, hp3) and
                      MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
                      MatchOpType(taicpu(hp3), top_reg, top_ref) and
                      (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                      RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                      not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                      begin
                        { Update the register tracking to the new size }
                        AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
                        { Remember that the offsets are 16 ahead }
                        { Switch to unaligned if the memory isn't on a 32-byte boundary }
                        if not (
                            ((SourceRef.offset mod 32) = 16) and
                            (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                          ) then
                          taicpu(p).opcode := A_VMOVDQU;
                        taicpu(p).opsize := S_YMM;
                        taicpu(p).oper[1]^.reg := CurrentReg;
                        if not (
                            ((TargetRef.offset mod 32) = 16) and
                            (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                          ) then
                          taicpu(hp1).opcode := A_VMOVDQU;
                        taicpu(hp1).opsize := S_YMM;
                        taicpu(hp1).oper[0]^.reg := CurrentReg;
                        DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p);
                        { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                        if (pi_uses_ymm in current_procinfo.flags) then
                          RemoveInstruction(hp2)
                        else
                          begin
                            { Upper 128 bits will be set to zero; change to XMM
                              to avoid requirement of AVX2 }
                            setsubreg(CurrentReg, R_SUBMMX);
                            taicpu(hp2).opcode := A_VPXOR;
                            taicpu(hp2).opsize := S_XMM;
                            taicpu(hp2).loadreg(0, CurrentReg);
                            taicpu(hp2).loadreg(1, CurrentReg);
                            taicpu(hp2).loadreg(2, CurrentReg);
                            taicpu(hp2).ops := 3;
                          end;
                        RemoveInstruction(hp3);
                        Result := True;
                        Exit;
                      end;
                  end
                else
                  begin
                    { See if the next references are 16 less rather than 16 greater }
                    Dec(SourceRef.offset, 32); { -16 the other way }
                    if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
                      begin
                        UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                        Dec(TargetRef.offset, 16); { Only 16, not 32, as it wasn't incremented unlike SourceRef }
                        if not RefsMightOverlap(SourceRef, TargetRef, 32) and
                          GetNextInstruction(hp2, hp3) and
                          MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
                          MatchOpType(taicpu(hp3), top_reg, top_ref) and
                          (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                          RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                          not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                          begin
                            { Update the register tracking to the new size }
                            AllocRegBetween(CurrentReg, hp2, hp3, UsedRegs);
                            { hp2 and hp3 are the starting offsets, so mod = 0 this time }
                            { Switch to unaligned if the memory isn't on a 32-byte boundary }
                            if not (
                                ((SourceRef.offset mod 32) = 0) and
                                (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                              ) then
                              taicpu(hp2).opcode := A_VMOVDQU;
                            taicpu(hp2).opsize := S_YMM;
                            taicpu(hp2).oper[1]^.reg := CurrentReg;
                            if not (
                                ((TargetRef.offset mod 32) = 0) and
                                (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                              ) then
                              taicpu(hp3).opcode := A_VMOVDQU;
                            taicpu(hp3).opsize := S_YMM;
                            taicpu(hp3).oper[0]^.reg := CurrentReg;
                            DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p);
                            { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                            if (pi_uses_ymm in current_procinfo.flags) then
                              RemoveInstruction(hp1)
                            else
                              begin
                                { Upper 128 bits will be set to zero; change to
                                  XMM to avoid requirement of AVX2 }
                                setsubreg(CurrentReg, R_SUBMMX);
                                taicpu(hp1).opcode := A_VPXOR;
                                taicpu(hp1).opsize := S_XMM;
                                taicpu(hp1).loadreg(0, CurrentReg);
                                taicpu(hp1).loadreg(1, CurrentReg);
                                taicpu(hp1).loadreg(2, CurrentReg);
                                taicpu(hp1).ops := 3;
                                Asml.Remove(hp1);
                                Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
                              end;
                            RemoveCurrentP(p, hp2);
                            Result := True;
                            Exit;
                          end;
                      end;
                  end;
              end;
          end;
      end;


    function TX86AsmOptimizer.CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
      var
        hp2, hp3, first_assignment: tai;
        IncCount, OperIdx: Integer;
        OrigLabel: TAsmLabel;
      begin
        Count := 0;
        Result := False;
        first_assignment := nil;
        if (LoopCount >= 20) then
          begin
            { Guard against infinite loops }
            Exit;
          end;
        if (taicpu(p).oper[0]^.typ <> top_ref) or
          (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) or
          (taicpu(p).oper[0]^.ref^.base <> NR_NO) or
          (taicpu(p).oper[0]^.ref^.index <> NR_NO) or
          not (taicpu(p).oper[0]^.ref^.symbol is TAsmLabel) then
          Exit;
        OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        {
          change
                 jmp .L1
                 ...
            .L1:
                 mov ##, ##  ( multiple movs possible )
                 jmp/ret
          into
                 mov ##, ##
                 jmp/ret
        }
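        { Note that at most four assignments are duplicated (see the
          "Count >= 5" checks below); beyond that, the code-size cost of the
          copies is deemed to outweigh the benefit of the removed jump }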
        if not Assigned(hp1) then
          begin
            hp1 := GetLabelWithSym(OrigLabel);
            if not Assigned(hp1) or not SkipLabels(hp1, hp1) then
              Exit;
          end;
        hp2 := hp1;
        while Assigned(hp2) do
          begin
            if Assigned(hp2) and (hp2.typ = ait_label) then
              SkipLabels(hp2,hp2);
            if not Assigned(hp2) or (hp2.typ <> ait_instruction) then
              Break;
            case taicpu(hp2).opcode of
              A_MOVSD:
                begin
                  if taicpu(hp2).ops = 0 then
                    { Wrong MOVSD }
                    Break;
                  Inc(Count);
                  if Count >= 5 then
                    { Too many to be worthwhile }
                    Break;
                  GetNextInstruction(hp2, hp2);
                  Continue;
                end;
              A_MOV,
              A_MOVD,
              A_MOVQ,
              A_MOVSX,
{$ifdef x86_64}
              A_MOVSXD,
{$endif x86_64}
              A_MOVZX,
              A_MOVAPS,
              A_MOVUPS,
              A_MOVSS,
              A_MOVAPD,
              A_MOVUPD,
              A_MOVDQA,
              A_MOVDQU,
              A_VMOVSS,
              A_VMOVAPS,
              A_VMOVUPS,
              A_VMOVSD,
              A_VMOVAPD,
              A_VMOVUPD,
              A_VMOVDQA,
              A_VMOVDQU:
                begin
                  Inc(Count);
                  if Count >= 5 then
                    { Too many to be worthwhile }
                    Break;
                  GetNextInstruction(hp2, hp2);
                  Continue;
                end;
              A_JMP:
                begin
                  { Guard against infinite loops }
                  if taicpu(hp2).oper[0]^.ref^.symbol = OrigLabel then
                    Exit;
                  { Analyse this jump first in case it also duplicates assignments }
                  if CheckJumpMovTransferOpt(hp2, nil, LoopCount + 1, IncCount) then
                    begin
                      { Something did change! }
                      Result := True;
                      Inc(Count, IncCount);
                      if Count >= 5 then
                        begin
                          { Too many to be worthwhile }
                          Exit;
                        end;
                      if MatchInstruction(hp2, [A_JMP, A_RET], []) then
                        Break;
                    end;
                  Result := True;
                  Break;
                end;
              A_RET:
                begin
                  Result := True;
                  Break;
                end;
              else
                Break;
            end;
          end;
        if Result then
          begin
            { A count of zero can happen when CheckJumpMovTransferOpt is called recursively }
            if Count = 0 then
              begin
                Result := False;
                Exit;
              end;
            TransferUsedRegs(TmpUsedRegs);
            hp3 := p;
            DebugMsg(SPeepholeOptimization + 'Duplicated ' + debug_tostr(Count) + ' assignment(s) and redirected jump', p);
            while True do
              begin
                if Assigned(hp1) and (hp1.typ = ait_label) then
                  SkipLabels(hp1,hp1);
                case hp1.typ of
                  ait_regalloc:
                    if tai_regalloc(hp1).ratype = ra_dealloc then
                      begin
                        { Duplicate the register deallocation... }
                        hp3:=tai(hp1.getcopy);
                        if first_assignment = nil then
                          first_assignment := hp3;
                        asml.InsertBefore(hp3, p);
                        { ... but also reallocate it after the jump }
                        hp3:=tai(hp1.getcopy);
                        tai_regalloc(hp3).ratype := ra_alloc;
                        asml.InsertAfter(hp3, p);
                      end;
                  ait_instruction:
                    case taicpu(hp1).opcode of
                      A_JMP:
                        begin
                          { Change the original jump to the new destination }
                          OrigLabel.decrefs;
                          taicpu(hp1).oper[0]^.ref^.symbol.increfs;
                          taicpu(p).loadref(0, taicpu(hp1).oper[0]^.ref^);
                          { Set p to the first duplicated assignment so it can get optimised if needs be }
                          if not Assigned(first_assignment) then
                            InternalError(2021040810)
                          else
                            p := first_assignment;
                          Exit;
                        end;
                      A_RET:
                        begin
                          { Now change the jump into a RET instruction }
                          ConvertJumpToRET(p, hp1);
                          { Set p to the first duplicated assignment so it can get optimised if needs be }
                          if not Assigned(first_assignment) then
                            InternalError(2021040811)
                          else
                            p := first_assignment;
                          Exit;
                        end;
                      else
                        begin
                          { Duplicate the MOV instruction }
                          hp3:=tai(hp1.getcopy);
                          if first_assignment = nil then
                            first_assignment := hp3;
                          asml.InsertBefore(hp3, p);
                          { Make sure the compiler knows about any final registers written here }
                          for OperIdx := 0 to taicpu(hp3).ops - 1 do
                            with taicpu(hp3).oper[OperIdx]^ do
                              begin
                                case typ of
                                  top_ref:
                                    begin
                                      if (ref^.base <> NR_NO) and
                                        (getsupreg(ref^.base) <> RS_STACK_POINTER_REG) and
                                        (
                                          (getsupreg(ref^.base) <> RS_FRAME_POINTER_REG) or
                                          (
                                            { Allow the frame pointer if it's not being used by the procedure as such }
                                            Assigned(current_procinfo) and
                                            (current_procinfo.framepointer <> NR_FRAME_POINTER_REG)
                                          )
                                        )
{$ifdef x86_64}                         and (ref^.base <> NR_RIP) {$endif x86_64}
                                        then
                                        begin
                                          AllocRegBetween(ref^.base, hp3, p, TmpUsedRegs);
                                          if not Assigned(first_assignment) then
                                            IncludeRegInUsedRegs(ref^.base, UsedRegs);
                                        end;
                                      if (ref^.index <> NR_NO) and
                                        (getsupreg(ref^.index) <> RS_STACK_POINTER_REG) and
                                        (
                                          (getsupreg(ref^.index) <> RS_FRAME_POINTER_REG) or
                                          (
                                            { Allow the frame pointer if it's not being used by the procedure as such }
                                            Assigned(current_procinfo) and
                                            (current_procinfo.framepointer <> NR_FRAME_POINTER_REG)
                                          )
                                        )
{$ifdef x86_64}                         and (ref^.index <> NR_RIP) {$endif x86_64} and
                                        (ref^.index <> ref^.base) then
                                        begin
                                          AllocRegBetween(ref^.index, hp3, p, TmpUsedRegs);
                                          if not Assigned(first_assignment) then
                                            IncludeRegInUsedRegs(ref^.index, UsedRegs);
                                        end;
                                    end;
                                  top_reg:
                                    begin
                                      AllocRegBetween(reg, hp3, p, TmpUsedRegs);
                                      if not Assigned(first_assignment) then
                                        IncludeRegInUsedRegs(reg, UsedRegs);
                                    end;
                                  else
                                    ;
                                end;
                              end;
                        end;
                    end;
                  else
                    InternalError(2021040720);
                end;
                if not GetNextInstruction(hp1, hp1, [ait_regalloc]) then
                  { Should have dropped out earlier }
                  InternalError(2021040710);
              end;
          end;
      end;


    const
      WriteOp: array[0..3] of set of TInsChange = (
        [Ch_Wop1, Ch_RWop1, Ch_Mop1],
        [Ch_Wop2, Ch_RWop2, Ch_Mop2],
        [Ch_Wop3, Ch_RWop3, Ch_Mop3],
        [Ch_Wop4, Ch_RWop4, Ch_Mop4]);

      RegWriteFlags: array[0..7] of set of TInsChange = (
        { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
        [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
        [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
        [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
        [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
        [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
        [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
        [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
        [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);

    function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
      var
        hp2: tai;
        X: Integer;
      begin
        { If we have something like:
            op  ###,###
            mov ###,###

          Try to move the MOV instruction to before OP as long as OP and MOV don't
          interfere in regards to what they write to.

          NOTE: p must be a 2-operand instruction
        }
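        { An illustrative instance with hypothetical registers:
              addl  $4,%esi               movl  $0,%eax
              movl  $0,%eax        -->    addl  $4,%esi
          Hoisting the MOV above the flag-writing ADD means a later pass can
          turn "movl $0,%eax" into "xorl %eax,%eax" without clobbering flags
          that are still live }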
        Result := False;
        if (hp1.typ <> ait_instruction) or
          taicpu(hp1).is_jmp or
          RegInInstruction(NR_DEFAULTFLAGS, hp1) then
          Exit;
        { NOP is a pipeline fence, likely marking the beginning of the function
          epilogue, so drop out.  Similarly, drop out if POP or RET are
          encountered }
        if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
          Exit;
        if (taicpu(hp1).opcode = A_MOVSD) and
          (taicpu(hp1).ops = 0) then
          { Wrong MOVSD }
          Exit;
        { Check for writes to specific registers first }
        { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
        for X := 0 to 7 do
          if (RegWriteFlags[X] * InsProp[taicpu(hp1).opcode].Ch <> [])
            and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), p) then
            Exit;
        for X := 0 to taicpu(hp1).ops - 1 do
          begin
            { Check to see if this operand writes to something }
            if ((WriteOp[X] * InsProp[taicpu(hp1).opcode].Ch) <> []) and
              { And matches something in the CMP/TEST instruction }
              (
                MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[1]^) or
                (
                  { If it's a register, make sure the register written to doesn't
                    appear in the cmp instruction as part of a reference }
                  (taicpu(hp1).oper[X]^.typ = top_reg) and
                  RegInInstruction(taicpu(hp1).oper[X]^.reg, p)
                )
              ) then
              Exit;
          end;
        { Check p to make sure it doesn't write to something that affects hp1 }
        { Check for writes to specific registers first }
        { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
        for X := 0 to 7 do
          if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
            and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
            Exit;
        for X := 0 to taicpu(p).ops - 1 do
          begin
            { Check to see if this operand writes to something }
            if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
              { And matches something in hp1 }
              (taicpu(p).oper[X]^.typ = top_reg) and
              RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
              Exit;
          end;
        { The instruction can be safely moved }
        asml.Remove(hp1);
        { Try to insert after the last instructions where the FLAGS register is not
          yet in use, so "mov $0,%reg" can be optimised into "xor %reg,%reg" later }
        if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
          asml.InsertBefore(hp1, hp2)
        { Failing that, try to insert after the last instructions where the
          FLAGS register is not yet in use }
        else if GetLastInstruction(p, hp2) and
          (
            (hp2.typ <> ait_instruction) or
            { Don't insert after an instruction that uses the flags when p doesn't use them }
            RegInInstruction(NR_DEFAULTFLAGS, p) or
            not RegInInstruction(NR_DEFAULTFLAGS, hp2)
          ) then
          asml.InsertAfter(hp1, hp2)
        else
          { Note, if p.Previous is nil (even if it should logically never be the
            case), FindRegAllocBackward immediately exits with False and so we
            safely land here (we can't just pass p because FindRegAllocBackward
            immediately exits on an instruction). [Kit] }
          asml.InsertBefore(hp1, p);
        DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
        { We can't trust UsedRegs because we're looking backwards, although we
          know the registers are allocated after p at the very least, so manually
          create tai_regalloc objects if needed }
        for X := 0 to taicpu(hp1).ops - 1 do
          case taicpu(hp1).oper[X]^.typ of
            top_reg:
              begin
                asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
                IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
                AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
              end;
            top_ref:
              begin
                if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
                  begin
                    asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
                    IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
                    AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
                  end;
                if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
                  begin
                    asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
                    IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
                    AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
                  end;
              end;
            else
              ;
          end;
        Result := True;
      end;


    function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
      var
        hp2: tai;
        X: Integer;
      begin
        { If we have something like:
            cmp ###,%reg1
            mov 0,%reg2

          And no modified registers are shared, move the instruction to before
          the comparison as this means it can be optimised without worrying
          about the FLAGS register. (CMP/MOV is generated by
          "J(c)Mov1JmpMov0 -> Set(~c)", among other things).

          As long as the second instruction doesn't use the flags or one of the
          registers used by CMP or TEST (also check any references that use the
          registers), then it can be moved prior to the comparison.
        }
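        { An illustrative instance with hypothetical registers:
              cmpl  $1,%edx               movl  $0,%eax
              movl  $0,%eax        -->    cmpl  $1,%edx
          Once the MOV precedes the comparison, a later pass can convert it to
          "xorl %eax,%eax" without disturbing the flags that the conditional
          jump following the CMP still needs }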
        Result := False;
        if not TrySwapMovOp(p, hp1) then
          Exit;
        if taicpu(hp1).opcode = A_LEA then
          { The flags will be overwritten by the CMP/TEST instruction }
          ConvertLEA(taicpu(hp1));
        Result := True;
        { Can we move it one further back? }
        if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
          { Check to see if CMP/TEST is a comparison against zero }
          (
            (
              (taicpu(p).opcode = A_CMP) and
              MatchOperand(taicpu(p).oper[0]^, 0)
            ) or
            (
              (taicpu(p).opcode = A_TEST) and
              (
                OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
                MatchOperand(taicpu(p).oper[0]^, -1)
              )
            )
          ) and
          { These instructions set the zero flag if the result is zero }
          MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
          OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then
          { Looks like we can - if successful, this benefits PostPeepholeOptTestOr }
          TrySwapMovOp(hp2, hp1);
      end;


    function TX86AsmOptimizer.OptPass1STCCLC(var p: tai): Boolean;
      var
        hp1, hp2, p_last, p_dist, hp1_dist: tai;
        JumpLabel: TAsmLabel;
        TmpBool: Boolean;
      begin
        Result := False;
        { Look for:
            stc/clc
            j(c)  .L1
            ...
          .L1:
            set(n)cb %reg
            (flags deallocated)
            j(c)  .L2

          Change to:
            mov $0/$1,%reg (depending on if the carry bit is cleared or not)
            j(c)  .L2
        }
        p_last := p;
        while GetNextInstruction(p_last, hp1) and
          (hp1.typ = ait_instruction) and
          IsJumpToLabel(taicpu(hp1)) do
          begin
            if DoJumpOptimizations(hp1, TmpBool) then
              { Re-evaluate from p_last. Probably could be faster, but it's guaranteed to be correct }
              Continue;
            JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
            if not Assigned(JumpLabel) then
              InternalError(2024012801);
            { Optimise the J(c); stc/clc optimisation first since this will
              get missed if the main optimisation takes place }
            if (taicpu(hp1).opcode = A_JCC) then
              begin
                if GetNextInstruction(hp1, hp2) and
                  MatchInstruction(hp2, A_CLC, A_STC, []) and
                  TryJccStcClcOpt(hp1, hp2) then
                  begin
                    Result := True;
                    Exit;
                  end;
                hp2 := nil; { Suppress compiler warning }
                if (taicpu(hp1).condition in [C_C, C_NC]) and
                  { Make sure the flags aren't used again }
                  SetAndTest(FindRegDealloc(NR_DEFAULTFLAGS, tai(hp1.Next)), hp2) then
                  begin
                    { clc + jc = False; clc + jnc = True; stc + jc = True; stc + jnc = False }
                    if ((taicpu(p).opcode = A_STC) xor (taicpu(hp1).condition = C_NC)) then
                      begin
                        if (taicpu(p).opcode = A_STC) then
                          DebugMsg(SPeepholeOptimization + 'STC; JC -> JMP (Deterministic jump) (StcJc2Jmp)', p)
                        else
                          DebugMsg(SPeepholeOptimization + 'CLC; JNC -> JMP (Deterministic jump) (ClcJnc2Jmp)', p);
                        MakeUnconditional(taicpu(hp1));
                        { Move the jump to after the flag deallocations }
                        Asml.Remove(hp1);
                        Asml.InsertAfter(hp1, hp2);
                        RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                        Result := True;
                        Exit;
                      end
                    else
                      begin
                        if (taicpu(p).opcode = A_STC) then
                          DebugMsg(SPeepholeOptimization + 'STC; JNC -> NOP (Deterministic jump) (StcJnc2Nop)', p)
                        else
                          DebugMsg(SPeepholeOptimization + 'CLC; JC -> NOP (Deterministic jump) (ClcJc2Nop)', p);
                        { In this case, the jump is deterministic in that it will never be taken }
                        JumpLabel.DecRefs;
                        RemoveInstruction(hp1);
                        RemoveCurrentP(p); { hp1 may not have been the immediate next instruction }
                        Result := True;
                        Exit;
                      end;
                  end;
              end;
            hp2 := nil; { Suppress compiler warning }
            if
              { Make sure the carry flag doesn't appear in the jump conditions }
              not (taicpu(hp1).condition in [C_AE, C_NB, C_NC, C_B, C_C, C_NAE, C_BE, C_NA]) and
              SetAndTest(getlabelwithsym(JumpLabel), hp2) and
              GetNextInstruction(hp2, p_dist) and
              MatchInstruction(p_dist, A_Jcc, A_SETcc, []) and
              (taicpu(p_dist).condition in [C_C, C_NC]) then
              begin
                case taicpu(p_dist).opcode of
                  A_Jcc:
                    begin
                      if DoJumpOptimizations(p_dist, TmpBool) then
                        { Re-evaluate from p_last. Probably could be faster, but it's guaranteed to be correct }
                        Continue;
                      { clc + jc = False; clc + jnc = True; stc + jc = True; stc + jnc = False }
                      if ((taicpu(p).opcode = A_STC) xor (taicpu(p_dist).condition = C_NC)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP/Jcc; ... J(N)C -> JMP/Jcc (StcClcJ(c)2Jmp)', p);
                          JumpLabel.decrefs;
                          taicpu(hp1).loadsymbol(0, taicpu(p_dist).oper[0]^.ref^.symbol, 0);
                          RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                          Result := True;
                          Exit;
                        end
                      else if GetNextInstruction(p_dist, hp1_dist) and
                        (hp1_dist.typ = ait_label) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP/Jcc; ... J(N)C; .Lbl -> JMP/Jcc .Lbl (StcClcJ(~c)Lbl2Jmp)', p);
                          JumpLabel.decrefs;
                          taicpu(hp1).loadsymbol(0, tai_label(hp1_dist).labsym, 0);
                          RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                          Result := True;
                          Exit;
                        end;
                    end;
                  A_SETcc:
                    if { Make sure the flags aren't used again }
                      SetAndTest(FindRegDealloc(NR_DEFAULTFLAGS, tai(p_dist.Next)), hp2) and
                      GetNextInstruction(hp2, hp1_dist) and
                      (hp1_dist.typ = ait_instruction) and
                      IsJumpToLabel(taicpu(hp1_dist)) and
                      not (taicpu(hp1_dist).condition in [C_AE, C_NB, C_NC, C_B, C_C, C_NAE, C_BE, C_NA]) and
                      { This works if hp1, hp1_dist or both are regular JMP instructions }
                      condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) and
                      (
                        (taicpu(p_dist).oper[0]^.typ <> top_reg) or
                        { Make sure the register isn't still in use, otherwise it
                          may get corrupted (fixes #40659) }
                        not RegUsedBetween(taicpu(p_dist).oper[0]^.reg, p, p_dist)
                      ) then
                      begin
                        taicpu(p).allocate_oper(2);
                        taicpu(p).ops := 2;
                        { clc + setc = 0; clc + setnc = 1; stc + setc = 1; stc + setnc = 0 }
                        taicpu(p).loadconst(0, TCGInt((taicpu(p).opcode = A_STC) xor (taicpu(p_dist).condition = C_NC)));
                        taicpu(p).loadoper(1, taicpu(p_dist).oper[0]^);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).opsize := S_B;
                        if (taicpu(p_dist).oper[0]^.typ = top_reg) then
                          AllocRegBetween(taicpu(p_dist).oper[0]^.reg, p, hp1, UsedRegs);
                        DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP; ... SET(N)C; JMP -> MOV; JMP (StcClcSet(c)2Mov)', p);
                        JumpLabel.decrefs;
                        taicpu(hp1).loadsymbol(0, taicpu(hp1_dist).oper[0]^.ref^.symbol, 0);
                        { If a flag allocation is found, try to move it to after the MOV so "mov $0,%reg" gets optimised to "xor %reg,%reg" }
                        if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) and
                          (tai_regalloc(hp2).ratype = ra_alloc) then
                          begin
                            Asml.Remove(hp2);
                            Asml.InsertAfter(hp2, p);
                          end;
                        Result := True;
                        Exit;
                      end;
                  else
                    ;
                end;
              end;
            p_last := hp1;
          end;
      end;


    function TX86AsmOptimizer.TryJccStcClcOpt(var p, hp1: tai): Boolean;
      var
        hp2, hp3: tai;
        TempBool: Boolean;
      begin
        Result := False;
        {
            j(c)   .L1
            stc/clc
          .L1:
            jc/jnc .L2
            (Flags deallocated)

          Change to:
            j(c)   .L1
            jmp    .L2
          .L1:
            jc/jnc .L2

          Then call DoJumpOptimizations to convert to:
            j(nc)  .L2
          .L1: (may become a dead label)
            jc/jnc .L2
        }
        if GetNextInstruction(hp1, hp2) and
          (hp2.typ = ait_label) and
          (tai_label(hp2).labsym = TAsmLabel(taicpu(p).oper[0]^.ref^.symbol)) and
          GetNextInstruction(hp2, hp3) and
          MatchInstruction(hp3, A_Jcc, []) and
          (
            (
              (taicpu(hp3).condition = C_C) and
              (taicpu(hp1).opcode = A_STC)
            ) or (
              (taicpu(hp3).condition = C_NC) and
              (taicpu(hp1).opcode = A_CLC)
            )
          ) and
          { Make sure the flags aren't used again }
          Assigned(FindRegDealloc(NR_DEFAULTFLAGS, tai(hp3.Next))) then
          begin
            taicpu(hp1).allocate_oper(1);
            taicpu(hp1).ops := 1;
            taicpu(hp1).loadsymbol(0, TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol), 0);
            taicpu(hp1).opcode := A_JMP;
            taicpu(hp1).is_jmp := True;
            TempBool := True; { Prevent compiler warnings }
            if DoJumpOptimizations(p, TempBool) then
              Result := True
            else
              Include(OptsToCheck, aoc_ForceNewIteration);
          end;
      end;


    function TX86AsmOptimizer.OptPass2STCCLC(var p: tai): Boolean;
      begin
        { This generally only executes under -O3 and above }
        Result := (aoc_DoPass2JccOpts in OptsToCheck) and OptPass1STCCLC(p);
      end;


    function TX86AsmOptimizer.OptPass2CMOVcc(var p: tai): Boolean;
      var
        hp1, hp2: tai;
        FoundComparison: Boolean;
      begin
        { Run the pass 1 optimisations as well, since they may have some effect
          after the CMOV blocks are created in OptPass2Jcc }
        Result := False;
{       Result := OptPass1CMOVcc(p);
        if Result then
          Exit;}
        { Sometimes, the CMOV optimisations in OptPass2Jcc are a bit overzealous
          and make a slightly inefficient result on branching-type blocks, notably
          when setting a function result then jumping to the function epilogue.
          In this case, change:
            cmov(c) %reg1,%reg2
            j(c)    @lbl
            (%reg2 deallocated)
          To:
            mov     %reg1,%reg2
            j(c)    @lbl

          Note, we can't use GetNextInstructionUsingReg to find the conditional
          jump because if it's not present, we may end up with a jump that's
          completely unrelated.
        }
        hp1 := p;
        while GetNextInstruction(hp1, hp1) and
          MatchInstruction(hp1, A_MOV, A_CMOVcc, []) do { loop };
        if (hp1.typ = ait_instruction) and
          (taicpu(hp1).opcode = A_Jcc) and
          condition_in(taicpu(hp1).condition, taicpu(p).condition) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
            if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) or
              (
                { See if we can find a more distant instruction that overwrites
                  the destination register }
                (cs_opt_level3 in current_settings.optimizerswitches) and
                GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
                RegLoadedWithNewValue(taicpu(p).oper[1]^.reg, hp2)
              ) then
              begin
                if (taicpu(p).oper[0]^.typ = top_reg) then
                  begin
                    { Search backwards to see if the source register is set to a
                      constant }
                    FoundComparison := False;
                    hp1 := p;
                    while GetLastInstruction(hp1, hp1) and (hp1.typ = ait_instruction) do
                      begin
                        if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp1) then
                          begin
                            FoundComparison := True;
                            Continue;
                          end;
                        { Once we find the CMP, TEST or similar instruction, we
                          have to stop if we find anything other than a MOV }
                        if FoundComparison and (taicpu(hp1).opcode <> A_MOV) then
                          Break;
                        if RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
                          { Destination register was modified }
                          Break;
                        if (taicpu(hp1).opcode = A_MOV) and MatchOpType(taicpu(hp1), top_const, top_reg)
                          and (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                          begin
                            { Found a constant! }
                            taicpu(p).loadconst(0, taicpu(hp1).oper[0]^.val);
                            if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
                              { The source register is no longer in use }
                              RemoveInstruction(hp1);
                            Break;
                          end;
                        if RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1) then
                          { Some other instruction has modified the source register }
                          Break;
                      end;
                  end;
                DebugMsg(SPeepholeOptimization + 'CMOVcc/Jcc -> MOV/Jcc since register is not used if not branching', p);
                taicpu(p).opcode := A_MOV;
                taicpu(p).condition := C_None;
                { Rely on the post peephole stage to put the MOV before the
                  CMP/TEST instruction that appears prior }
                Result := True;
                Exit;
              end;
          end;
      end;


    function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

      function IsXCHGAcceptable: Boolean; inline;
        begin
          { Always accept if optimising for size }
          Result := (cs_opt_size in current_settings.optimizerswitches) or
            { From the Pentium M onwards, XCHG only has a latency of 2 rather
              than 3, so it becomes a saving compared to three MOVs with two of
              them able to execute simultaneously. [Kit] }
            (CPUX86_HINT_FAST_XCHG in cpu_optimization_hints[current_settings.optimizecputype]);
        end;

      var
        NewRef: TReference;
        hp1, hp2, hp3: Tai;
{$ifndef x86_64}
        hp4: tai;
        OperIdx: Integer;
{$endif x86_64}
        NewInstr : Taicpu;
        DestLabel: TAsmLabel;
        TempTracking: TAllUsedRegs;
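
      { TryMovArith2Lea folds "mov %reg1,%reg2" followed by
        "add/sub $const,%reg2" into a single "lea const(%reg1),%reg2", e.g.
        (hypothetical registers):
            movl  %esi,%eax
            addl  $8,%eax        -->    leal  8(%esi),%eax
        This is only valid when the flags written by the ADD/SUB are never
        read afterwards, which is what the checks inside establish }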
  9676. function TryMovArith2Lea(InputInstr: tai): Boolean;
  9677. var
  9678. NextInstr: tai;
  9679. NextPresent: Boolean;
  9680. begin
  9681. Result := False;
  9682. { be lazy, checking separately for sub would be slightly better }
  9683. if (taicpu(InputInstr).oper[0]^.typ = top_const) and
  9684. (abs(taicpu(InputInstr).oper[0]^.val)<=$7fffffff) then
  9685. begin
  9686. NextPresent := GetNextInstruction(InputInstr, NextInstr);
  9687. if NextPresent then
  9688. begin
  9689. { Try to avoid using TmpUsedRegs if possible (it's slow!) }
  9690. TransferUsedRegs(TmpUsedRegs);
  9691. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  9692. UpdateUsedRegs(TmpUsedRegs, tai(InputInstr.Next));
  9693. end;
  9694. if (
  9695. not NextPresent or
  9696. (
  9697. { The FLAGS register isn't always tracked properly, so do not
  9698. perform this optimisation if a conditional statement follows }
  9699. not RegReadByInstruction(NR_DEFAULTFLAGS, NextInstr) and
  9700. not RegUsedAfterInstruction(NR_DEFAULTFLAGS, NextInstr, TmpUsedRegs)
  9701. )
  9702. ) then
  9703. begin
  9704. reference_reset(NewRef, 1, []);
  9705. NewRef.base := taicpu(p).oper[0]^.reg;
  9706. NewRef.scalefactor := 1;
  9707. if taicpu(InputInstr).opcode = A_ADD then
  9708. begin
  9709. DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
  9710. NewRef.offset := taicpu(InputInstr).oper[0]^.val;
  9711. end
  9712. else
  9713. begin
  9714. DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
  9715. NewRef.offset := -taicpu(InputInstr).oper[0]^.val;
  9716. end;
  9717. taicpu(p).opcode := A_LEA;
  9718. taicpu(p).loadref(0, NewRef);
  9719. { For the sake of debugging, have the line info match the
  9720. arithmetic instruction rather than the MOV instruction }
  9721. taicpu(p).fileinfo := taicpu(InputInstr).fileinfo;
  9722. RemoveInstruction(InputInstr);
  9723. Result := True;
  9724. end;
  9725. end;
  9726. end;
    begin
      Result:=false;
      { This optimisation adds an instruction, so only do it for speed }
      if not (cs_opt_size in current_settings.optimizerswitches) and
        MatchOpType(taicpu(p), top_const, top_reg) and
        (taicpu(p).oper[0]^.val = 0) then
        begin
          { To avoid compiler warning }
          DestLabel := nil;
          if (p.typ <> ait_instruction) or (taicpu(p).oper[1]^.typ <> top_reg) then
            InternalError(2021040750);
          if not GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) then
            Exit;
          case hp1.typ of
            ait_label:
              begin
                { Change:
                    mov  $0,%reg                    mov  $0,%reg
                   @Lbl1:                          @Lbl1:
                    test %reg,%reg / cmp $0,%reg    test %reg,%reg / cmp $0,%reg
                    je   @Lbl2                      jne  @Lbl2
                  To:
                    mov  $0,%reg                    mov  $0,%reg
                    jmp  @Lbl2                      jmp  @Lbl3
                    (align)                         (align)
                   @Lbl1:                          @Lbl1:
                    test %reg,%reg / cmp $0,%reg    test %reg,%reg / cmp $0,%reg
                    je   @Lbl2                      je   @Lbl2
                                                   @Lbl3: <-- Only if label exists
                  (Not if it's optimised for size)
                }
                if not GetNextInstruction(hp1, hp2) then
                  Exit;
                if (hp2.typ = ait_instruction) and
                  (
                    { Register sizes must exactly match }
                    (
                      (taicpu(hp2).opcode = A_CMP) and
                      MatchOperand(taicpu(hp2).oper[0]^, 0) and
                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                    ) or (
                      (taicpu(hp2).opcode = A_TEST) and
                      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                    )
                  ) and GetNextInstruction(hp2, hp3) and
                  (hp3.typ = ait_instruction) and
                  (taicpu(hp3).opcode = A_JCC) and
                  (taicpu(hp3).oper[0]^.typ=top_ref) and (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and (taicpu(hp3).oper[0]^.ref^.base=NR_NO) and
                  (taicpu(hp3).oper[0]^.ref^.index=NR_NO) and (taicpu(hp3).oper[0]^.ref^.symbol is tasmlabel) then
                  begin
                    { Check condition of jump }
                    { Always true? }
                    if condition_in(C_E, taicpu(hp3).condition) then
                      begin
                        { Copy label symbol and obtain matching label entry for the
                          conditional jump, as this will be our destination }
                        DestLabel := tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol);
                        DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Je -> Mov0JmpLblCmp0Je', p);
                        Result := True;
                      end
                    { Always false? }
                    else if condition_in(C_NE, taicpu(hp3).condition) and GetNextInstruction(hp3, hp2) then
                      begin
                        { This is only worth it if there's a jump to take }
                        case hp2.typ of
                          ait_instruction:
                            begin
                              if taicpu(hp2).opcode = A_JMP then
                                begin
                                  DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                                  { An unconditional jump follows the conditional jump which will always be false,
                                    so use this jump's destination for the new jump }
                                  DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with JMP)', p);
                                  Result := True;
                                end
                              else if taicpu(hp2).opcode = A_JCC then
                                begin
                                  DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                                  if condition_in(C_E, taicpu(hp2).condition) then
                                    begin
                                      { A second conditional jump follows the conditional jump which will always be false,
                                        while the second jump is always true, so use this jump's destination for the new jump }
                                      DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with second Jcc)', p);
                                      Result := True;
                                    end;
                                  { Don't risk it if the jump isn't always true (Result remains False) }
                                end;
                            end;
                          else
                            { If anything else, don't optimise };
                        end;
                      end;
                    if Result then
                      begin
                        { Just so we have something to insert as a parameter }
                        reference_reset(NewRef, 1, []);
                        NewInstr := taicpu.op_ref(A_JMP, S_NO, NewRef);
                        { Now actually load the correct parameter (this also
                          increases the reference count) }
                        NewInstr.loadsymbol(0, DestLabel, 0);
                        if (cs_opt_level3 in current_settings.optimizerswitches) then
                          begin
                            { Get instruction before original label (may not be p under -O3) }
                            if not GetLastInstruction(hp1, hp2) then
                              { Shouldn't fail here }
                              InternalError(2021040701);
                          end
                        else
                          hp2 := p;
                        taicpu(NewInstr).fileinfo := taicpu(hp2).fileinfo;
                        AsmL.InsertAfter(NewInstr, hp2);
                        { Add new alignment field }
                        (* AsmL.InsertAfter(
                             cai_align.create_max(
                               current_settings.alignment.jumpalign,
                               current_settings.alignment.jumpalignskipmax
                             ),
                             NewInstr
                           ); *)
                      end;
                    Exit;
                  end;
              end;
            else
              ;
          end;
        end;
      if not GetNextInstruction(p, hp1) then
        Exit;
      if MatchInstruction(hp1, A_CMP, A_TEST, []) then
        begin
          if (taicpu(hp1).opsize = taicpu(p).opsize) and DoMovCmpMemOpt(p, hp1) then
            begin
              Result := True;
              Exit;
            end;
          { This optimisation is only effective on a second run of Pass 2,
            hence -O3 or above.
            Change:
              mov      %reg1,%reg2
              cmp/test (contains %reg1)
              mov      x,%reg1
              (another mov or a j(c))
            To:
              mov      %reg1,%reg2
              mov      x,%reg1
              cmp      (%reg1 replaced with %reg2)
              (another mov or a j(c))

            The requirement of an additional MOV or a jump ensures there
            isn't a performance loss, since a j(c) will permit macro-fusion
            with the cmp instruction, while another MOV likely means it's
            not all being executed in a single cycle due to parallelisation.
          }
          if (cs_opt_level3 in current_settings.optimizerswitches) and
            MatchOpType(taicpu(p), top_reg, top_reg) and
            RegInInstruction(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_MOV, []) and
            (taicpu(hp2).oper[1]^.typ = top_reg) and
            { Registers don't have to be the same size in this case }
            SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
            GetNextInstruction(hp2, hp3) and
            MatchInstruction(hp3, A_MOV, A_Jcc, []) and
            { Make sure the operands in the comparison can be safely replaced }
            (
              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^) or
              ReplaceRegisterInOper(taicpu(hp1), 0, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
            ) and
            (
              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
              ReplaceRegisterInOper(taicpu(hp1), 1, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
            ) then
            begin
              DebugMsg(SPeepholeOptimization + 'MOV/CMP/MOV -> MOV/MOV/CMP', p);
              AsmL.Remove(hp2);
              AsmL.InsertAfter(hp2, p);
              Result := True;
              Exit;
            end;
        end;
      if MatchInstruction(hp1, A_JMP, [S_NO]) then
        begin
          { Sometimes the MOVs that OptPass2JMP produces can be improved
            further, but we can't just put this jump optimisation in pass 1
            because it tends to perform worse when conditional jumps are
            nearby (e.g. when converting CMOV instructions). [Kit] }
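          { Illustrative sketch only (assumed behaviour of OptPass2JMP, not
            taken from the source): given

                mov $1,%eax         @Lbl:
                jmp @Lbl      with   mov %eax,%ebx
                                     ...

            inlining the target's leading instructions in place of the JMP
            leaves two adjacent MOVs, which the OptPass1MOV call below can
            then try to merge. }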
          CopyUsedRegs(TempTracking);
          UpdateUsedRegs(tai(p.Next));
          if OptPass2JMP(hp1) then
            begin
              { Restore register state }
              RestoreUsedRegs(TempTracking);
              ReleaseUsedRegs(TempTracking);
              { call OptPass1MOV once to potentially merge any MOVs that were created }
              OptPass1MOV(p);
              Result := True;
              Exit;
            end;
          { If OptPass2JMP returned False, no optimisations were done to
            the jump and there are no further optimisations that can be done
            to the MOV instruction on this pass other than FuncMov2Func }
          { Restore register state }
          RestoreUsedRegs(TempTracking);
          ReleaseUsedRegs(TempTracking);
          Result := FuncMov2Func(p, hp1);
          Exit;
        end;
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
        MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
        (taicpu(hp1).oper[1]^.typ = top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
        begin
          { Change:
              movl/q %reg1,%reg2      movl/q %reg1,%reg2
              addl/q $x,%reg2         subl/q $x,%reg2
            To:
              leal/q x(%reg1),%reg2   leal/q -x(%reg1),%reg2
          }
          if TryMovArith2Lea(hp1) then
            begin
              Result := True;
              Exit;
            end
          else if
            { Same as above, but also adds or subtracts to %reg2 in between.
              It's still valid as long as the flags aren't in use }
            (
              (
                MatchInstruction(hp1,A_ADD,A_SUB,A_LEA,[]) and
                not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
              ) or
              (
                not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, taicpu(hp1)) and
                { If it's not modified, make sure it isn't read as is }
                not RegReadByInstruction(taicpu(p).oper[1]^.reg, taicpu(hp1))
              )
            ) and
            GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
            MatchInstruction(hp2,A_ADD,A_SUB,[taicpu(p).opsize]) and
            MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg) and
            TryMovArith2Lea(hp2) then
            begin
              Result := True;
              Exit;
            end;
        end;
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        {$ifdef x86_64}
        MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
        {$else x86_64}
        MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
        {$endif x86_64}
        MatchOpType(taicpu(hp1),top_reg,top_reg) and
        (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
        { mov      reg1, reg2              mov      reg1, reg2
          movzx/sx reg2, reg3      to      movzx/sx reg1, reg3 }
        begin
          taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
          DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
          { Don't remove the MOV command without first checking that reg2 isn't
            used afterwards, or that supreg(reg3) = supreg(reg2). [Kit] }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
          if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
            not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
            begin
              RemoveCurrentP(p, hp1);
              Result:=true;
            end;
          Exit;
        end;
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        IsXCHGAcceptable and
        { XCHG doesn't support 8-bit registers }
        (taicpu(p).opsize <> S_B) and
        MatchInstruction(hp1, A_MOV, []) and
        MatchOpType(taicpu(hp1),top_reg,top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_MOV, []) and
        { Don't need to call MatchOpType for hp2 because the operand matches below cover it }
        MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
        MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
        begin
          { mov %reg1,%reg2
            mov %reg3,%reg1        ->        xchg %reg3,%reg1
            mov %reg2,%reg3
            (%reg2 not used afterwards)

            Note that xchg takes 3 cycles to execute, and generally MOVs take
            only one cycle apiece, but the first two MOVs can be executed in
            parallel, only taking 2 cycles overall.  Older processors should
            therefore only optimise for size. [Kit]
          }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
            begin
              DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
              AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
              taicpu(hp1).opcode := A_XCHG;
              RemoveCurrentP(p, hp1);
              RemoveInstruction(hp2);
              Result := True;
              Exit;
            end;
        end;
      if MatchOpType(taicpu(p),top_reg,top_reg) and
        MatchInstruction(hp1, A_SAR, []) then
        begin
          if MatchOperand(taicpu(hp1).oper[0]^, 31) then
            begin
              { the use of %edx also covers the opsize being S_L }
              if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
                begin
                  { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
                  if (taicpu(p).oper[0]^.reg = NR_EAX) and
                    (taicpu(p).oper[1]^.reg = NR_EDX) then
                    begin
                      { Change:
                          movl %eax,%edx
                          sarl $31,%edx
                        To:
                          cltd
                      }
                      DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
                      RemoveInstruction(hp1);
                      taicpu(p).opcode := A_CDQ;
                      taicpu(p).opsize := S_NO;
                      taicpu(p).clearop(1);
                      taicpu(p).clearop(0);
                      taicpu(p).ops:=0;
                      Result := True;
                      Exit;
                    end
                  else if (cs_opt_size in current_settings.optimizerswitches) and
                    (taicpu(p).oper[0]^.reg = NR_EDX) and
                    (taicpu(p).oper[1]^.reg = NR_EAX) then
                    begin
                      { Change:
                          movl %edx,%eax
                          sarl $31,%edx
                        To:
                          movl %edx,%eax
                          cltd

                        Note that this creates a dependency between the two instructions,
                        so only perform if optimising for size.
                      }
                      DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
                      taicpu(hp1).opcode := A_CDQ;
                      taicpu(hp1).opsize := S_NO;
                      taicpu(hp1).clearop(1);
                      taicpu(hp1).clearop(0);
                      taicpu(hp1).ops:=0;
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      Exit;
                    end;
                {$ifndef x86_64}
                end
              { Don't bother if CMOV is supported, because a more optimal
                sequence would have been generated for the Abs() intrinsic }
              else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
                { the use of %eax also covers the opsize being S_L }
                MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
                (taicpu(p).oper[0]^.reg = NR_EAX) and
                (taicpu(p).oper[1]^.reg = NR_EDX) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_XOR, [S_L]) and
                MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
                MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
                GetNextInstruction(hp2, hp3) and
                MatchInstruction(hp3, A_SUB, [S_L]) and
                MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
                MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
                begin
                  { Change:
                      movl %eax,%edx
                      sarl $31,%eax
                      xorl %eax,%edx
                      subl %eax,%edx
                      (Instruction that uses %edx)
                      (%eax deallocated)
                      (%edx deallocated)
                    To:
                      cltd
                      xorl %edx,%eax  <-- Note the registers have swapped
                      subl %edx,%eax
                      (Instruction that uses %eax)  <-- %eax rather than %edx
                  }
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                  if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
                    begin
                      if GetNextInstruction(hp3, hp4) and
                        not RegModifiedByInstruction(NR_EDX, hp4) and
                        not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
                          taicpu(p).opcode := A_CDQ;
                          taicpu(p).clearop(1);
                          taicpu(p).clearop(0);
                          taicpu(p).ops:=0;
                          RemoveInstruction(hp1);
                          taicpu(hp2).loadreg(0, NR_EDX);
                          taicpu(hp2).loadreg(1, NR_EAX);
                          taicpu(hp3).loadreg(0, NR_EDX);
                          taicpu(hp3).loadreg(1, NR_EAX);
                          AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
                          { Convert references in the following instruction (hp4) from %edx to %eax }
                          for OperIdx := 0 to taicpu(hp4).ops - 1 do
                            with taicpu(hp4).oper[OperIdx]^ do
                              case typ of
                                top_reg:
                                  if getsupreg(reg) = RS_EDX then
                                    reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
                                top_ref:
                                  begin
                                    if getsupreg(ref^.base) = RS_EDX then
                                      ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.base));
                                    if getsupreg(ref^.index) = RS_EDX then
                                      ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.index));
                                  end;
                                else
                                  ;
                              end;
                          Result := True;
                          Exit;
                        end;
                    end;
                {$else x86_64}
                end;
            end
          else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
            { the use of %rdx also covers the opsize being S_Q }
            MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
            begin
              { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
              if (taicpu(p).oper[0]^.reg = NR_RAX) and
                (taicpu(p).oper[1]^.reg = NR_RDX) then
                begin
                  { Change:
                      movq %rax,%rdx
                      sarq $63,%rdx
                    To:
                      cqto
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
                  RemoveInstruction(hp1);
                  taicpu(p).opcode := A_CQO;
                  taicpu(p).opsize := S_NO;
                  taicpu(p).clearop(1);
                  taicpu(p).clearop(0);
                  taicpu(p).ops:=0;
                  Result := True;
                  Exit;
                end
              else if (cs_opt_size in current_settings.optimizerswitches) and
                (taicpu(p).oper[0]^.reg = NR_RDX) and
                (taicpu(p).oper[1]^.reg = NR_RAX) then
                begin
                  { Change:
                      movq %rdx,%rax
                      sarq $63,%rdx
                    To:
                      movq %rdx,%rax
                      cqto

                    Note that this creates a dependency between the two instructions,
                    so only perform if optimising for size.
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
                  taicpu(hp1).opcode := A_CQO;
                  taicpu(hp1).opsize := S_NO;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                  Include(OptsToCheck, aoc_ForceNewIteration);
                  Exit;
                {$endif x86_64}
                end;
            end;
        end;
      if MatchInstruction(hp1, A_MOV, []) and
        (taicpu(hp1).oper[1]^.typ = top_reg) then
        { Though "GetNextInstruction" could be factored out, along with
          the instructions that depend on hp2, it is an expensive call that
          should be delayed for as long as possible, hence we do cheaper
          checks first that are likely to be False. [Kit] }
        begin
          if (
              (
                MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
                (taicpu(hp1).oper[1]^.reg = NR_EAX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
                )
              ) or
              (
                MatchOperand(taicpu(p).oper[1]^, NR_EAX) and
                (taicpu(hp1).oper[1]^.reg = NR_EDX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
                )
              )
            ) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_SAR, []) and
            MatchOperand(taicpu(hp2).oper[0]^, 31) then
            begin
              if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
                begin
                  { Change:
                      movl r/m,%edx         movl r/m,%eax         movl r/m,%edx        movl r/m,%eax
                      movl %edx,%eax   or   movl %eax,%edx   or   movl r/m,%eax   or   movl r/m,%edx
                      sarl $31,%edx         sarl $31,%edx         sarl $31,%edx        sarl $31,%edx
                    To:
                      movl r/m,%eax   <- Note the change in register
                      cltd
                  }
                  DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
                  AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
                  taicpu(p).loadreg(1, NR_EAX);
                  taicpu(hp1).opcode := A_CDQ;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                  RemoveInstruction(hp2);
                  Include(OptsToCheck, aoc_ForceNewIteration);
                  (*
                  {$ifdef x86_64}
                end
              else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
                { This code sequence does not get generated - however it might become useful
                  if and when 128-bit signed integer types make an appearance, so the code
                  is kept here for when it is eventually needed. [Kit] }
                (
                  (
                    (taicpu(hp1).oper[1]^.reg = NR_RAX) and
                    (
                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                      MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
                    )
                  ) or
                  (
                    (taicpu(hp1).oper[1]^.reg = NR_RDX) and
                    (
                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                      MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
                    )
                  )
                ) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_SAR, [S_Q]) and
                MatchOperand(taicpu(hp2).oper[0]^, 63) and
                MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
                begin
                  { Change:
                      movq r/m,%rdx         movq r/m,%rax         movq r/m,%rdx        movq r/m,%rax
                      movq %rdx,%rax   or   movq %rax,%rdx   or   movq r/m,%rax   or   movq r/m,%rdx
                      sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx        sarq $63,%rdx
                    To:
                      movq r/m,%rax   <- Note the change in register
                      cqto
                  }
                  DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
                  AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
                  taicpu(p).loadreg(1, NR_RAX);
                  taicpu(hp1).opcode := A_CQO;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                  RemoveInstruction(hp2);
                  Include(OptsToCheck, aoc_ForceNewIteration);
                  {$endif x86_64}
                  *)
                end;
            end;
      {$ifdef x86_64}
        end;
      if (taicpu(p).opsize = S_L) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        (
          MatchInstruction(hp1, A_MOV,[]) and
          (taicpu(hp1).opsize = S_L) and
          (taicpu(hp1).oper[1]^.typ = top_reg)
        ) and (
          GetNextInstruction(hp1, hp2) and
          (tai(hp2).typ=ait_instruction) and
          (taicpu(hp2).opsize = S_Q) and
          (
            (
              MatchInstruction(hp2, A_ADD,[]) and
              (taicpu(hp2).opsize = S_Q) and
              (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
              (
                (
                  (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              )
            ) or (
              MatchInstruction(hp2, A_LEA,[]) and
              (taicpu(hp2).oper[0]^.ref^.offset = 0) and
              (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
              (
                (
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                  (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              ) and (
                (
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
                ) or (
                  (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
                )
              )
            )
          )
        ) and (
          GetNextInstruction(hp2, hp3) and
          MatchInstruction(hp3, A_SHR,[]) and
          (taicpu(hp3).opsize = S_Q) and
          (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
          (taicpu(hp3).oper[0]^.val = 1) and
          (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
        ) then
        begin
          { Change:   movl x,    reg1d          movl x,    reg1d
                      movl y,    reg2d          movl y,    reg2d
                      addq reg2q,reg1q    or    leaq (reg1q,reg2q),reg1q
                      shrq $1,   reg1q          shrq $1,   reg1q

                    ( reg1d and reg2d can be switched around in the first two instructions )

            To:       movl x,    reg1d
                      addl y,    reg1d
                      rcrl $1,   reg1d

            This corresponds to the common expression (x + y) shr 1, where
            x and y are Cardinals (replacing "shr 1" with "div 2" produces
            smaller code, but won't account for x + y causing an overflow). [Kit]
          }
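          { Worked example (illustrative, not from the source): for
            x = y = $FFFFFFFF, "addl" leaves $FFFFFFFE with CF = 1, and
            "rcrl $1" rotates that carry back into bit 31, giving $FFFFFFFF -
            exactly (x + y) shr 1 computed in 33 bits - whereas a plain
            "shrl $1" would have lost the carry.  The matched source is
            roughly (hypothetical Pascal):

                function Average(x, y: Cardinal): Cardinal;
                  begin
                    Result := (x + y) shr 1;
                  end;
          }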
          DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p);
          if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
            begin
              { Change first MOV command to have the same register as the final output }
              taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
              AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
              Result := True;
            end
          else
            begin
              taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
              Include(OptsToCheck, aoc_ForceNewIteration);
            end;
          { Change second MOV command to an ADD command. This is easier than
            converting the existing command because it means we don't have to
            touch 'y', which might be a complicated reference, and also the
            fact that the third command might either be ADD or LEA. [Kit] }
          taicpu(hp1).opcode := A_ADD;
          { Delete old ADD/LEA instruction }
          RemoveInstruction(hp2);
          { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
          taicpu(hp3).opcode := A_RCR;
          taicpu(hp3).changeopsize(S_L);
          setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
          { Don't need to Exit yet, as p is still a MOV and hp1 hasn't been
            removed, so FuncMov2Func below is safe to call }
      {$endif x86_64}
        end;
      {$ifdef x86_64}
      { Note, this optimisation was moved from Pass 1 because the CMOV
        optimisations in OptPass2Jcc fall foul of the loss of information
        about the upper 32 bits of the target register. Fixes #41317. }
      { Change:
          movl %reg1l,%reg2l
          movq %reg2q,%reg3q  (%reg1 <> %reg3)
        To:
          movl %reg1l,%reg2l
          movl %reg1l,%reg3l  (Upper 32 bits of %reg3q will be zero)
      }
      if MatchOpType(taicpu(p), top_reg, top_reg) and
        (taicpu(p).opsize = S_L) then
        begin
          { If the movq instruction is followed by addq or subq, it might be
            possible to convert them to a leaq instruction, an opportunity
            that would be lost if the movq were changed to a movl first, so
            we can't do this optimisation on a first iteration }
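          { Illustrative example (not from the source) of the opportunity
            being preserved:

                movl %ecx,%eax                 movl %ecx,%eax
                movq %rax,%rdx      --->       leaq 8(%rax),%rdx
                addq $8,%rdx

            If the movq were turned into a movl immediately, the
            MOV/ADD-to-LEA fold above would no longer match. }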
          if not (aoc_MovlMovq2MovlMovl in OptsToCheck) and
            not NotFirstIteration and
            { If -O2 and under, do the optimisation anyway because Pass 2
              won't run more than once }
            (cs_opt_level3 in current_settings.optimizerswitches) then
            begin
              { Flag that we need to run Pass 2 again }
              Include(OptsToCheck, aoc_ForceNewIteration);
            end
          else
            begin
              TransferUsedRegs(TmpUsedRegs);
              { Mark the start point for sequential calls to
                GetNextInstructionUsingReg, RegModifiedBetween and
                UpdateUsedRegsBetween in case this optimisation is run
                multiple times }
              hp2 := p;
              repeat
                if (
                    not(cs_opt_level3 in current_settings.optimizerswitches) or
                    { Look further ahead for this one }
                    GetNextInstructionUsingReg(hp2, hp1, taicpu(p).oper[1]^.reg)
                  ) and
                  MatchInstruction(hp1,A_MOV,[S_Q]) and
                  not RegModifiedBetween(taicpu(p).oper[0]^.reg, hp2, hp1) and
                  MatchOpType(taicpu(hp1), top_reg, top_reg) and
                  SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^.reg) then
                  begin
                    UpdateUsedRegsBetween(TmpUsedRegs, tai(hp2.Next), hp1);
                    taicpu(hp1).opsize := S_L;
                    taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
                    setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                    AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, TmpUsedRegs);
                    DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlMovl 1)', hp1);
                    if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'Mov2Nop 8 done', p);
                        RemoveCurrentP(p);
                        Result := True;
                        Exit;
                      end;
                    { Initial instruction wasn't actually changed }
                    Include(OptsToCheck, aoc_ForceNewIteration);
                    if (cs_opt_level3 in current_settings.optimizerswitches) then
                      begin
                        { GetNextInstructionUsingReg will return a different
                          instruction, so check this optimisation again }
                        { Update the start point for the next calls to
                          GetNextInstructionUsingReg, RegModifiedBetween and
                          UpdateUsedRegsBetween to grant a speed boost }
                        hp2 := hp1;
                        Continue; { Jump back to "repeat" }
                      end;
                  end;
                Break;
              until False;
            end;
        end;
      {$endif x86_64}
      if FuncMov2Func(p, hp1) then
        begin
          Result := True;
          Exit;
        end;
    end;
{$push}
{$q-}{$r-}
  function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean;
    var
      ThisReg: TRegister;
      MinSize, MaxSize, TryShiftDown, TargetSize: TOpSize;
      TargetSubReg: TSubRegister;
      hp1, hp2: tai;
      RegInUse, RegChanged, p_removed, hp1_removed: Boolean;

      { Store list of found instructions so we don't have to call
        GetNextInstructionUsingReg multiple times }
      InstrList: array of taicpu;
      InstrMax, Index: Integer;
      UpperLimit, SignedUpperLimit, SignedUpperLimitBottom,
      LowerLimit, SignedLowerLimit, SignedLowerLimitBottom,
      TryShiftDownLimit, TryShiftDownSignedLimit, TryShiftDownSignedLimitLower,
      WorkingValue: TCgInt;
      PreMessage: string;

      { Data flow analysis }
      TestValMin, TestValMax, TestValSignedMax: TCgInt;
      BitwiseOnly, OrXorUsed,
      ShiftDownOverflow, UpperSignedOverflow, UpperUnsignedOverflow, LowerSignedOverflow, LowerUnsignedOverflow: Boolean;

    function CheckOverflowConditions: Boolean;
      begin
        Result := True;
        if (TestValSignedMax > SignedUpperLimit) then
          UpperSignedOverflow := True;
        if (TestValSignedMax > SignedLowerLimit) or (TestValSignedMax < SignedLowerLimitBottom) then
          LowerSignedOverflow := True;
        if (TestValMin > LowerLimit) or (TestValMax > LowerLimit) then
          LowerUnsignedOverflow := True;
        if (TestValMin > UpperLimit) or (TestValMax > UpperLimit) or (TestValSignedMax > UpperLimit) or
          (TestValMin < SignedUpperLimitBottom) or (TestValMax < SignedUpperLimitBottom) or (TestValSignedMax < SignedUpperLimitBottom) then
          begin
            { Absolute overflow }
            Result := False;
            Exit;
          end;
        if not ShiftDownOverflow and (TryShiftDown <> S_NO) and
          ((TestValMin > TryShiftDownLimit) or (TestValMax > TryShiftDownLimit)) then
          ShiftDownOverflow := True;
        if (TestValMin < 0) or (TestValMax < 0) then
          begin
            LowerUnsignedOverflow := True;
            UpperUnsignedOverflow := True;
          end;
      end;
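    { Worked example of the data-flow bounds (illustrative, not from the
      source): after "movzbl (mem),%eax", the tracked range is
      TestValMin = 0 .. TestValMax = $FF.  A following "addl $16,%eax"
      shifts the range to $10..$10F; since $10F exceeds LowerLimit ($FF),
      LowerUnsignedOverflow becomes True and the register can no longer be
      shrunk back to byte size, though it still fits the 32-bit upper bound. }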
    function AdjustInitialLoadAndSize: Boolean;
      begin
        Result := False;
        if not p_removed then
          begin
            if TargetSize = MinSize then
              begin
                { Convert the input MOVZX to a MOV }
                if (taicpu(p).oper[0]^.typ = top_reg) and
                  SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
                  begin
                    { Or remove it completely! }
                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1', p);
                    RemoveCurrentP(p);
                    p_removed := True;
                  end
                else
                  begin
                    DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1', p);
                    taicpu(p).opcode := A_MOV;
                    taicpu(p).oper[1]^.reg := ThisReg;
                    taicpu(p).opsize := TargetSize;
                  end;
                Result := True;
              end
            else if TargetSize <> MaxSize then
              begin
                case MaxSize of
                  S_L:
                    if TargetSize = S_W then
                      begin
                        DebugMsg(SPeepholeOptimization + 'movzbl2movzbw', p);
                        taicpu(p).opsize := S_BW;
                        taicpu(p).oper[1]^.reg := ThisReg;
                        Result := True;
                      end
                    else
                      InternalError(2020112341);
                  S_W:
                    if TargetSize = S_L then
                      begin
                        DebugMsg(SPeepholeOptimization + 'movzbw2movzbl', p);
                        taicpu(p).opsize := S_BL;
                        taicpu(p).oper[1]^.reg := ThisReg;
                        Result := True;
                      end
                    else
                      InternalError(2020112342);
                  else
                    ;
                end;
              end
            else if not hp1_removed and not RegInUse then
              begin
                { If we have something like:
                    movzbl (oper),%regd
                    add    x,%regd
                    movzbl %regb,%regd
                  we can reduce the register size to the input of the final
                  movzbl instruction.  Overflows won't have any effect.
                }
                if (taicpu(p).opsize in [S_BW, S_BL]) and
                  (taicpu(hp1).opsize in [S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}]) then
                  begin
                    TargetSize := S_B;
                    setsubreg(ThisReg, R_SUBL);
                    Result := True;
                  end
                else if (taicpu(p).opsize = S_WL) and
                  (taicpu(hp1).opsize in [S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}]) then
                  begin
                    TargetSize := S_W;
                    setsubreg(ThisReg, R_SUBW);
                    Result := True;
                  end;
                if Result then
                  begin
                    { Convert the input MOVZX to a MOV }
                    if (taicpu(p).oper[0]^.typ = top_reg) and
                      SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
                      begin
                        { Or remove it completely! }
                        DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1a', p);
                        RemoveCurrentP(p);
                        p_removed := True;
                      end
                    else
                      begin
                        DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1a', p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).oper[1]^.reg := ThisReg;
                        taicpu(p).opsize := TargetSize;
                      end;
                  end;
              end;
          end;
      end;
    procedure AdjustFinalLoad;
      begin
        if not LowerUnsignedOverflow then
          begin
            if ((TargetSize = S_L) and (taicpu(hp1).opsize in [S_L, S_BL, S_WL])) or
              ((TargetSize = S_W) and (taicpu(hp1).opsize in [S_W, S_BW])) then
              begin
                { Convert the output MOVZX to a MOV }
                if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
                  begin
                    { Make sure the zero-extension covers at least the minimum size (fixes i40003) }
                    if (MinSize = S_B) or
                      (not ShiftDownOverflow and (TryShiftDown = S_B)) or
                      ((MinSize = S_W) and (taicpu(hp1).opsize = S_WL)) then
                      begin
                        { Remove it completely! }
                        DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2', hp1);
                        { Be careful; if p = hp1 and p was also removed, p
                          will become a dangling pointer }
                        if p = hp1 then
                          begin
                            RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
                            p_removed := True;
                          end
                        else
                          RemoveInstruction(hp1);
                        hp1_removed := True;
                      end;
                  end
                else
                  begin
                    DebugMsg(SPeepholeOptimization + 'Movzx2Mov 2', hp1);
                    taicpu(hp1).opcode := A_MOV;
                    taicpu(hp1).oper[0]^.reg := ThisReg;
                    taicpu(hp1).opsize := TargetSize;
                  end;
              end
            else if (TargetSize = S_B) and (MaxSize = S_W) and (taicpu(hp1).opsize = S_WL) then
              begin
                { Need to change the size of the output }
                DebugMsg(SPeepholeOptimization + 'movzwl2movzbl 2', hp1);
                taicpu(hp1).oper[0]^.reg := ThisReg;
                taicpu(hp1).opsize := S_BL;
              end;
          end;
      end;

    function CompressInstructions: Boolean;
      var
        LocalIndex: Integer;
      begin
        Result := False;
        { The objective here is to try to find a combination that
          removes one of the MOV/Z instructions. }
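        { One possible shape of such a combination (illustrative, not from
          the source):

              movzbw %al,%cx                  movzbl %al,%ecx
              addw   $5,%cx        --->       addl   $5,%ecx
              movzwl %cx,%ecx

          widening the intermediate work to 32 bits makes the second MOVZX
          redundant. }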
        if (
            (taicpu(p).oper[0]^.typ <> top_reg) or
            not SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg)
          ) and
          (taicpu(hp1).oper[1]^.typ = top_reg) and
          SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
          begin
            { Make a preference to remove the second MOVZX instruction }
            case taicpu(hp1).opsize of
              S_BL, S_WL:
                begin
                  TargetSize := S_L;
                  TargetSubReg := R_SUBD;
                end;
              S_BW:
                begin
                  TargetSize := S_W;
                  TargetSubReg := R_SUBW;
                end;
              else
                InternalError(2020112302);
            end;
          end
        else
          begin
            if LowerUnsignedOverflow and not UpperUnsignedOverflow then
              begin
                { Exceeded lower bound but not upper bound }
                TargetSize := MaxSize;
              end
            else if not LowerUnsignedOverflow then
              begin
                { Size didn't exceed lower bound }
                TargetSize := MinSize;
              end
            else
              Exit;
          end;
        case TargetSize of
          S_B:
            TargetSubReg := R_SUBL;
          S_W:
            TargetSubReg := R_SUBW;
          S_L:
            TargetSubReg := R_SUBD;
          else
            InternalError(2020112350);
        end;
        { Update the register to its new size }
        setsubreg(ThisReg, TargetSubReg);
        RegInUse := False;
        if not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
          begin
            { Check to see if the active register is used afterwards;
              if not, we can change it and make a saving. }
            TransferUsedRegs(TmpUsedRegs);
            { The target register may be marked as in use to cross
              a jump to a distant label, so exclude it }
            ExcludeRegFromUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs);
            hp2 := p;
            repeat
              { Explicitly check for the excluded register (don't include the
                first instruction as it may be reading from here) }
              if ((p <> hp2) and (RegInInstruction(taicpu(hp1).oper[1]^.reg, hp2))) or
                RegInUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs) then
                begin
                  RegInUse := True;
                  Break;
                end;
              UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
              if not GetNextInstruction(hp2, hp2) then
                InternalError(2020112340);
            until (hp2 = hp1);
            if not RegInUse and RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
              { We might still be able to get away with this }
              RegInUse := not
                (
                  GetNextInstructionUsingReg(hp1, hp2, ThisReg) and
                  (hp2.typ = ait_instruction) and
                  (
                    { Under -O1 and -O2, GetNextInstructionUsingReg may return an
                      instruction that doesn't actually contain ThisReg }
                    (cs_opt_level3 in current_settings.optimizerswitches) or
                    RegInInstruction(ThisReg, hp2)
                  ) and
                  RegLoadedWithNewValue(ThisReg, hp2)
                );
            if not RegInUse then
              begin
                { Force the register size to be the same as this instruction's
                  so it can be removed }
                if (taicpu(hp1).opsize in [S_L, S_BL, S_WL]) then
                  begin
                    TargetSize := S_L;
                    TargetSubReg := R_SUBD;
                  end
                else if (taicpu(hp1).opsize in [S_W, S_BW]) then
                  begin
                    TargetSize := S_W;
                    TargetSubReg := R_SUBW;
                  end;
                ThisReg := taicpu(hp1).oper[1]^.reg;
                setsubreg(ThisReg, TargetSubReg);
                RegChanged := True;
                DebugMsg(SPeepholeOptimization + 'Simplified register usage so ' + debug_regname(ThisReg) + ' = ' + debug_regname(taicpu(p).oper[1]^.reg), p);
                TransferUsedRegs(TmpUsedRegs);
                AllocRegBetween(ThisReg, p, hp1, TmpUsedRegs);
                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 3', hp1);
                if p = hp1 then
                  begin
                    RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
                    p_removed := True;
                  end
                else
                  RemoveInstruction(hp1);
                hp1_removed := True;
                { Instruction will become "mov %reg,%reg" }
                if not p_removed and (taicpu(p).opcode = A_MOV) and
                  MatchOperand(taicpu(p).oper[0]^, ThisReg) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 6', p);
                    RemoveCurrentP(p);
                    p_removed := True;
                  end
                else
                  taicpu(p).oper[1]^.reg := ThisReg;
                Result := True;
              end
            else
              begin
                if TargetSize <> MaxSize then
                  begin
                    { Since the register is in use, we have to force it to
                      MaxSize, otherwise part of it may become undefined later on }
                    TargetSize := MaxSize;
                    case TargetSize of
                      S_B:
                        TargetSubReg := R_SUBL;
                      S_W:
                        TargetSubReg := R_SUBW;
                      S_L:
                        TargetSubReg := R_SUBD;
                      else
                        InternalError(2020112351);
                    end;
                    setsubreg(ThisReg, TargetSubReg);
                  end;
                AdjustFinalLoad;
              end;
          end
        else
          AdjustFinalLoad;
        Result := AdjustInitialLoadAndSize or Result;
        { Now go through every instruction we found and change the size.
          If TargetSize = MaxSize, then almost no changes are needed and
          Result can remain False if it hasn't been set yet.

          If RegChanged is True, then the register requires changing and
          so the point about TargetSize = MaxSize doesn't apply. }
        if ((TargetSize <> MaxSize) or RegChanged) and (InstrMax >= 0) then
          begin
            for LocalIndex := 0 to InstrMax do
              begin
                { If p_removed is true, then the original MOV/Z was removed
                  and removing the AND instruction may not be safe if it
                  appears first }
                if (InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.typ <> top_reg) then
                  InternalError(2020112310);
                if InstrList[LocalIndex].oper[0]^.typ = top_reg then
                  InstrList[LocalIndex].oper[0]^.reg := ThisReg;
                InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.reg := ThisReg;
                InstrList[LocalIndex].opsize := TargetSize;
              end;
            Result := True;
          end;
      end;
    begin
      Result := False;
      p_removed := False;
      hp1_removed := False;
      ThisReg := taicpu(p).oper[1]^.reg;
      { Check for:
          movs/z ###,%ecx (or %cx or %rcx)
          ...
          shl/shr/sar/rcl/rcr/ror/rol %cl,###
          (dealloc %ecx)
        Change to:
          mov ###,%cl (if ### = %cl, then remove completely)
          ...
          shl/shr/sar/rcl/rcr/ror/rol %cl,###
      }
      if (getsupreg(ThisReg) = RS_ECX) and
        GetNextInstructionUsingReg(p, hp1, NR_ECX) and
        (hp1.typ = ait_instruction) and
        (
          { Under -O1 and -O2, GetNextInstructionUsingReg may return an
            instruction that doesn't actually contain ECX }
          (cs_opt_level3 in current_settings.optimizerswitches) or
          RegInInstruction(NR_ECX, hp1) or
          (
            { It's common for the shift/rotate's read/write register to be
              initialised in between, so at -O2 and below, search ahead one
              more instruction }
            GetNextInstruction(hp1, hp1) and
            (hp1.typ = ait_instruction) and
            RegInInstruction(NR_ECX, hp1)
          )
        ) and
        MatchInstruction(hp1, [A_SHL, A_SHR, A_SAR, A_ROR, A_ROL, A_RCR, A_RCL], []) and
        (taicpu(hp1).oper[0]^.typ = top_reg) { This is enough to determine that it's %cl } and
        not RegInOp(NR_ECX, taicpu(hp1).oper[1]^) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
          if not RegUsedAfterInstruction(NR_CL, hp1, TmpUsedRegs) then
            begin
              case taicpu(p).opsize of
                S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                  if MatchOperand(taicpu(p).oper[0]^, NR_CL) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3a', p);
                      RemoveCurrentP(p);
                    end
                  else
                    begin
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).opsize := S_B;
                      taicpu(p).oper[1]^.reg := NR_CL;
                      DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 1', p);
                    end;
                S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                  if MatchOperand(taicpu(p).oper[0]^, NR_CX) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3b', p);
                      RemoveCurrentP(p);
                    end
                  else
                    begin
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).opsize := S_W;
                      taicpu(p).oper[1]^.reg := NR_CX;
                      DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 2', p);
                    end;
                {$ifdef x86_64}
                S_LQ:
                  if MatchOperand(taicpu(p).oper[0]^, NR_ECX) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3c', p);
                      RemoveCurrentP(p);
                    end
                  else
                    begin
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).opsize := S_L;
                      taicpu(p).oper[1]^.reg := NR_ECX;
                      DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 3', p);
                    end;
                {$endif x86_64}
                else
                  InternalError(2021120401);
              end;
              Result := True;
              Exit;
            end;
        end;
      { This is anything but quick! }
      if not(cs_opt_level2 in current_settings.optimizerswitches) then
        Exit;
      SetLength(InstrList, 0);
      InstrMax := -1;
      case taicpu(p).opsize of
        S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
          begin
            {$if defined(i386) or defined(i8086)}
            { If the target size is 8-bit, make sure we can actually encode it }
            if not (GetSupReg(ThisReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
              Exit;
            {$endif i386 or i8086}
            LowerLimit := $FF;
            SignedLowerLimit := $7F;
            SignedLowerLimitBottom := -128;
            MinSize := S_B;
            if taicpu(p).opsize = S_BW then
              begin
                MaxSize := S_W;
                UpperLimit := $FFFF;
                SignedUpperLimit := $7FFF;
                SignedUpperLimitBottom := -32768;
              end
            else
              begin
                { Keep at a 32-bit limit for BQ as well since one can't really optimise otherwise }
                MaxSize := S_L;
                UpperLimit := $FFFFFFFF;
                SignedUpperLimit := $7FFFFFFF;
                SignedUpperLimitBottom := -2147483648;
              end;
          end;
        S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
          begin
            { Keep at a 32-bit limit for WQ as well since one can't really optimise otherwise }
            LowerLimit := $FFFF;
            SignedLowerLimit := $7FFF;
            SignedLowerLimitBottom := -32768;
            UpperLimit := $FFFFFFFF;
            SignedUpperLimit := $7FFFFFFF;
            SignedUpperLimitBottom := -2147483648;
            MinSize := S_W;
            MaxSize := S_L;
          end;
        {$ifdef x86_64}
        S_LQ:
          begin
            { Both the lower and upper limits are set to 32-bit.  If a limit
              is breached, then optimisation is impossible }
            LowerLimit := $FFFFFFFF;
            SignedLowerLimit := $7FFFFFFF;
            SignedLowerLimitBottom := -2147483648;
            UpperLimit := $FFFFFFFF;
            SignedUpperLimit := $7FFFFFFF;
            SignedUpperLimitBottom := -2147483648;
            MinSize := S_L;
            MaxSize := S_L;
          end;
        {$endif x86_64}
        else
          InternalError(2020112301);
      end;
      TestValMin := 0;
      TestValMax := LowerLimit;
      TestValSignedMax := SignedLowerLimit;
      TryShiftDownLimit := LowerLimit;
      TryShiftDown := S_NO;
      ShiftDownOverflow := False;
      RegChanged := False;
      BitwiseOnly := True;
      OrXorUsed := False;
      UpperSignedOverflow := False;
      LowerSignedOverflow := False;
      UpperUnsignedOverflow := False;
      LowerUnsignedOverflow := False;
      hp1 := p;
      while GetNextInstructionUsingReg(hp1, hp1, ThisReg) and
        (hp1.typ = ait_instruction) and
        (
          { Under -O1 and -O2, GetNextInstructionUsingReg may return an
            instruction that doesn't actually contain ThisReg }
          (cs_opt_level3 in current_settings.optimizerswitches) or
          { This allows this Movx optimisation to work through the SETcc instructions
            inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
            optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
            skip over these SETcc instructions). }
          (taicpu(hp1).opcode = A_SETcc) or
          RegInInstruction(ThisReg, hp1)
        ) do
        begin
          case taicpu(hp1).opcode of
            A_INC,A_DEC:
              begin
                { Has to be an exact match on the register }
                if not MatchOperand(taicpu(hp1).oper[0]^, ThisReg) then
                  Break;
                if taicpu(hp1).opcode = A_INC then
                  begin
                    Inc(TestValMin);
                    Inc(TestValMax);
                    Inc(TestValSignedMax);
                  end
                else
                  begin
                    Dec(TestValMin);
                    Dec(TestValMax);
                    Dec(TestValSignedMax);
                  end;
              end;
            A_TEST, A_CMP:
              begin
                if (
                    { Too high a risk of non-linear behaviour that breaks DFA
                      here, unless it's cmp $0,%reg, which is equivalent to
                      test %reg,%reg }
                    OrXorUsed and
                    (taicpu(hp1).opcode = A_CMP) and
                    not MatchOperand(taicpu(hp1).oper[0]^, 0)
                  ) or
                  (taicpu(hp1).oper[1]^.typ <> top_reg) or
                  { Has to be an exact match on the register }
                  (taicpu(hp1).oper[1]^.reg <> ThisReg) or
                  (
                    { Permit "test %reg,%reg" }
                    (taicpu(hp1).opcode = A_TEST) and
                    (taicpu(hp1).oper[0]^.typ = top_reg) and
                    (taicpu(hp1).oper[0]^.reg <> ThisReg)
                  ) or
                  (taicpu(hp1).oper[0]^.typ <> top_const) or
                  { Make sure the comparison value is not smaller than the
                    smallest allowed signed value for the minimum size (e.g.
                    -128 for 8-bit) }
                  not (
                    ((taicpu(hp1).oper[0]^.val and LowerLimit) = taicpu(hp1).oper[0]^.val) or
                    { Is it in the negative range? }
                    (
                      (taicpu(hp1).oper[0]^.val < 0) and
                      (taicpu(hp1).oper[0]^.val >= SignedLowerLimitBottom)
                    )
                  ) then
                  Break;
                { Check to see if the active register is used afterwards }
                TransferUsedRegs(TmpUsedRegs);
                IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
                if not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
                  begin
                    { Make sure the comparison and any previous instructions
                      haven't pushed the test values outside of the range of
                      MinSize }
                    if LowerUnsignedOverflow and not UpperUnsignedOverflow then
                      begin
                        { Exceeded lower bound but not upper bound }
                        Exit;
                      end
                    else if not LowerSignedOverflow or not LowerUnsignedOverflow then
                      begin
                        { Size didn't exceed lower bound }
                        TargetSize := MinSize;
                      end
                    else
                      Break;
                    case TargetSize of
                      S_B:
                        TargetSubReg := R_SUBL;
                      S_W:
                        TargetSubReg := R_SUBW;
                      S_L:
                        TargetSubReg := R_SUBD;
                      else
                        InternalError(2021051002);
                    end;
                    if TargetSize <> MaxSize then
                      begin
                        { Update the register to its new size }
                        setsubreg(ThisReg, TargetSubReg);
                        DebugMsg(SPeepholeOptimization + 'CMP instruction resized thanks to register size optimisation (see MOV/Z assignment above)', hp1);
                        taicpu(hp1).oper[1]^.reg := ThisReg;
                        taicpu(hp1).opsize := TargetSize;
                        { Convert the input MOVZX to a MOV if necessary }
                        AdjustInitialLoadAndSize;
                        if (InstrMax >= 0) then
                          begin
                            for Index := 0 to InstrMax do
                              begin
                                { If p_removed is true, then the original MOV/Z was removed
                                  and removing the AND instruction may not be safe if it
                                  appears first }
                                if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
                                  InternalError(2020112311);
                                if InstrList[Index].oper[0]^.typ = top_reg then
                                  InstrList[Index].oper[0]^.reg := ThisReg;
                                InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
                                InstrList[Index].opsize := MinSize;
                              end;
                          end;
                        Result := True;
                      end;
                    Exit;
                  end;
              end;
            A_SETcc:
              begin
                { This allows this Movx optimisation to work through the SETcc instructions
                  inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
                  optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
                  skip over these SETcc instructions). }
                if (cs_opt_level3 in current_settings.optimizerswitches) or
                  { Of course, break out if the current register is used }
                  RegInOp(ThisReg, taicpu(hp1).oper[0]^) then
                  Break
                else
                  { We must use Continue so the instruction doesn't get added
                    to InstrList }
                  Continue;
              end;
            A_ADD,A_SUB,A_AND,A_OR,A_XOR,A_SHL,A_SHR,A_SAR:
              begin
                if
                  (taicpu(hp1).oper[1]^.typ <> top_reg) or
                  { Has to be an exact match on the register }
                  (taicpu(hp1).oper[1]^.reg <> ThisReg) or not
                  (
                    (
                      (taicpu(hp1).oper[0]^.typ = top_const) and
                      (
                        (
                          (taicpu(hp1).opcode = A_SHL) and
                          (
                            ((MinSize = S_B) and (taicpu(hp1).oper[0]^.val < 8)) or
                            ((MinSize = S_W) and (taicpu(hp1).oper[0]^.val < 16)) or
                            ((MinSize = S_L) and (taicpu(hp1).oper[0]^.val < 32))
                          )
                        ) or (
                          (taicpu(hp1).opcode <> A_SHL) and
                          (
                            ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
                            { Is it in the negative range? }
                            (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
                          )
                        )
                      )
                    ) or (
                      MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) and
                      ((taicpu(hp1).opcode = A_ADD) or (taicpu(hp1).opcode = A_AND) or (taicpu(hp1).opcode = A_SUB))
                    )
                  ) then
                  Break;
                { Only process OR and XOR if there are only bitwise operations,
                  since otherwise they can too easily fool the data flow
                  analysis (they can cause non-linear behaviour) }
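                { Worked example of the non-linearity (illustrative, not from
                  the source): with a tracked range of 0..3, "xorl $2,%reg"
                  maps 0->2, 1->3, 2->0 and 3->1, so the transformed endpoints
                  (0->2 and 3->1) no longer bracket the true minimum and
                  maximum.  Interval tracking only stays sound here while
                  every prior operation has been bitwise. }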
                case taicpu(hp1).opcode of
                  A_ADD:
                    begin
                      if OrXorUsed then
                        { Too high a risk of non-linear behaviour that breaks DFA here }
                        Break
                      else
                        BitwiseOnly := False;
                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
                        begin
                          TestValMin := TestValMin * 2;
                          TestValMax := TestValMax * 2;
                          TestValSignedMax := TestValSignedMax * 2;
                        end
                      else
                        begin
                          WorkingValue := taicpu(hp1).oper[0]^.val;
                          TestValMin := TestValMin + WorkingValue;
                          TestValMax := TestValMax + WorkingValue;
                          TestValSignedMax := TestValSignedMax + WorkingValue;
                        end;
                    end;
                  A_SUB:
                    begin
                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
                        begin
                          TestValMin := 0;
                          TestValMax := 0;
                          TestValSignedMax := 0;
                        end
                      else
                        begin
                          if OrXorUsed then
                            { Too high a risk of non-linear behaviour that breaks DFA here }
                            Break
                          else
                            BitwiseOnly := False;
                          WorkingValue := taicpu(hp1).oper[0]^.val;
                          TestValMin := TestValMin - WorkingValue;
                          TestValMax := TestValMax - WorkingValue;
                          TestValSignedMax := TestValSignedMax - WorkingValue;
                        end;
                    end;
                  A_AND:
                    if (taicpu(hp1).oper[0]^.typ = top_const) then
                      begin
                        { we might be able to go smaller if AND appears first }
                        if InstrMax = -1 then
                          case MinSize of
                            S_B:
                              ;
                            S_W:
                              if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
                                ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
                                begin
                                  TryShiftDown := S_B;
                                  TryShiftDownLimit := $FF;
                                end;
                            S_L:
                              if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
                                ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
                                begin
                                  TryShiftDown := S_B;
                                  TryShiftDownLimit := $FF;
                                end
                              else if ((taicpu(hp1).oper[0]^.val and $FFFF) = taicpu(hp1).oper[0]^.val) or
                                ((not(taicpu(hp1).oper[0]^.val) and $7FFF) = (not taicpu(hp1).oper[0]^.val)) then
                                begin
                                  TryShiftDown := S_W;
                                  TryShiftDownLimit := $FFFF;
                                end;
                            else
                              InternalError(2020112320);
                          end;
  11276. WorkingValue := taicpu(hp1).oper[0]^.val;
  11277. TestValMin := TestValMin and WorkingValue;
  11278. TestValMax := TestValMax and WorkingValue;
  11279. TestValSignedMax := TestValSignedMax and WorkingValue;
  11280. end;
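                  { Illustrative sketch: if "andl $15,%edx" is the first
                    instruction after the extension, the value is known to fit
                    in a byte from then on, so TryShiftDown = S_B allows
                    CompressInstructions to rewrite the block with byte-sized
                    registers later. }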
                  A_OR:
                    begin
                      if not BitwiseOnly then
                        Break;
                      OrXorUsed := True;
                      WorkingValue := taicpu(hp1).oper[0]^.val;
                      TestValMin := TestValMin or WorkingValue;
                      TestValMax := TestValMax or WorkingValue;
                      TestValSignedMax := TestValSignedMax or WorkingValue;
                    end;
                  A_XOR:
                    begin
                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
                        begin
                          TestValMin := 0;
                          TestValMax := 0;
                          TestValSignedMax := 0;
                        end
                      else
                        begin
                          if not BitwiseOnly then
                            Break;
                          OrXorUsed := True;
                          WorkingValue := taicpu(hp1).oper[0]^.val;
                          TestValMin := TestValMin xor WorkingValue;
                          TestValMax := TestValMax xor WorkingValue;
                          TestValSignedMax := TestValSignedMax xor WorkingValue;
                        end;
                    end;
                  A_SHL:
                    begin
                      BitwiseOnly := False;
                      WorkingValue := taicpu(hp1).oper[0]^.val;
                      TestValMin := TestValMin shl WorkingValue;
                      TestValMax := TestValMax shl WorkingValue;
                      TestValSignedMax := TestValSignedMax shl WorkingValue;
                    end;
                  A_SHR,
                  { The first instruction was MOVZX, so the value won't be negative }
                  A_SAR:
                    begin
                      if InstrMax <> -1 then
                        BitwiseOnly := False
                      else
                        { we might be able to go smaller if SHR appears first }
                        case MinSize of
                          S_B:
                            ;
                          S_W:
                            if (taicpu(hp1).oper[0]^.val >= 8) then
                              begin
                                TryShiftDown := S_B;
                                TryShiftDownLimit := $FF;
                                TryShiftDownSignedLimit := $7F;
                                TryShiftDownSignedLimitLower := -128;
                              end;
                          S_L:
                            if (taicpu(hp1).oper[0]^.val >= 24) then
                              begin
                                TryShiftDown := S_B;
                                TryShiftDownLimit := $FF;
                                TryShiftDownSignedLimit := $7F;
                                TryShiftDownSignedLimitLower := -128;
                              end
                            else if (taicpu(hp1).oper[0]^.val >= 16) then
                              begin
                                TryShiftDown := S_W;
                                TryShiftDownLimit := $FFFF;
                                TryShiftDownSignedLimit := $7FFF;
                                TryShiftDownSignedLimitLower := -32768;
                              end;
                          else
                            InternalError(2020112321);
                        end;
                      WorkingValue := taicpu(hp1).oper[0]^.val;
                      if taicpu(hp1).opcode = A_SAR then
                        begin
                          TestValMin := SarInt64(TestValMin, WorkingValue);
                          TestValMax := SarInt64(TestValMax, WorkingValue);
                          TestValSignedMax := SarInt64(TestValSignedMax, WorkingValue);
                        end
                      else
                        begin
                          TestValMin := TestValMin shr WorkingValue;
                          TestValMax := TestValMax shr WorkingValue;
                          TestValSignedMax := TestValSignedMax shr WorkingValue;
                        end;
                    end;
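                  { Illustrative sketch: "shrl $24,%edx" as the first
                    instruction leaves at most 8 significant bits, so
                    TryShiftDown = S_B and the whole block may later be
                    reworked with byte-sized registers. }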
                  else
                    InternalError(2020112303);
                end;
              end;
(*
            A_IMUL:
              case taicpu(hp1).ops of
                2:
                  begin
                    if not MatchOpType(hp1, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[0]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) then
                      Break;
                    TestValMin := TestValMin * TestValMin;
                    TestValMax := TestValMax * TestValMax;
                    TestValSignedMax := TestValSignedMax * TestValSignedMax;
                  end;
                3:
                  begin
                    if not MatchOpType(hp1, top_const, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[2]^.reg <> ThisReg) or
                      ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
                      { Is it in the negative range? }
                      (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
                      Break;
                    TestValMin := TestValMin * taicpu(hp1).oper[0]^.val;
                    TestValMax := TestValMax * taicpu(hp1).oper[0]^.val;
                    TestValSignedMax := TestValSignedMax * taicpu(hp1).oper[0]^.val;
                  end;
                else
                  Break;
              end;
            A_IDIV:
              case taicpu(hp1).ops of
                3:
                  begin
                    if not MatchOpType(hp1, top_const, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[2]^.reg <> ThisReg) or
                      ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
                      { Is it in the negative range? }
                      (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
                      Break;
                    TestValMin := TestValMin div taicpu(hp1).oper[0]^.val;
                    TestValMax := TestValMax div taicpu(hp1).oper[0]^.val;
                    TestValSignedMax := TestValSignedMax div taicpu(hp1).oper[0]^.val;
                  end;
                else
                  Break;
              end;
*)
            A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
              begin
                { If there are no instructions in between, then we might be able to make a saving }
                if UpperSignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) or (taicpu(hp1).oper[0]^.reg <> ThisReg) then
                  Break;
                { We have something like:
                    movzbw %dl,%dx
                    ...
                    movswl %dx,%edx
                  Change the latter to a zero-extension then enter the
                  A_MOVZX case branch.
                }
{$ifdef x86_64}
                if (taicpu(hp1).opsize = S_LQ) and SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
                  begin
                    { this becomes a zero extension from 32-bit to 64-bit, but
                      the upper 32 bits are already zero, so just delete the
                      instruction }
                    DebugMsg(SPeepholeOptimization + 'MovzMovsxd2MovzNop', hp1);
                    RemoveInstruction(hp1);
                    Result := True;
                    Exit;
                  end
                else
{$endif x86_64}
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzMovs2MovzMovz', hp1);
                    taicpu(hp1).opcode := A_MOVZX;
{$ifdef x86_64}
                    case taicpu(hp1).opsize of
                      S_BQ:
                        begin
                          taicpu(hp1).opsize := S_BL;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end;
                      S_WQ:
                        begin
                          taicpu(hp1).opsize := S_WL;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end;
                      S_LQ:
                        begin
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).opsize := S_L;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                          { In this instance, we need to break out because the
                            instruction is no longer MOVZX or MOVSXD }
                          Result := True;
                          Exit;
                        end;
                      else
                        ;
                    end;
{$endif x86_64}
                    Result := CompressInstructions;
                    Exit;
                  end;
              end;
            A_MOVZX:
              begin
                if UpperUnsignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) then
                  Break;
                if (InstrMax = -1) then
                  begin
                    if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
                      begin
                        { Optimise around i40003 }
                        { Check to see if the active register is used afterwards }
                        TransferUsedRegs(TmpUsedRegs);
                        IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
                        if (
                          SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) or
                          not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs)
                        ) and
                          (taicpu(p).opsize = S_WL) and (taicpu(hp1).opsize = S_BL)
{$ifndef x86_64}
                          and (
                            (taicpu(p).oper[0]^.typ <> top_reg) or
                            { Cannot encode byte-sized ESI, EDI, EBP or ESP under i386 }
                            (GetSupReg(taicpu(p).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])
                          )
{$endif not x86_64}
                          then
                          begin
                            if (taicpu(p).oper[0]^.typ = top_reg) then
                              setsubreg(taicpu(p).oper[0]^.reg, R_SUBL);
                            DebugMsg(SPeepholeOptimization + 'movzwl2movzbl 1', p);
                            taicpu(p).opsize := S_BL;
                            { Only remove if the active register is overwritten }
                            if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
                              begin
                                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2a', hp1);
                                RemoveInstruction(hp1);
                              end;
                            Result := True;
                            Exit;
                          end;
                      end
                    else
                      begin
                        { Will return false if the second parameter isn't ThisReg
                          (can happen on -O2 and under) }
                        if Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ThisReg) then
                          begin
                            { The two MOVZX instructions are adjacent, so remove the first one }
                            DebugMsg(SPeepholeOptimization + 'Movzx2Nop 5', p);
                            RemoveCurrentP(p);
                            Result := True;
                            Exit;
                          end;
                        Break;
                      end;
                  end;
                Result := CompressInstructions;
                Exit;
              end;
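            { Illustrative sketches of the MOVZX/MOVZX folds above:
                movzwl %cx,%eax; movzbl %al,%eax  ->  movzbl %cl,%eax  (movzwl2movzbl 1 + Movzx2Nop 2a)
                movzbl %dl,%eax; movzbl %cl,%eax  ->  movzbl %cl,%eax  (Movzx2Nop 5)
            }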
            else
              { This includes ADC, SBB and IDIV }
              Break;
          end;
          if not CheckOverflowConditions then
            Break;
          { Contains highest index (so instruction count - 1) }
          Inc(InstrMax);
          if InstrMax > High(InstrList) then
            SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
          InstrList[InstrMax] := taicpu(hp1);
        end;
    end;
{$pop}
  function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
    var
      hp1 : tai;
    begin
      Result:=false;
      if (taicpu(p).ops >= 2) and
        ((taicpu(p).oper[0]^.typ = top_const) or
          ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        ((taicpu(p).ops = 2) or
          ((taicpu(p).oper[2]^.typ = top_reg) and
            (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
        GetLastInstruction(p,hp1) and
        MatchInstruction(hp1,A_MOV,[]) and
        MatchOpType(taicpu(hp1),top_reg,top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
            ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
            { change
                mov  reg1,reg2
                imul y,reg2
              to
                imul y,reg1,reg2 }
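            { e.g. (sketch):  movl %esi,%eax; imull $5,%eax  ->  imull $5,%esi,%eax }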
            begin
              taicpu(p).ops := 3;
              taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
              taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
              DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
              RemoveInstruction(hp1);
              result:=true;
            end;
        end;
    end;
  procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
    var
      ThisLabel: TAsmLabel;
    begin
      ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
      ThisLabel.decrefs;
      taicpu(p).condition := C_None;
      taicpu(p).opcode := A_RET;
      taicpu(p).is_jmp := false;
      taicpu(p).ops := taicpu(ret_p).ops;
      case taicpu(ret_p).ops of
        0:
          taicpu(p).clearop(0);
        1:
          taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
        else
          internalerror(2016041301);
      end;
      { If the original label is now dead, it might turn out that the label
        immediately follows p. As a result, everything beyond it, which will
        be just some final register configuration and a RET instruction, is
        now dead code. [Kit] }
      { NOTE: This is much faster than introducing an OptPass2RET routine and
        running RemoveDeadCodeAfterJump for each RET instruction, because
        this optimisation rarely happens and most RETs appear at the end of
        routines where there is nothing that can be stripped. [Kit] }
      if not ThisLabel.is_used then
        RemoveDeadCodeAfterJump(p);
    end;
  function TX86AsmOptimizer.OptPass2SETcc(var p: tai): boolean;
    var
      hp1,hp2,next: tai; SetC, JumpC: TAsmCond;
      Unconditional, PotentialModified: Boolean;
      OperPtr: POper;
      NewRef: TReference;
      InstrList: array of taicpu;
      InstrMax, Index: Integer;
    const
{$ifdef DEBUG_AOPTCPU}
      SNoFlags: shortstring = ' so the flags aren''t modified';
{$else DEBUG_AOPTCPU}
      SNoFlags = '';
{$endif DEBUG_AOPTCPU}
    begin
      Result:=false;
      if MatchOpType(taicpu(p),top_reg) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
        begin
          if MatchInstruction(hp1, A_TEST, [S_B]) and
            MatchOpType(taicpu(hp1),top_reg,top_reg) and
            (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
            (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_Jcc, A_SETcc, []) then
            { Change from:                       To:
                set(C) %reg                      j(~C) label
                test   %reg,%reg/cmp $0,%reg
                je     label

                set(C) %reg                      j(C)  label
                test   %reg,%reg/cmp $0,%reg
                jne    label

              (Also do something similar with sete/setne instead of je/jne)
            }
            begin
              { Before we do anything else, we need to check the instructions
                in between SETcc and TEST to make sure they don't modify the
                FLAGS register - if -O2 or under, there won't be any
                instructions between SET and TEST }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                begin
                  next := p;
                  SetLength(InstrList, 0);
                  InstrMax := -1;
                  PotentialModified := False;
                  { Make a note of every instruction that modifies the FLAGS
                    register }
                  while GetNextInstruction(next, next) and (next <> hp1) do
                    begin
                      if next.typ <> ait_instruction then
                        { GetNextInstructionUsingReg should have returned False }
                        InternalError(2021051701);
                      if RegModifiedByInstruction(NR_DEFAULTFLAGS, next) then
                        begin
                          case taicpu(next).opcode of
                            A_SETcc,
                            A_CMOVcc,
                            A_Jcc:
                              begin
                                if PotentialModified then
                                  { Not safe because the flags were modified earlier }
                                  Exit
                                else
                                  { Condition is the same as the initial SETcc, so this is safe
                                    (don't add to instruction list though) }
                                  Continue;
                              end;
                            A_ADD:
                              begin
                                if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                  (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                  { Must write to a register }
                                  (taicpu(next).oper[1]^.typ <> top_reg) or
                                  { Require a constant or a register }
                                  (taicpu(next).oper[0]^.typ = top_ref) then
                                  Exit;
                                PotentialModified := True;
                              end;
                            A_SUB:
                              begin
                                if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                  (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                  { Must write to a register }
                                  (taicpu(next).oper[1]^.typ <> top_reg) or
                                  { Can't subtract a register with LEA - also check that
                                    the value isn't -2^31, as this can't be negated }
                                  (taicpu(next).oper[0]^.typ <> top_const) or
                                  (taicpu(next).oper[0]^.val = $80000000) then
                                  Exit;
                                PotentialModified := True;
                              end;
                            A_SAL,
                            A_SHL:
                              begin
                                if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                  (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                  { Must write to a register }
                                  (taicpu(next).oper[1]^.typ <> top_reg) or
                                  { Only shifts by 0 to 3 map onto LEA scale factors }
                                  (taicpu(next).oper[0]^.typ <> top_const) or
                                  (taicpu(next).oper[0]^.val < 0) or
                                  (taicpu(next).oper[0]^.val > 3) then
                                  Exit;
                                PotentialModified := True;
                              end;
                            A_IMUL:
                              begin
                                if (taicpu(next).ops <> 3) or
                                  { Must write to a register }
                                  (taicpu(next).oper[1]^.typ <> top_reg) or
                                  not (taicpu(next).oper[0]^.val in [2,3,4,5,8,9]) then
                                  { We can convert "imul x,%reg1,%reg2" (where x = 2, 4 or 8)
                                    to "lea (,%reg1,x),%reg2". If x = 3, 5 or 9, we can
                                    change this to "lea (%reg1,%reg1,(x-1)),%reg2" }
                                  Exit
                                else
                                  PotentialModified := True;
                              end;
                            else
                              { Don't know how to change this, so abort }
                              Exit;
                          end;
                          { Contains highest index (so instruction count - 1) }
                          Inc(InstrMax);
                          if InstrMax > High(InstrList) then
                            SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
                          InstrList[InstrMax] := taicpu(next);
                        end;
                      UpdateUsedRegs(TmpUsedRegs, tai(next.next));
                    end;
                  if not Assigned(next) or (next <> hp1) then
                    { It should be equal to hp1 }
                    InternalError(2021051702);
                  { Cycle through each instruction and check to see if we can
                    change them to versions that don't modify the flags }
                  if (InstrMax >= 0) then
                    begin
                      for Index := 0 to InstrMax do
                        case InstrList[Index].opcode of
                          A_ADD:
                            begin
                              DebugMsg(SPeepholeOptimization + 'ADD -> LEA' + SNoFlags, InstrList[Index]);
                              InstrList[Index].opcode := A_LEA;
                              reference_reset(NewRef, 1, []);
                              NewRef.base := InstrList[Index].oper[1]^.reg;
                              if InstrList[Index].oper[0]^.typ = top_reg then
                                begin
                                  NewRef.index := InstrList[Index].oper[0]^.reg;
                                  NewRef.scalefactor := 1;
                                end
                              else
                                NewRef.offset := InstrList[Index].oper[0]^.val;
                              InstrList[Index].loadref(0, NewRef);
                            end;
                          A_SUB:
                            begin
                              DebugMsg(SPeepholeOptimization + 'SUB -> LEA' + SNoFlags, InstrList[Index]);
                              InstrList[Index].opcode := A_LEA;
                              reference_reset(NewRef, 1, []);
                              NewRef.base := InstrList[Index].oper[1]^.reg;
                              NewRef.offset := -InstrList[Index].oper[0]^.val;
                              InstrList[Index].loadref(0, NewRef);
                            end;
                          A_SHL,
                          A_SAL:
                            begin
                              DebugMsg(SPeepholeOptimization + 'SHL -> LEA' + SNoFlags, InstrList[Index]);
                              InstrList[Index].opcode := A_LEA;
                              reference_reset(NewRef, 1, []);
                              NewRef.index := InstrList[Index].oper[1]^.reg;
                              NewRef.scalefactor := 1 shl (InstrList[Index].oper[0]^.val);
                              InstrList[Index].loadref(0, NewRef);
                            end;
                          A_IMUL:
                            begin
                              DebugMsg(SPeepholeOptimization + 'IMUL -> LEA' + SNoFlags, InstrList[Index]);
                              InstrList[Index].opcode := A_LEA;
                              reference_reset(NewRef, 1, []);
                              NewRef.index := InstrList[Index].oper[1]^.reg;
                              case InstrList[Index].oper[0]^.val of
                                2, 4, 8:
                                  NewRef.scalefactor := InstrList[Index].oper[0]^.val;
                                else {3, 5 and 9}
                                  begin
                                    NewRef.scalefactor := InstrList[Index].oper[0]^.val - 1;
                                    NewRef.base := InstrList[Index].oper[1]^.reg;
                                  end;
                              end;
                              InstrList[Index].loadref(0, NewRef);
                            end;
                          else
                            InternalError(2021051710);
                        end;
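                      { Illustrative sketches of the flag-preserving rewrites
                        above (LEA does not alter the FLAGS register):
                          addl  $8,%eax        ->  leal 8(%eax),%eax
                          subl  $8,%eax        ->  leal -8(%eax),%eax
                          shll  $2,%eax        ->  leal (,%eax,4),%eax
                          imull $3,%eax,%edx   ->  leal (%eax,%eax,2),%edx
                      }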
                    end;
                  { Mark the FLAGS register as used across this whole block }
                  AllocRegBetween(NR_DEFAULTFLAGS, p, hp1, UsedRegs);
                end;
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              JumpC := taicpu(hp2).condition;
              Unconditional := False;
              if conditions_equal(JumpC, C_E) then
                SetC := inverse_cond(taicpu(p).condition)
              else if conditions_equal(JumpC, C_NE) then
                SetC := taicpu(p).condition
              else
                { We've got something weird here (and inefficient) }
                begin
                  DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                  SetC := C_NONE;
                  { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                  if condition_in(C_AE, JumpC) then
                    Unconditional := True
                  else
                    { Not sure what to do with this jump - drop out }
                    Exit;
                end;
              RemoveInstruction(hp1);
              if Unconditional then
                MakeUnconditional(taicpu(hp2))
              else
                begin
                  if SetC = C_NONE then
                    InternalError(2018061402);
                  taicpu(hp2).SetCondition(SetC);
                end;
              { As hp2 is a jump, we cannot use RegUsedAfterInstruction but we
                have to check if it is included in TmpUsedRegs }
              if not TmpUsedRegs[getregtype(taicpu(p).oper[0]^.reg)].IsUsed(taicpu(p).oper[0]^.reg) then
                begin
                  RemoveCurrentp(p, hp2);
                  if taicpu(hp2).opcode = A_SETcc then
                    DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc',p)
                  else
                    begin
                      DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> Jcc',p);
                      if (cs_opt_level3 in current_settings.optimizerswitches) then
                        Include(OptsToCheck, aoc_DoPass2JccOpts);
                    end;
                end
              else
                if taicpu(hp2).opcode = A_SETcc then
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc/SETcc',p)
                else
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> SETcc/Jcc',p);
              Result := True;
            end
          else if
            { Make sure the instructions are adjacent }
            (
              not (cs_opt_level3 in current_settings.optimizerswitches) or
              GetNextInstruction(p, hp1)
            ) and
            MatchInstruction(hp1, A_MOV, [S_B]) and
            { Writing to memory is allowed }
            MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) then
            begin
              {
                Watch out for sequences such as:
                  set(c)b %regb
                  movb    %regb,(ref)
                  movb    $0,1(ref)
                  movb    $0,2(ref)
                  movb    $0,3(ref)
                Much more efficient to turn it into:
                  movl    $0,%regl
                  set(c)b %regb
                  movl    %regl,(ref)
                Or:
                  set(c)b %regb
                  movzbl  %regb,%regl
                  movl    %regl,(ref)
              }
              if (taicpu(hp1).oper[1]^.typ = top_ref) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_MOV, [S_B]) and
                (taicpu(hp2).oper[1]^.typ = top_ref) and
                CheckMemoryWrite(taicpu(hp1), taicpu(hp2)) then
                begin
                  { Don't do anything else except set Result to True }
                end
              else
                begin
                  if taicpu(p).oper[0]^.typ = top_reg then
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                    end;
                  { If it's not a register, it's a memory address }
                  if (taicpu(p).oper[0]^.typ <> top_reg) or RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
                    begin
                      { Even if the register is still in use, we can minimise the
                        pipeline stall by changing the MOV into another SETcc. }
                      taicpu(hp1).opcode := A_SETcc;
                      taicpu(hp1).condition := taicpu(p).condition;
                      if taicpu(hp1).oper[1]^.typ = top_ref then
                        begin
                          { Swapping the operand pointers like this is probably a
                            bit naughty, but it is far faster than using loadoper
                            to transfer the reference from oper[1] to oper[0] if
                            you take into account the extra procedure calls and
                            the memory allocation and deallocation required }
                          OperPtr := taicpu(hp1).oper[1];
                          taicpu(hp1).oper[1] := taicpu(hp1).oper[0];
                          taicpu(hp1).oper[0] := OperPtr;
                        end
                      else
                        taicpu(hp1).oper[0]^.reg := taicpu(hp1).oper[1]^.reg;
                      taicpu(hp1).clearop(1);
                      taicpu(hp1).ops := 1;
                      DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc/SETcc',p);
                    end
                  else
                    begin
                      if taicpu(hp1).oper[1]^.typ = top_reg then
                        AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
                      taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                      RemoveInstruction(hp1);
                      DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p);
                    end
                end;
              Result := True;
            end;
        end;
    end;
  function TX86AsmOptimizer.TryCmpCMovOpts(var p, hp1: tai): Boolean;
    var
      hp2, pCond, pFirstMOV, pLastMOV, pCMOV: tai;
      TargetReg: TRegister;
      condition, inverted_condition: TAsmCond;
      FoundMOV: Boolean;
    begin
      Result := False;
      { In some situations, the CMOV optimisations in OptPass2Jcc can't
        create the most optimal instructions possible due to limited
        register availability, and there are situations where two
        complementary "simple" CMOV blocks are created which, after the
        fact, can be merged into a "double" block. For example:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            testw   $16,18(%rcx)
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
            movw    $1283,%ax
            movw    $4,%r8w
            movl    $9,%r9d
            cmovnew %ax,%dx
            cmovnew %r8w,%bx
            cmovnel %r9d,%r14d
        The CMOVNE instructions at the end can be removed, and the
        destination registers copied into the MOV instructions directly
        above them, before finally being moved to before the first CMOVE
        instructions, to produce:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            testw   $16,18(%rcx)
            movw    $1283,%dx
            movw    $4,%bx
            movl    $9,%r14d
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
        Which can then be later optimised to:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            movw    $1283,%dx
            movw    $4,%bx
            movl    $9,%r14d
            testw   $16,18(%rcx)
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
      }
      TargetReg := taicpu(hp1).oper[1]^.reg;
      condition := taicpu(hp1).condition;
      inverted_condition := inverse_cond(condition);
      pFirstMov := nil;
      pLastMov := nil;
      pCMOV := nil;
      if (p.typ = ait_instruction) then
        pCond := p
      else if not GetNextInstruction(p, pCond) then
        InternalError(2024012501);
      if not MatchInstruction(pCond, A_CMP, A_TEST, []) then
        { We should get the CMP or TEST instruction }
        InternalError(2024012502);
      if (
        (taicpu(hp1).oper[0]^.typ = top_reg) or
        IsRefSafe(taicpu(hp1).oper[0]^.ref)
      ) then
        begin
          { We have to tread carefully here, which is why we're not using
            GetNextInstructionUsingReg... we can only accept MOV and other
            CMOV instructions; anything else and we must drop out }
          hp2 := hp1;
          while GetNextInstruction(hp2, hp2) and (hp2 <> BlockEnd) do
            begin
              if (hp2.typ <> ait_instruction) then
                Exit;
              case taicpu(hp2).opcode of
                A_MOV:
                  begin
                    if not Assigned(pFirstMov) then
                      pFirstMov := hp2;
                    pLastMOV := hp2;
                    if not MatchOpType(taicpu(hp2), top_const, top_reg) then
                      { Something different - drop out }
                      Exit;
                    { Otherwise, leave it for now }
                  end;
                A_CMOVcc:
                  begin
                    if taicpu(hp2).condition = inverted_condition then
                      begin
                        { We found what we're looking for }
                        if taicpu(hp2).oper[1]^.reg = TargetReg then
                          begin
                            if (taicpu(hp2).oper[0]^.typ = top_reg) or
                              IsRefSafe(taicpu(hp2).oper[0]^.ref) then
                              begin
                                pCMOV := hp2;
                                Break;
                              end
                            else
                              { Unsafe reference - drop out }
                              Exit;
                          end;
                      end
                    else if taicpu(hp2).condition <> condition then
                      { Something weird - drop out }
                      Exit;
                  end;
                else
                  { Invalid }
                  Exit;
              end;
            end;
          if not Assigned(pCMOV) then
            { No complementary CMOV found }
            Exit;
          if not Assigned(pFirstMov) or (taicpu(pCMOV).oper[0]^.typ = top_ref) then
            begin
              { Don't need to do anything special or search for a matching MOV }
              Asml.Remove(pCMOV);
              if RegInInstruction(TargetReg, pCond) then
                { Make sure we don't overwrite the register if it's being used in the condition }
                Asml.InsertAfter(pCMOV, pCond)
              else
                Asml.InsertBefore(pCMOV, pCond);
              taicpu(pCMOV).opcode := A_MOV;
              taicpu(pCMOV).condition := C_None;
              { Don't need to worry about allocating new registers in these cases }
              DebugMsg(SPeepholeOptimization + 'CMovCMov2MovCMov 2', pCMOV);
              Result := True;
              Exit;
            end
          else
            begin
              DebugMsg(SPeepholeOptimization + 'CMovCMov2MovCMov 1', hp1);
              FoundMOV := False;
              { Search for the MOV that sets the target register }
              hp2 := pFirstMov;
              repeat
                if (taicpu(hp2).opcode = A_MOV) and
                  (taicpu(hp2).oper[1]^.typ = top_reg) and
                  SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(pCMOV).oper[0]^.reg) then
                  begin
                    { Change the destination }
                    taicpu(hp2).loadreg(1, newreg(R_INTREGISTER, getsupreg(TargetReg), getsubreg(taicpu(hp2).oper[1]^.reg)));
                    if not FoundMOV then
                      begin
                        FoundMOV := True;
                        { Make sure the register is allocated }
                        AllocRegBetween(TargetReg, p, hp2, UsedRegs);
                      end;
                    hp1 := tai(hp2.Previous);
                    Asml.Remove(hp2);
                    if RegInInstruction(TargetReg, pCond) then
                      { Make sure we don't overwrite the register if it's being used in the condition }
                      Asml.InsertAfter(hp2, pCond)
                    else
                      Asml.InsertBefore(hp2, pCond);
                    if (hp2 = pLastMov) then
                      { This MOV was the last one; since it has just been
                        moved, the loop's "hp2 = pLastMOV" condition can no
                        longer trigger, so break out explicitly }
                      Break;
                    hp2 := hp1;
                  end;
              until (hp2 = pLastMOV) or not GetNextInstruction(hp2, hp2) or (hp2 = BlockEnd) or (hp2.typ <> ait_instruction);
              if FoundMOV then
                { Delete the CMOV }
                RemoveInstruction(pCMOV)
              else
                begin
                  { If no MOV was found, we have to actually move and transmute the CMOV }
                  Asml.Remove(pCMOV);
                  if RegInInstruction(TargetReg, pCond) then
                    { Make sure we don't overwrite the register if it's being used in the condition }
                    Asml.InsertAfter(pCMOV, pCond)
                  else
                    Asml.InsertBefore(pCMOV, pCond);
                  taicpu(pCMOV).opcode := A_MOV;
                  taicpu(pCMOV).condition := C_None;
                end;
              Result := True;
              Exit;
            end;
        end;
    end;
  function TX86AsmOptimizer.OptPass2Cmp(var p: tai): Boolean;
    var
      hp1, hp2, pCond: tai;
    begin
      Result := False;
      { Search ahead for CMOV instructions }
      if (cs_opt_level2 in current_settings.optimizerswitches) then
        begin
          hp1 := p;
          hp2 := p;
          pCond := nil; { To prevent compiler warnings }
          { For TryCmpCMovOpts, try to insert MOVs before the allocation of
            DEFAULTFLAGS }
          if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, p), pCond) or
            (tai_regalloc(pCond).ratype = ra_dealloc) then
            pCond := p;
          while GetNextInstruction(hp1, hp1) and (hp1 <> BlockEnd) do
            begin
              if (hp1.typ <> ait_instruction) then
                { Break out on markers and labels etc. }
                Break;
              case taicpu(hp1).opcode of
                A_MOV:
                  { Ignore regular MOVs unless they are obviously not related
                    to a CMOV block }
                  if taicpu(hp1).oper[1]^.typ <> top_reg then
                    Break;
                A_CMOVcc:
                  if TryCmpCMovOpts(pCond, hp1) then
                    begin
                      hp1 := hp2;
                      { p itself isn't changed, and we're still inside a
                        while loop to catch subsequent CMOVs, so just flag
                        a new iteration }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      Continue;
                    end;
                else
                  { Drop out if we find anything else }
                  Break;
              end;
              hp2 := hp1;
            end;
        end;
    end;
  function TX86AsmOptimizer.OptPass2Test(var p: tai): Boolean;
    var
      hp1, hp2, pCond: tai;
      SourceReg, TargetReg: TRegister;
    begin
      Result := False;
      { In some situations, we end up with an inefficient arrangement of
        instructions in the form of:
          or   %reg1,%reg2
          (%reg1 deallocated)
          test %reg2,%reg2
          mov  x,%reg2
        We may be able to swap and rearrange the registers to produce:
          or   %reg2,%reg1
          mov  x,%reg2
          test %reg1,%reg1
          (%reg1 deallocated)
      }
      if (cs_opt_level3 in current_settings.optimizerswitches) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        (
          MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) or
          MatchOperand(taicpu(p).oper[0]^, -1)
        ) and
        GetNextInstruction(p, hp1) and
        MatchInstruction(hp1, A_MOV, []) and
        (taicpu(hp1).oper[1]^.typ = top_reg) and
        SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
        begin
          TargetReg := taicpu(p).oper[1]^.reg;
          { Now look backwards to find a simple commutative operation: ADD,
            IMUL (2-register version), OR, AND or XOR - whose destination
            register is the same as TEST }
          hp2 := p;
          while GetLastInstruction(hp2, hp2) and (hp2.typ = ait_instruction) do
            if RegInInstruction(TargetReg, hp2) then
              begin
                if MatchInstruction(hp2, [A_ADD, A_IMUL, A_OR, A_AND, A_XOR], [taicpu(p).opsize]) and
                  MatchOpType(taicpu(hp2), top_reg, top_reg) and
                  (taicpu(hp2).oper[1]^.reg = TargetReg) and
                  (taicpu(hp2).oper[0]^.reg <> TargetReg) then
                  begin
                    SourceReg := taicpu(hp2).oper[0]^.reg;
                    if
                      { Make sure the MOV doesn't use the other register }
                      not RegInOp(SourceReg, taicpu(hp1).oper[0]^) and
                      { And make sure the source register is not used afterwards }
                      not RegInUsedRegs(SourceReg, UsedRegs) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'OpTest2OpTest (register swap) done', hp2);
                        taicpu(hp2).oper[0]^.reg := TargetReg;
                        taicpu(hp2).oper[1]^.reg := SourceReg;
                        if taicpu(p).oper[0]^.typ = top_reg then
                          taicpu(p).oper[0]^.reg := SourceReg;
                        taicpu(p).oper[1]^.reg := SourceReg;
                        IncludeRegInUsedRegs(SourceReg, UsedRegs);
                        AllocRegBetween(SourceReg, hp2, p, UsedRegs);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                        { We can still check the following optimisations since
                          the instruction is still a TEST }
                      end;
                  end;
                Break;
              end;
        end;
      { Search ahead for CMOV instructions }
      if (cs_opt_level2 in current_settings.optimizerswitches) then
        begin
          hp1 := p;
          hp2 := p;
          pCond := nil; { To prevent compiler warnings }
          { For TryCmpCMovOpts, try to insert MOVs before the allocation of
            DEFAULTFLAGS }
          if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, p), pCond) or
            (tai_regalloc(pCond).ratype = ra_dealloc) then
            pCond := p;
          while GetNextInstruction(hp1, hp1) and (hp1 <> BlockEnd) do
            begin
              if (hp1.typ <> ait_instruction) then
                { Break out on markers and labels etc. }
                Break;
              case taicpu(hp1).opcode of
                A_MOV:
                  { Ignore regular MOVs unless they are obviously not related
                    to a CMOV block }
                  if taicpu(hp1).oper[1]^.typ <> top_reg then
                    Break;
                A_CMOVcc:
                  if TryCmpCMovOpts(pCond, hp1) then
                    begin
                      hp1 := hp2;
                      { p itself isn't changed, and we're still inside a
                        while loop to catch subsequent CMOVs, so just flag
                        a new iteration }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                      Continue;
                    end;
                else
                  { Drop out if we find anything else }
                  Break;
              end;
              hp2 := hp1;
            end;
        end;
    end;
  function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
    var
      hp1: tai;
      Count: Integer;
      OrigLabel: TAsmLabel;
    begin
      result := False;
      { Sometimes, the optimisations below can permit this }
      RemoveDeadCodeAfterJump(p);
      if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
        (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
        begin
          OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
          { Also a side-effect of optimisations }
          if CollapseZeroDistJump(p, OrigLabel) then
            begin
              Result := True;
              Exit;
            end;
          hp1 := GetLabelWithSym(OrigLabel);
          if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
            begin
              if taicpu(hp1).opcode = A_RET then
                begin
                  {
                    change
                           jmp .L1
                           ...
                       .L1:
                           ret
                    into
                           ret
                  }
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end
              else if (cs_opt_level3 in current_settings.optimizerswitches) and
                not (cs_opt_size in current_settings.optimizerswitches) and
                CheckJumpMovTransferOpt(p, hp1, 0, Count) then
                begin
                  Result := True;
                  Exit;
                end;
            end;
        end;
    end;
  class function TX86AsmOptimizer.CanBeCMOV(p, cond_p: tai; var RefModified: Boolean) : boolean;
    begin
      Result := assigned(p) and
        MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        (
          (taicpu(p).oper[0]^.typ = top_reg) or
          { Allow references, but only pure symbols or GOT-relative addressing
            with RIP as the base; these are not expected to be able to cause a
            segmentation violation }
          (
            (taicpu(p).oper[0]^.typ = top_ref) and
            { TODO: Can we detect which references become constants at this
              stage so we don't have to do a blanket ban? }
            (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
            (
              IsRefSafe(taicpu(p).oper[0]^.ref) or
              (
                { Don't use the reference in the condition if one of its registers got modified by a previous MOV }
                not RefModified and
                { If the reference also appears in the condition, then we know it's safe, otherwise
                  any kind of access violation would have occurred already }
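                { e.g. (sketch): after "cmpl $0,(%rax)", a following
                  "movl (%rax),%edx" can safely become a CMOV, because the
                  compare has already dereferenced (%rax) without faulting }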
                Assigned(cond_p) and
                { Make sure the sizes match too so we're reading and writing the same number of bytes }
                (cond_p.typ = ait_instruction) and
                (taicpu(cond_p).opsize = taicpu(p).opsize) and
                { Just consider 2-operand comparison instructions for now to be safe }
                (taicpu(cond_p).ops = 2) and
                (
                  ((taicpu(cond_p).oper[1]^.typ = top_ref) and RefsEqual(taicpu(cond_p).oper[1]^.ref^, taicpu(p).oper[0]^.ref^)) or
                  (
                    (taicpu(cond_p).oper[0]^.typ = top_ref) and
                    { Don't risk identical registers but different offsets, as we may have constructs
                      such as buffer streams with things like length fields that indicate whether
                      any more data follows. And there are probably some contrived examples where
                      writing to offsets behind the one being read also leads to access violations }
                    RefsEqual(taicpu(cond_p).oper[0]^.ref^, taicpu(p).oper[0]^.ref^) and
                    (
                      { Check that we're not modifying a register that appears in the reference }
                      (InsProp[taicpu(cond_p).opcode].Ch * [Ch_Mop2, Ch_RWop2, Ch_Wop2] = []) or
                      (taicpu(cond_p).oper[1]^.typ <> top_reg) or
                      not RegInRef(taicpu(cond_p).oper[1]^.reg, taicpu(cond_p).oper[0]^.ref^)
                    )
                  )
                )
              )
            )
          )
        );
    end;
  class procedure TX86AsmOptimizer.UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai);
    begin
      { Update integer registers, ignoring deallocations }
      repeat
        while assigned(p) and
          ((p.typ in (SkipInstr - [ait_RegAlloc])) or
            (p.typ = ait_label) or
            ((p.typ = ait_marker) and
              (tai_Marker(p).Kind in [mark_AsmBlockEnd,mark_NoLineInfoStart,mark_NoLineInfoEnd]))) do
          p := tai(p.next);
        while assigned(p) and
          (p.typ=ait_RegAlloc) Do
          begin
            if (getregtype(tai_regalloc(p).reg) = R_INTREGISTER) then
              begin
                case tai_regalloc(p).ratype of
                  ra_alloc :
                    IncludeRegInUsedRegs(tai_regalloc(p).reg, AUsedRegs);
                  else
                    ;
                end;
              end;
            p := tai(p.next);
          end;
      until not(assigned(p)) or
        (not(p.typ in SkipInstr) and
          not((p.typ = ait_label) and
            labelCanBeSkipped(tai_label(p))));
    end;
{$ifndef 8086}
  function TCMOVTracking.InitialiseBlock(BlockStart, OneBeforeBlock: tai; out BlockStop: tai; out EndJump: tai): Boolean;
    begin
      Result := False;
      EndJump := nil;
      BlockStop := nil;
      while (BlockStart <> fOptimizer.BlockEnd) and
        { stop on labels }
        (BlockStart.typ <> ait_label) do
        begin
          { Keep track of all integer registers that are used }
          fOptimizer.UpdateIntRegsNoDealloc(RegisterTracking, tai(OneBeforeBlock.Next));
          if BlockStart.typ = ait_instruction then
            begin
              if (taicpu(BlockStart).opcode = A_JMP) then
                begin
                  if not IsJumpToLabel(taicpu(BlockStart)) or
                    (JumpTargetOp(taicpu(BlockStart))^.ref^.index <> NR_NO) then
                    Exit;
                  EndJump := BlockStart;
                  Break;
                end
              { Check to see if we have a valid MOV instruction instead }
              else if (taicpu(BlockStart).opcode <> A_MOV) or
                (taicpu(BlockStart).oper[1]^.typ <> top_reg) or
                not (taicpu(BlockStart).opsize in [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
                begin
                  Exit;
                end
              else
                { This will be a valid MOV }
                fAllocationRange := BlockStart;
            end;
          OneBeforeBlock := BlockStart;
          fOptimizer.GetNextInstruction(BlockStart, BlockStart);
        end;
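      { A valid block is therefore just a run of MOVs into registers,
        optionally ending with an unconditional JMP, e.g. (sketch):
            movl $1,%eax
            movl %edx,%ecx
            jmp  .Lend
      }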
      if (BlockStart = fOptimizer.BlockEnd) then
        Exit;
      BlockStop := BlockStart;
      Result := True;
    end;
  function TCMOVTracking.AnalyseMOVBlock(BlockStart, BlockStop, SearchStart: tai): LongInt;
    var
      hp1: tai;
      RefModified: Boolean;
    begin
      Result := 0;
      hp1 := BlockStart;
      RefModified := False; { As long as the condition is inverted, this can be reset }
      while assigned(hp1) and
        (hp1 <> BlockStop) do
        begin
          case hp1.typ of
            ait_instruction:
              if MatchInstruction(hp1, A_MOV, [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
                begin
                  if fOptimizer.CanBeCMOV(hp1, fCondition, RefModified) then
                    begin
                      Inc(Result);
                      if Assigned(fCondition) and
                        { The condition will have 2 operands; check whether the
                          MOV's destination register appears in either reference }
                        (
                          (
                            (taicpu(fCondition).oper[0]^.typ = top_ref) and
                            fOptimizer.RegInRef(taicpu(hp1).oper[1]^.reg, taicpu(fCondition).oper[0]^.ref^)
                          ) or
                          (
                            (taicpu(fCondition).oper[1]^.typ = top_ref) and
                            fOptimizer.RegInRef(taicpu(hp1).oper[1]^.reg, taicpu(fCondition).oper[1]^.ref^)
                          )
                        ) then
                        { It is no longer safe to use the reference in the condition.
                          This prevents problems such as:
                            mov (%reg),%reg
                            mov (%reg),...
                          when the comparison is "cmp $0,(%reg)" and is guarding
                          against a null pointer dereference (fixes #40165).
                          Note: "mov (%reg1),%reg2; mov (%reg2),..." won't be optimised this way since
                          at least one of (%reg1) and (%reg2) won't be in the condition and is hence unsafe.
                        }
                        RefModified := True;
                    end
                  else if not (cs_opt_size in current_settings.optimizerswitches) and
                    { CMOV with constants grows the code size }
                    TryCMOVConst(hp1, SearchStart, BlockStop, Result) then
                    begin
                      { Register was reserved by TryCMOVConst and
                        stored on ConstRegs }
                    end
                  else
                    begin
                      Result := -1;
                      Exit;
                    end;
                end
              else
                begin
                  Result := -1;
                  Exit;
                end;
            else
              { Most likely an align };
          end;
          fOptimizer.GetNextInstruction(hp1, hp1);
        end;
    end;
  constructor TCMOVTracking.Init(Optimizer: TX86AsmOptimizer; var p_initialjump, p_initialmov: tai; var AFirstLabel: TAsmLabel);

    { For the tsBranching type, increase the weighting score to account for the new conditional jump
      (this is done as a separate stage because the double types are extensions of the branching type,
      but we can't discount the conditional jump until the last step) }
    procedure EvaluateBranchingType;
      begin
        Inc(CMOVScore);
        if (CMOVScore > MAX_CMOV_INSTRUCTIONS) then
          { Too many instructions to be worthwhile }
          fState := tsInvalid;
      end;

    var
      hp1: tai;
      Count: Integer;
    begin
      { Table of valid CMOV block types:

        Block type               2nd Jump    Mid-label  2nd MOVs  3rd Jump  End-label
        ----------               ---------   ---------  --------  --------  ---------
        tsSimple                 X           Yes        X         X         X
        tsDetour                 = 1st       X          X         X         X
        tsBranching              <> Mid      Yes        X         X         X
        tsDouble                 End-label   Yes *      Yes       X         Yes
        tsDoubleBranchSame       <> Mid      Yes *      Yes       = 2nd     X
        tsDoubleBranchDifferent  <> Mid      Yes *      Yes       <> 2nd    X
        tsDoubleSecondBranching  End-label   Yes *      Yes       <> 2nd    Yes

        * Only one reference allowed
      }
      hp1 := nil; { To prevent compiler warnings }
      Optimizer.CopyUsedRegs(RegisterTracking);
      fOptimizer := Optimizer;
      fLabel := AFirstLabel;
      CMOVScore := 0;
      ConstCount := 0;
      { Initialise RegWrites, ConstRegs, ConstVals, ConstSizes, ConstWriteSizes and ConstMovs }
      FillChar(RegWrites[0], MAX_CMOV_INSTRUCTIONS * 2 * SizeOf(TRegister), 0);
      FillChar(ConstRegs[0], MAX_CMOV_REGISTERS * SizeOf(TRegister), 0);
      FillChar(ConstVals[0], MAX_CMOV_REGISTERS * SizeOf(TCGInt), 0);
      FillChar(ConstSizes[0], MAX_CMOV_REGISTERS * SizeOf(TSubRegister), 0);
      FillChar(ConstWriteSizes[0], first_int_imreg * SizeOf(TOpSize), 0);
      FillChar(ConstMovs[0], MAX_CMOV_REGISTERS * SizeOf(taicpu), 0);
      fInsertionPoint := p_initialjump;
      fCondition := nil;
      fInitialJump := p_initialjump;
      fFirstMovBlock := p_initialmov;
      fFirstMovBlockStop := nil;
      fSecondJump := nil;
      fSecondMovBlock := nil;
      fSecondMovBlockStop := nil;
      fMidLabel := nil;
      fSecondJump := nil;
      fSecondMovBlock := nil;
      fEndLabel := nil;
      fAllocationRange := nil;
      { Assume it all goes horribly wrong! }
      fState := tsInvalid;
      { Look backwards at the comparisons to get an accurate picture of register usage and a better position for any MOV const,reg insertions }
      if Optimizer.GetLastInstruction(p_initialjump, fCondition) and
        MatchInstruction(fCondition, [A_CMP, A_TEST, A_BSR, A_BSF, A_COMISS, A_COMISD, A_UCOMISS, A_UCOMISD, A_VCOMISS, A_VCOMISD, A_VUCOMISS, A_VUCOMISD], []) then
        begin
          { Mark all the registers in the comparison as 'in use', even if they've just been deallocated }
          for Count := 0 to 1 do
            with taicpu(fCondition).oper[Count]^ do
              case typ of
                top_reg:
                  if getregtype(reg) = R_INTREGISTER then
                    Optimizer.IncludeRegInUsedRegs(reg, RegisterTracking);
                top_ref:
                  begin
                    if
{$ifdef x86_64}
                      (ref^.base <> NR_RIP) and
{$endif x86_64}
                      (ref^.base <> NR_NO) then
                      Optimizer.IncludeRegInUsedRegs(ref^.base, RegisterTracking);
                    if (ref^.index <> NR_NO) then
                      Optimizer.IncludeRegInUsedRegs(ref^.index, RegisterTracking);
                  end
                else
                  ;
              end;
          { When inserting instructions before hp_prev, try to insert them
            before the allocation of the FLAGS register }
          if not SetAndTest(Optimizer.FindRegAllocBackward(NR_DEFAULTFLAGS, tai(fCondition.Previous)), fInsertionPoint) or
            (tai_regalloc(fInsertionPoint).ratype = ra_dealloc) then
            { If not found, set it equal to the condition so it's something sensible }
            fInsertionPoint := fCondition;
          { When dealing with a comparison against zero, take note of the
            instruction before it to see if we can move instructions further
            back in order to benefit PostPeepholeOptTestOr. }
          if (
            (
              (taicpu(fCondition).opcode = A_CMP) and
              MatchOperand(taicpu(fCondition).oper[0]^, 0)
            ) or
            (
              (taicpu(fCondition).opcode = A_TEST) and
              (
                Optimizer.OpsEqual(taicpu(fCondition).oper[0]^, taicpu(fCondition).oper[1]^) or
                MatchOperand(taicpu(fCondition).oper[0]^, -1)
              )
            )
          ) and
            Optimizer.GetLastInstruction(fCondition, hp1) then
            begin
              { These instructions set the zero flag if the result is zero }
              if MatchInstruction(hp1, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) then
                begin
                  fInsertionPoint := hp1;
                  { Also mark all the registers in this previous instruction
                    as 'in use', even if they've just been deallocated }
                  for Count := 0 to 1 do
                    with taicpu(hp1).oper[Count]^ do
                      case typ of
                        top_reg:
                          if getregtype(reg) = R_INTREGISTER then
                            Optimizer.IncludeRegInUsedRegs(reg, RegisterTracking);
                        top_ref:
                          begin
                            if
{$ifdef x86_64}
                              (ref^.base <> NR_RIP) and
{$endif x86_64}
                              (ref^.base <> NR_NO) then
                              Optimizer.IncludeRegInUsedRegs(ref^.base, RegisterTracking);
                            if (ref^.index <> NR_NO) then
                              Optimizer.IncludeRegInUsedRegs(ref^.index, RegisterTracking);
                          end
                        else
                          ;
                      end;
                end;
            end;
        end
      else
        fCondition := nil;
      { When inserting instructions, try to insert them before the allocation of the FLAGS register }
      if SetAndTest(Optimizer.FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p_initialjump.Previous)), hp1) and
        (tai_regalloc(hp1).ratype <> ra_dealloc) then
        { If found, this is a better insertion point than p itself }
        fInsertionPoint := hp1;
      hp1 := p_initialmov;
      if not InitialiseBlock(p_initialmov, p_initialjump, fFirstMovBlockStop, fSecondJump) then
        Exit;
      hp1 := fFirstMovBlockStop; { Will either be on a label or a jump }
      if (hp1.typ <> ait_label) then { should be on a jump }
        begin
          if not Optimizer.GetNextInstruction(hp1, fMidLabel) or not (fMidLabel.typ = ait_label) then
            { Need a label afterwards }
            Exit;
        end
      else
        fMidLabel := hp1;
      if tai_label(fMidLabel).labsym <> AFirstLabel then
        { Not the correct label }
        fMidLabel := nil;
      if not Assigned(fSecondJump) and not Assigned(fMidLabel) then
        { If there's neither a 2nd jump nor correct label, then it's invalid
          (see above table) }
        Exit;
      { Analyse the first block of MOVs more closely }
      CMOVScore := AnalyseMOVBlock(fFirstMovBlock, fFirstMovBlockStop, fInsertionPoint);
      if Assigned(fSecondJump) then
        begin
          if (JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol = AFirstLabel) then
            begin
              fState := tsDetour
            end
          else
            begin
              { Need the correct mid-label for this one }
              if not Assigned(fMidLabel) then
                Exit;
              fState := tsBranching;
            end;
        end
      else
        { No jump, but the mid-label is present }
        fState := tsSimple;
      if (CMOVScore > MAX_CMOV_INSTRUCTIONS) or (CMOVScore <= 0) then
        begin
          { Invalid or too many instructions to be worthwhile }
          fState := tsInvalid;
          Exit;
        end;
      { Check further for:
          jCC xxx
          <several movs 1>
          jmp yyy
        xxx:
          <several movs 2>
        yyy:
        etc.
      }
      if (fState = tsBranching) and
        { Estimate for required savings for extra jump }
        (CMOVScore <= MAX_CMOV_INSTRUCTIONS - 1) and
        { Only one reference is allowed for double blocks }
        (AFirstLabel.getrefs = 1) then
        begin
          Optimizer.GetNextInstruction(fMidLabel, hp1);
          fSecondMovBlock := hp1;
          if not InitialiseBlock(fSecondMovBlock, fMidLabel, fSecondMovBlockStop, fThirdJump) then
            begin
              EvaluateBranchingType;
              Exit;
            end;
          hp1 := fSecondMovBlockStop; { Will either be on a label or a jump }
          if (hp1.typ <> ait_label) then { should be on a jump }
            begin
              if not Optimizer.GetNextInstruction(hp1, fEndLabel) or not (fEndLabel.typ = ait_label) then
                begin
                  { Need a label afterwards }
                  EvaluateBranchingType;
                  Exit;
                end;
            end
          else
            fEndLabel := hp1;
          if tai_label(fEndLabel).labsym <> JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol then
            { Second jump doesn't go to the end }
            fEndLabel := nil;
          if not Assigned(fThirdJump) and not Assigned(fEndLabel) then
            begin
              { If there's neither a 3rd jump nor correct end label, then it's
                not a valid double block, but it is a valid single branching
                block (see above table) }
              EvaluateBranchingType;
              Exit;
            end;
          Count := AnalyseMOVBlock(fSecondMovBlock, fSecondMovBlockStop, fMidLabel);
          if (Count > MAX_CMOV_INSTRUCTIONS) or (Count <= 0) then
            { Invalid or too many instructions to be worthwhile }
            Exit;
          Inc(CMOVScore, Count);
          if Assigned(fThirdJump) then
            begin
              if not Assigned(fSecondJump) then
                fState := tsDoubleSecondBranching
              else if (JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol = JumpTargetOp(taicpu(fThirdJump))^.ref^.symbol) then
                fState := tsDoubleBranchSame
              else
                fState := tsDoubleBranchDifferent;
            end
          else
            fState := tsDouble;
        end;
      if fState = tsBranching then
        EvaluateBranchingType;
    end;
  { Tries to convert a mov const,%reg instruction into a CMOV by reserving a
    new register to store the constant }
  function TCMOVTracking.TryCMOVConst(p, start, stop: tai; var Count: LongInt): Boolean;
    var
      RegSize: TSubRegister;
      CurrentVal: TCGInt;
      ANewReg: TRegister;
      X: ShortInt;
    begin
      Result := False;
      if not MatchOpType(taicpu(p), top_const, top_reg) then
        Exit;
      if ConstCount >= MAX_CMOV_REGISTERS then
        { Arrays are full }
        Exit;
      { Remember that CMOV can't encode 8-bit registers }
      case taicpu(p).opsize of
        S_W:
          RegSize := R_SUBW;
        S_L:
          RegSize := R_SUBD;
{$ifdef x86_64}
        S_Q:
          RegSize := R_SUBQ;
{$endif x86_64}
        else
          InternalError(2021100401);
      end;
      { See if the value has already been reserved for another CMOV instruction }
      CurrentVal := taicpu(p).oper[0]^.val;
      for X := 0 to ConstCount - 1 do
        if ConstVals[X] = CurrentVal then
          begin
            ConstRegs[ConstCount] := ConstRegs[X];
            ConstSizes[ConstCount] := RegSize;
            ConstVals[ConstCount] := CurrentVal;
            Inc(ConstCount);
            Inc(Count);
            Result := True;
            Exit;
          end;
  12802. ANewReg := fOptimizer.GetIntRegisterBetween(R_SUBWHOLE, RegisterTracking, start, stop, True);
  12803. if ANewReg = NR_NO then
  12804. { No free registers }
  12805. Exit;
  12806. { Reserve the register so subsequent TryCMOVConst calls don't all end
  12807. up vying for the same register }
  12808. fOptimizer.IncludeRegInUsedRegs(ANewReg, RegisterTracking);
  12809. ConstRegs[ConstCount] := ANewReg;
  12810. ConstSizes[ConstCount] := RegSize;
  12811. ConstVals[ConstCount] := CurrentVal;
  12812. Inc(ConstCount);
  12813. Inc(Count);
  12814. Result := True;
  12815. end;
  12816. destructor TCMOVTracking.Done;
  12817. begin
  12818. TAOptObj.ReleaseUsedRegs(RegisterTracking);
  12819. end;
  12820. procedure TCMOVTracking.Process(out new_p: tai);
  12821. var
  12822. Count, Writes: LongInt;
  12823. RegMatch: Boolean;
  12824. hp1, hp_new: tai;
  12825. inverted_condition, condition: TAsmCond;
  12826. begin
  12827. if (fState in [tsInvalid, tsProcessed]) then
  12828. InternalError(2023110701);
  12829. { Repurpose RegisterTracking to mark registers that we've defined }
  12830. RegisterTracking[R_INTREGISTER].Clear;
  12831. Count := 0;
  12832. Writes := 0;
  12833. condition := taicpu(fInitialJump).condition;
  12834. inverted_condition := inverse_cond(condition);
  12835. { Exclude tsDoubleBranchDifferent from this check, as the second block
  12836. doesn't get CMOVs in this case }
  12837. if (fState in [tsDouble, tsDoubleBranchSame, tsDoubleSecondBranching]) then
  12838. begin
  12839. { Include the jump in the flag tracking }
  12840. if Assigned(fThirdJump) then
  12841. begin
  12842. if (fState = tsDoubleBranchSame) then
  12843. begin
  12844. { Will be an unconditional jump, so track to the instruction before it }
  12845. if not fOptimizer.GetLastInstruction(fThirdJump, hp1) then
  12846. InternalError(2023110710);
  12847. end
  12848. else
  12849. hp1 := fThirdJump;
  12850. end
  12851. else
  12852. hp1 := fSecondMovBlockStop;
  12853. end
  12854. else
  12855. begin
  12856. { Include a conditional jump in the flag tracking }
  12857. if Assigned(fSecondJump) then
  12858. begin
  12859. if (fState = tsDetour) then
  12860. begin
  12861. { Will be an unconditional jump, so track to the instruction before it }
  12862. if not fOptimizer.GetLastInstruction(fSecondJump, hp1) then
  12863. InternalError(2023110711);
  12864. end
  12865. else
  12866. hp1 := fSecondJump;
  12867. end
  12868. else
  12869. hp1 := fFirstMovBlockStop;
  12870. end;
  12871. fOptimizer.AllocRegBetween(NR_DEFAULTFLAGS, fInitialJump, hp1, fOptimizer.UsedRegs);
  12872. { Process the second set of MOVs first, because if a destination
  12873. register is shared between the first and second MOV sets, it is more
  12874. efficient to turn the first one into a MOV instruction and place it
  12875. before the CMP if possible, but we won't know which registers are
  12876. shared until we've processed at least one list, so we might as well
  12877. make it the second one since that won't be modified again. }
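    { Illustrative sketch of the shared-destination case (hypothetical
      registers): if both blocks write %eax,

          jne  .L1
          movl %edx,%eax               movl   %edx,%eax   <- kept as MOV
          jmp  .L2            -->      cmovne %ecx,%eax
        .L1:
          movl %ecx,%eax
        .L2:

      the first write can stay an unconditional MOV because the second
      CMOV overwrites it whenever the condition holds. }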
    if (fState in [tsDouble, tsDoubleBranchSame, tsDoubleBranchDifferent, tsDoubleSecondBranching]) then
      begin
        hp1 := fSecondMovBlock;
        repeat
          if not Assigned(hp1) then
            InternalError(2018062902);

          if (hp1.typ = ait_instruction) then
            begin
              { Extra safeguard }
              if (taicpu(hp1).opcode <> A_MOV) then
                InternalError(2018062903);

              { Note: tsDoubleBranchDifferent is essentially identical to
                tsBranching and the 2nd block is best left largely
                untouched, but we need to evaluate which registers the MOVs
                write to in order to track what would be complementary CMOV
                pairs that can be further optimised. [Kit] }
              if fState <> tsDoubleBranchDifferent then
                begin
                  if taicpu(hp1).oper[0]^.typ = top_const then
                    begin
                      RegMatch := False;

                      for Count := 0 to ConstCount - 1 do
                        if (ConstVals[Count] = taicpu(hp1).oper[0]^.val) and
                          (getsubreg(taicpu(hp1).oper[1]^.reg) = ConstSizes[Count]) then
                          begin
                            RegMatch := True;

                            { If it's in RegisterTracking, then this register
                              is being used more than once and hence has
                              already had its value defined (it gets added to
                              UsedRegs through AllocRegBetween below) }
                            if not RegisterTracking[R_INTREGISTER].IsUsed(ConstRegs[Count]) then
                              begin
                                hp_new := taicpu.op_const_reg(A_MOV, subreg2opsize(R_SUBWHOLE), taicpu(hp1).oper[0]^.val, ConstRegs[Count]);
                                taicpu(hp_new).fileinfo := taicpu(fInitialJump).fileinfo;
                                fOptimizer.asml.InsertBefore(hp_new, fInsertionPoint);
                                fOptimizer.IncludeRegInUsedRegs(ConstRegs[Count], RegisterTracking);
                                ConstMovs[Count] := hp_new;
                              end
                            else
                              { We just need an instruction between hp_prev and hp1
                                where we know the register is marked as in use }
                              hp_new := fSecondMovBlock;

                            { Keep track of the largest write for this register so it can be optimised later }
                            if (getsubreg(taicpu(hp1).oper[1]^.reg) > ConstWriteSizes[getsupreg(ConstRegs[Count])]) then
                              ConstWriteSizes[getsupreg(ConstRegs[Count])] := getsubreg(taicpu(hp1).oper[1]^.reg);

                            fOptimizer.AllocRegBetween(ConstRegs[Count], hp_new, hp1, fOptimizer.UsedRegs);
                            taicpu(hp1).loadreg(0, newreg(R_INTREGISTER, getsupreg(ConstRegs[Count]), ConstSizes[Count]));
                            Break;
                          end;

                      if not RegMatch then
                        InternalError(2021100411);
                    end;

                  taicpu(hp1).opcode := A_CMOVcc;
                  taicpu(hp1).condition := condition;
                end;

              { Store these writes to search for duplicates later on }
              RegWrites[Writes] := taicpu(hp1).oper[1]^.reg;
              Inc(Writes);
            end;

          fOptimizer.GetNextInstruction(hp1, hp1);
        until (hp1 = fSecondMovBlockStop);
      end;

    { Now do the first set of MOVs }
    hp1 := fFirstMovBlock;
    repeat
      if not Assigned(hp1) then
        InternalError(2018062904);

      if (hp1.typ = ait_instruction) then
        begin
          RegMatch := False;

          { Extra safeguard }
          if (taicpu(hp1).opcode <> A_MOV) then
            InternalError(2018062905);

          { Search through the RegWrites list to see if there are any
            opposing CMOV pairs that write to the same register }
          for Count := 0 to Writes - 1 do
            if (RegWrites[Count] = taicpu(hp1).oper[1]^.reg) then
              begin
                { We have a match.  Keep this as a MOV }
                { Move ahead in preparation }
                fOptimizer.GetNextInstruction(hp1, hp1);
                RegMatch := True;
                Break;
              end;

          if RegMatch then
            Continue;

          if taicpu(hp1).oper[0]^.typ = top_const then
            begin
              for Count := 0 to ConstCount - 1 do
                if (ConstVals[Count] = taicpu(hp1).oper[0]^.val) and
                  (getsubreg(taicpu(hp1).oper[1]^.reg) = ConstSizes[Count]) then
                  begin
                    RegMatch := True;

                    { If it's in RegisterTracking, then this register is
                      being used more than once and hence has already had
                      its value defined (it gets added to UsedRegs through
                      AllocRegBetween below) }
                    if not RegisterTracking[R_INTREGISTER].IsUsed(ConstRegs[Count]) then
                      begin
                        hp_new := taicpu.op_const_reg(A_MOV, subreg2opsize(R_SUBWHOLE), taicpu(hp1).oper[0]^.val, ConstRegs[Count]);
                        taicpu(hp_new).fileinfo := taicpu(fInitialJump).fileinfo;
                        fOptimizer.asml.InsertBefore(hp_new, fInsertionPoint);
                        fOptimizer.IncludeRegInUsedRegs(ConstRegs[Count], RegisterTracking);
                        ConstMovs[Count] := hp_new;
                      end
                    else
                      { We just need an instruction between hp_prev and hp1
                        where we know the register is marked as in use }
                      hp_new := fFirstMovBlock;

                    { Keep track of the largest write for this register so it can be optimised later }
                    if (getsubreg(taicpu(hp1).oper[1]^.reg) > ConstWriteSizes[getsupreg(ConstRegs[Count])]) then
                      ConstWriteSizes[getsupreg(ConstRegs[Count])] := getsubreg(taicpu(hp1).oper[1]^.reg);

                    fOptimizer.AllocRegBetween(ConstRegs[Count], hp_new, hp1, fOptimizer.UsedRegs);
                    taicpu(hp1).loadreg(0, newreg(R_INTREGISTER, getsupreg(ConstRegs[Count]), ConstSizes[Count]));
                    Break;
                  end;

              if not RegMatch then
                InternalError(2021100412);
            end;

          taicpu(hp1).opcode := A_CMOVcc;
          taicpu(hp1).condition := inverted_condition;

          if (fState = tsDoubleBranchDifferent) then
            begin
              { Store these writes to search for duplicates later on }
              RegWrites[Writes] := taicpu(hp1).oper[1]^.reg;
              Inc(Writes);
            end;
        end;

      fOptimizer.GetNextInstruction(hp1, hp1);
    until (hp1 = fFirstMovBlockStop);

    { Update initialisation MOVs to the smallest possible size }
    for Count := 0 to ConstCount - 1 do
      if Assigned(ConstMovs[Count]) then
        begin
          taicpu(ConstMovs[Count]).opsize := subreg2opsize(ConstWriteSizes[Word(ConstRegs[Count])]);
          setsubreg(taicpu(ConstMovs[Count]).oper[1]^.reg, ConstWriteSizes[Word(ConstRegs[Count])]);
        end;

    case fState of
      tsSimple:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Simple type)', fInitialJump);
          { No branch to delete }
        end;
      tsDetour:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Detour type)', fInitialJump);
          { Preserve jump }
        end;
      tsBranching, tsDoubleBranchDifferent:
        begin
          if (fState = tsBranching) then
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Branching type)', fInitialJump)
          else
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double branching (different) type)', fInitialJump);
          taicpu(fSecondJump).opcode := A_JCC;
          taicpu(fSecondJump).condition := inverted_condition;
        end;
      tsDouble, tsDoubleBranchSame:
        begin
          if (fState = tsDouble) then
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double type)', fInitialJump)
          else
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double branching (same) type)', fInitialJump);
          { Delete second jump }
          JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol.decrefs;
          fOptimizer.RemoveInstruction(fSecondJump);
        end;
      tsDoubleSecondBranching:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double, second branching type)', fInitialJump);
          { Delete second jump, preserve third jump as conditional }
          JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol.decrefs;
          fOptimizer.RemoveInstruction(fSecondJump);
          taicpu(fThirdJump).opcode := A_JCC;
          taicpu(fThirdJump).condition := condition;
        end;
      else
        InternalError(2023110720);
    end;

    { Now we can safely decrement the reference count }
    tasmlabel(fLabel).decrefs;
    fOptimizer.UpdateUsedRegs(tai(fInitialJump.next));

    { Remove the original jump }
    fOptimizer.RemoveInstruction(fInitialJump); { Note, the choice to not use RemoveCurrentp is deliberate }

    new_p := fFirstMovBlock; { Appears immediately after the initial jump }
    fState := tsProcessed;
  end;

{$endif 8086}


function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  var
    hp1,hp2: tai;
    carryadd_opcode : TAsmOp;
    symbol: TAsmSymbol;
    increg, tmpreg: TRegister;
{$ifndef i8086}
    CMOVTracking: PCMOVTracking;
    hp3,hp4,hp5: tai;
{$endif i8086}
    TempBool: Boolean;
  begin
    if (aoc_DoPass2JccOpts in OptsToCheck) and
      DoJumpOptimizations(p, TempBool) then
      Exit(True);

    result:=false;
    if GetNextInstruction(p,hp1) then
      begin
        if (hp1.typ=ait_label) then
          begin
            Result := DoSETccLblRETOpt(p, tai_label(hp1));
            Exit;
          end
        else if (hp1.typ<>ait_instruction) then
          Exit;

        symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        if (
            (
              ((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
              MatchOpType(Taicpu(hp1),top_const,top_reg) and
              (Taicpu(hp1).oper[0]^.val=1)
            ) or
            ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
          ) and
          GetNextInstruction(hp1,hp2) and
          FindLabel(TAsmLabel(symbol), hp2) then
          { jb @@1                           cmc
            inc/dec operand   -->    adc/sbb operand,0
            @@1:

            ... and ...

            jnb @@1
            inc/dec operand   -->    adc/sbb operand,0
            @@1: }
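          { Worked example (illustrative): "jb @@1; incl %eax; @@1:" only
            skips the increment when CF=1, so inverting the carry first
            makes "cmc; adcl $0,%eax" equivalent - the ADC adds the
            inverted carry instead of branching. }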
          begin
            if Taicpu(p).condition in [C_NAE,C_B,C_C] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011001);
                end;
                Taicpu(p).clearop(0);
                Taicpu(p).ops:=0;
                Taicpu(p).is_jmp:=false;
                Taicpu(p).opcode:=A_CMC;
                Taicpu(p).condition:=C_NONE;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                Taicpu(hp1).ops:=2;
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                result:=true;
                exit;
              end
            else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011002);
                end;
                Taicpu(hp1).ops:=2;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                RemoveCurrentP(p, hp1);
                result:=true;
                exit;
              end
            {
              jcc @@1                              setcc  tmpreg
              inc/dec/add/sub operand    ->        (movzx tmpreg)
              @@1:                                 add/sub tmpreg,operand

              While this increases code size slightly, it makes the code much
              faster if the jump is unpredictable.
            }
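            { Worked example (illustrative register choice): with a free byte
              register %cl, "jne @@1; incl %eax; @@1:" can become
              "sete %cl; movzbl %cl,%ecx; addl %ecx,%eax" - %ecx is 1
              exactly when the increment would have executed. }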
            else if not(cs_opt_size in current_settings.optimizerswitches) then
              begin
                { search for an available register which is volatile }
                increg := GetIntRegisterBetween(R_SUBL, UsedRegs, p, hp1);
                if increg <> NR_NO then
                  begin
                    { We don't need to check if tmpreg is in hp1 or not, because
                      it will be marked as in use at p (if not, this is
                      indicative of a compiler bug). }
                    TAsmLabel(symbol).decrefs;
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=1;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_SETcc;
                    DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                    Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                    Taicpu(p).loadreg(0,increg);

                    if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                      begin
                        case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                          R_SUBW:
                            begin
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBW);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,increg,tmpreg);
                            end;
                          R_SUBD:
                            begin
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBD);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
                            end;
{$ifdef x86_64}
                          R_SUBQ:
                            begin
                              { MOVZX doesn't have a 64-bit variant, because
                                the 32-bit version implicitly zeroes the
                                upper 32 bits of the destination register }
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBD);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
                              setsubreg(tmpreg, R_SUBQ);
                            end;
{$endif x86_64}
                          else
                            Internalerror(2020030601);
                        end;
                        taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                        asml.InsertAfter(hp2,p);
                      end
                    else
                      tmpreg := increg;

                    if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                      begin
                        Taicpu(hp1).ops:=2;
                        Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                      end;
                    Taicpu(hp1).loadreg(0,tmpreg);
                    AllocRegBetween(tmpreg,p,hp1,UsedRegs);
                    Result := True;

                    { p is no longer a Jcc instruction, so exit }
                    Exit;
                  end;
              end;
          end;

        { Detect the following:
            jmp<cond> @Lbl1
            jmp       @Lbl2
            ...
          @Lbl1:
            ret

          Change to:
            jmp<inv_cond> @Lbl2
            ret
        }
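        { Worked example (illustrative labels): "jz .Lbl1; jmp .Lbl2; ...
          .Lbl1: ret" becomes "jnz .Lbl2; ret", removing one taken branch
          from the return path. }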
        if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2:=getlabelwithsym(TAsmLabel(symbol));
            if Assigned(hp2) and SkipLabels(hp2,hp2) and
              MatchInstruction(hp2,A_RET,[S_NO]) then
              begin
                taicpu(p).condition := inverse_cond(taicpu(p).condition);

                { Change label address to that of the unconditional jump }
                taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);

                TAsmLabel(symbol).DecRefs;
                taicpu(hp1).opcode := A_RET;
                taicpu(hp1).is_jmp := false;
                taicpu(hp1).ops := taicpu(hp2).ops;
                DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                case taicpu(hp2).ops of
                  0:
                    taicpu(hp1).clearop(0);
                  1:
                    taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                  else
                    internalerror(2016041302);
                end;
              end;
{$ifndef i8086}
          end
        {
          convert
            j<c> .L1
            mov  1,reg
            jmp  .L2
          .L1:
            mov  0,reg
          .L2:
          into
            mov  0,reg
            set<not(c)> reg

          Take care of alignment, and make sure the "mov 0,reg" is not
          converted into a xor, as that would destroy the flag contents.
        }
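        { Worked example (illustrative): "jl .L1; movl $1,%eax; jmp .L2;
          .L1: movl $0,%eax; .L2:" becomes "movl $0,%eax; setge %al" -
          the register ends up 1 exactly when the original condition fails. }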
        else if MatchInstruction(hp1,A_MOV,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
{$ifdef i386}
          (
            { Under i386, ESI, EDI, EBP and ESP
              don't have an 8-bit representation }
            not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
          ) and
{$endif i386}
          (taicpu(hp1).oper[0]^.val=1) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
          GetNextInstruction(hp2,hp3) and
          (hp3.typ=ait_label) and
          (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
          (tai_label(hp3).labsym.getrefs=1) and
          GetNextInstruction(hp3,hp4) and
          MatchInstruction(hp4,A_MOV,[]) and
          MatchOpType(taicpu(hp4),top_const,top_reg) and
          (taicpu(hp4).oper[0]^.val=0) and
          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
          GetNextInstruction(hp4,hp5) and
          (hp5.typ=ait_label) and
          (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) and
          (tai_label(hp5).labsym.getrefs=1) then
          begin
            AllocRegBetween(NR_FLAGS,p,hp4,UsedRegs);
            DebugMsg(SPeepholeOptimization+'JccMovJmpMov2MovSetcc',p);

            { remove the last label }
            RemoveInstruction(hp5);
            { remove the second label }
            RemoveInstruction(hp3);
            { remove the jmp }
            RemoveInstruction(hp2);
            if taicpu(hp1).opsize=S_B then
              RemoveInstruction(hp1)
            else
              taicpu(hp1).loadconst(0,0);
            taicpu(hp4).opcode:=A_SETcc;
            taicpu(hp4).opsize:=S_B;
            taicpu(hp4).condition:=inverse_cond(taicpu(p).condition);
            taicpu(hp4).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(hp4).oper[1]^.reg),R_SUBL));
            taicpu(hp4).opercnt:=1;
            taicpu(hp4).ops:=1;
            taicpu(hp4).freeop(1);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end
        else if (CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
          MatchInstruction(hp1,A_MOV,[S_W,S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
          begin
            { check for
                jCC xxx
                <several movs>
              xxx:

              Also spot:
                jCC xxx
                <several movs>
                jmp xxx

              Change to:
                <several cmovs with inverted condition>
                jmp xxx  (only for the 2nd case)
            }
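            { Worked example (illustrative): "je .Lxxx; movl %edx,%eax;
              movl %esi,%ecx; .Lxxx:" becomes "cmovne %edx,%eax;
              cmovne %esi,%ecx" - the MOVs only ran when the jump was not
              taken, hence the inverted condition. }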
            CMOVTracking := New(PCMOVTracking, Init(Self, p, hp1, TAsmLabel(symbol)));

            if CMOVTracking^.State <> tsInvalid then
              begin
                CMOVTracking^.Process(p);
                Result := True;
              end;

            { Dispose pairs with the New call above; it invokes the Done
              destructor and also releases the allocated memory }
            Dispose(CMOVTracking, Done);
{$endif i8086}
          end;
      end;
  end;


function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  var
    hp1,hp2,hp3: tai;
    reg_and_hp1_is_instr, RegUsed, AndTest: Boolean;
    NewSize: TOpSize;
    NewRegSize: TSubRegister;
    Limit: TCgInt;
    SwapOper: POper;
  begin
    result:=false;
    reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
      GetNextInstruction(p,hp1) and
      (hp1.typ = ait_instruction);

    if reg_and_hp1_is_instr and
      (
        (taicpu(hp1).opcode <> A_LEA) or
        { If the LEA instruction can be converted into an arithmetic instruction,
          it may be possible to then fold it. }
        (
          { If the flags register is in use, don't change the instruction
            to an ADD otherwise this will scramble the flags. [Kit] }
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
          ConvertLEA(taicpu(hp1))
        )
      ) and
      IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_MOV,[]) and
      (taicpu(hp2).oper[0]^.typ = top_reg) and
      OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
      (((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B)) or
       ((taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W))) and
{$ifdef i386}
      { not all registers have byte size sub registers on i386 }
      ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
      (((taicpu(hp1).ops=2) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
       ((taicpu(hp1).ops=1) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
      not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
      begin
        { change   movsX/movzX    reg/ref, reg2
                   add/sub/or/... reg3/$const, reg2
                   mov            reg2, reg/ref
          to       add/sub/or/... reg3/$const, reg/ref }

        { by example:
            movswl  %si,%eax        movswl  %si,%eax      p
            decl    %eax            addl    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %eax            addw    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        taicpu(hp1).changeopsize(taicpu(hp2).opsize);

        {
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %si             addw    %dx,%si       hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        case taicpu(hp1).ops of
          1:
            taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
          2:
            begin
              taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
              if (taicpu(hp1).oper[0]^.typ = top_reg) then
                setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
            end;
          else
            internalerror(2008042702);
        end;

        {
          ->
            decw    %si             addw    %dx,%si       p
        }
        DebugMsg(SPeepholeOptimization + 'var3',p);
        RemoveCurrentP(p, hp1);
        RemoveInstruction(hp2);
        Result := True;
        Exit;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(hp1).opcode = A_MOV) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
{$ifdef x86_64}
       { check for implicit extension to 64 bit }
       or
       ((taicpu(p).opsize in [S_BL,S_WL]) and
        (taicpu(hp1).opsize=S_Q) and
        SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
       )
{$endif x86_64}
      )
      then
      begin
        { change
            movx %reg1,%reg2
            mov  %reg2,%reg3
            dealloc %reg2
          into
            movx %reg1,%reg3
        }
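        { Illustrative example: "movzbl %al,%ecx" followed by
          "movl %ecx,%edx" (with %ecx then unused) collapses to
          "movzbl %al,%edx". }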
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
{$ifdef x86_64}
            if (taicpu(p).opsize in [S_BL,S_WL]) and
              (taicpu(hp1).opsize=S_Q) then
              taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
            else
{$endif x86_64}
              taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
            RemoveInstruction(hp1);
            Result := True;
            Exit;
          end;
      end;

    if reg_and_hp1_is_instr and
      ((taicpu(hp1).opcode=A_MOV) or
       (taicpu(hp1).opcode=A_ADD) or
       (taicpu(hp1).opcode=A_SUB) or
       (taicpu(hp1).opcode=A_CMP) or
       (taicpu(hp1).opcode=A_OR) or
       (taicpu(hp1).opcode=A_XOR) or
       (taicpu(hp1).opcode=A_AND)
      ) and
      (taicpu(hp1).oper[1]^.typ = top_reg) then
      begin
        AndTest := (taicpu(hp1).opcode=A_AND) and
          GetNextInstruction(hp1, hp2) and
          (hp2.typ = ait_instruction) and
          (
            (
              (taicpu(hp2).opcode=A_TEST) and
              (
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[1]^.reg) or
                MatchOperand(taicpu(hp2).oper[0]^, -1) or
                (
                  { If the AND and TEST instructions share a constant, this is also valid }
                  (taicpu(hp1).oper[0]^.typ = top_const) and
                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[0]^.val)
                )
              ) and
              MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
            ) or
            (
              (taicpu(hp2).opcode=A_CMP) and
              MatchOperand(taicpu(hp2).oper[0]^, 0) and
              MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
            )
          );

        { change
            movx (oper),%reg2
            and  $x,%reg2
            test %reg2,%reg2
            dealloc %reg2
          into
            test $x,(oper)
          provided the constant fits within the pre-extension size
        }
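        { Worked example (illustrative): "movzbl (%rdi),%eax; andl $15,%eax;
          testl %eax,%eax; je ..." becomes "testb $15,(%rdi); je ..." -
          the mask fits in the original byte, so only that byte needs
          testing. }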
        if ((taicpu(p).oper[0]^.typ=top_reg) or
            ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
          (taicpu(hp1).oper[0]^.typ = top_const) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
          AndTest then
          begin
            { Check if the AND constant is in range }
            case taicpu(p).opsize of
              S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                begin
                  NewSize := S_B;
                  Limit := $FF;
                end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                begin
                  NewSize := S_W;
                  Limit := $FFFF;
                end;
{$ifdef x86_64}
              S_LQ:
                begin
                  NewSize := S_L;
                  Limit := $FFFFFFFF;
                end;
{$endif x86_64}
              else
                InternalError(2021120303);
            end;

            if (
                ((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val) or
                { Check for negative operands }
                (((not taicpu(hp1).oper[0]^.val) and Limit) = (not taicpu(hp1).oper[0]^.val))
              ) and
              GetNextInstruction(hp2,hp3) and
              MatchInstruction(hp3,A_Jcc,A_Setcc,A_CMOVcc,[]) and
              (taicpu(hp3).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovxAndTest2Test done',p);
                    taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
                    taicpu(hp1).opcode := A_TEST;
                    taicpu(hp1).opsize := NewSize;
                    RemoveInstruction(hp2);
                    RemoveCurrentP(p, hp1);
                    Result:=true;
                    exit;
                  end;
              end;
          end;

        if (taicpu(hp1).oper[0]^.typ = top_reg) and
          (((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
            (taicpu(hp1).opsize=S_B)) or
           ((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
            (taicpu(hp1).opsize=S_W))
{$ifdef x86_64}
           or ((taicpu(p).opsize=S_LQ) and
            (taicpu(hp1).opsize=S_L))
{$endif x86_64}
          ) and
          SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
          begin
            { change
                movx %reg1,%reg2
                op   %reg2,%reg3
                dealloc %reg2
              into
                op   %reg1,%reg3
              if the second op accesses only the bits stored in reg1
            }
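            { Illustrative example: "movzbl %al,%ecx; orb %cl,%dl" with %ecx
              deallocated afterwards becomes "orb %al,%dl" - the byte-sized
              OR never sees the zero-extended upper bits. }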
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if AndTest then
              begin
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
              end
            else
              RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);

            if not RegUsed then
              begin
                DebugMsg(SPeepholeOptimization + 'MovxOp2Op 1',p);
                if taicpu(p).oper[0]^.typ=top_reg then
                  begin
                    case taicpu(hp1).opsize of
                      S_B:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
                      S_W:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
                      S_L:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
                      else
                        Internalerror(2020102301);
                    end;
                    AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  end
                else
                  taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
                RemoveCurrentP(p);
                if AndTest then
                  RemoveInstruction(hp2);
                result:=true;
                exit;
              end;
          end
        else if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
          (
            { Bitwise operations only }
            (taicpu(hp1).opcode=A_AND) or
            (taicpu(hp1).opcode=A_TEST) or
            (
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (
                (taicpu(hp1).opcode=A_OR) or
                (taicpu(hp1).opcode=A_XOR)
              )
            )
          ) and
          (
            (taicpu(hp1).oper[0]^.typ = top_const) or
            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
            not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
          ) then
          begin
            { change
                movx %reg2,%reg2
                op   const,%reg2
              into
                op   const,%reg2  (smaller version)
                movx %reg2,%reg2

              also change
                movx %reg1,%reg2
                and/test (oper),%reg2
                dealloc %reg2
              into
                and/test (oper),%reg1
            }
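            { Illustrative examples: "movzbl %al,%eax; andl $3,%eax" becomes
              "andb $3,%al; movzbl %al,%eax", while "movzbl %al,%ecx;
              testl %edx,%ecx" (with %ecx then unused) becomes
              "testb %dl,%al". }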
            case taicpu(p).opsize of
              S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                begin
                  NewSize := S_B;
                  NewRegSize := R_SUBL;
                  Limit := $FF;
                end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                begin
                  NewSize := S_W;
                  NewRegSize := R_SUBW;
                  Limit := $FFFF;
                end;
{$ifdef x86_64}
              S_LQ:
                begin
                  NewSize := S_L;
                  NewRegSize := R_SUBD;
                  Limit := $FFFFFFFF;
                end;
{$endif x86_64}
              else
                Internalerror(2021120302);
            end;

            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if AndTest then
              begin
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
              end
            else
              RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);

            if
              (
                (taicpu(p).opcode = A_MOVZX) and
                (
                  (taicpu(hp1).opcode=A_AND) or
                  (taicpu(hp1).opcode=A_TEST)
                ) and
                not (
                  { If both are references, then the final instruction will have
                    both operands as references, which is not allowed }
                  (taicpu(p).oper[0]^.typ = top_ref) and
                  (taicpu(hp1).oper[0]^.typ = top_ref)
                ) and
                not RegUsed
              ) or
              (
                (
                  SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) or
                  not RegUsed
                ) and
                (taicpu(p).oper[0]^.typ = top_reg) and
                SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.typ = top_const) and
                ((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val)
              ) then
              begin
{$if defined(i386) or defined(i8086)}
                { If the target size is 8-bit, make sure we can actually encode it }
                if (NewRegSize = R_SUBL) and (taicpu(hp1).oper[0]^.typ = top_reg) and not (GetSupReg(taicpu(hp1).oper[0]^.reg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
                  Exit;
{$endif i386 or i8086}

                DebugMsg(SPeepholeOptimization + 'MovxOp2Op 2',p);

                taicpu(hp1).opsize := NewSize;
                taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);

                if AndTest then
                  begin
                    RemoveInstruction(hp2);
                    if not RegUsed then
                      begin
                        taicpu(hp1).opcode := A_TEST;
                        if (taicpu(hp1).oper[0]^.typ = top_ref) then
                          begin
                            { Make sure the reference is the second operand }
                            SwapOper := taicpu(hp1).oper[0];
                            taicpu(hp1).oper[0] := taicpu(hp1).oper[1];
                            taicpu(hp1).oper[1] := SwapOper;
                          end;
                      end;
                  end;

                case taicpu(hp1).oper[0]^.typ of
                  top_reg:
                    setsubreg(taicpu(hp1).oper[0]^.reg, NewRegSize);
                  top_const:
                    { For the AND/TEST case }
                    taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and Limit;
                  else
                    ;
                end;

                if RegUsed then
                  begin
                    AsmL.Remove(p);
                    AsmL.InsertAfter(p, hp1);
                    p := hp1;
                  end
                else
                  RemoveCurrentP(p, hp1);

                result:=true;
                exit;
              end;
          end;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (
        (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
      ) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      { Minimum shift value allowed is the bit difference between the sizes }
      (taicpu(hp1).oper[0]^.val >=
        { Multiply by 8 because tcgsize2size returns bytes, not bits }
        8 * (
          tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
          tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
        )
      ) then
      begin
        { For:
            movsx/movzx %reg1,%reg1   (same register, just different sizes)
            shl/sal     ##,%reg1

          Remove the movsx/movzx instruction if the shift overwrites the
          extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax).
        }
        DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
        RemoveCurrentP(p, hp1);
        Result := True;
        Exit;
      end
    else if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (
        ((taicpu(hp1).opcode = A_SHR) and (taicpu(p).opcode = A_MOVZX)) or
        ((taicpu(hp1).opcode = A_SAR) and (taicpu(p).opcode <> A_MOVZX))
      ) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      { Maximum shift value allowed is the bit size of the smallest register - 1 }
      (taicpu(hp1).oper[0]^.val <
        { Multiply by 8 because tcgsize2size returns bytes, not bits }
        8 * (
          tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
        )
      ) then
      begin
        { For:
            movsx ##,%reg1            movzx ##,%reg1   (same register, just different sizes)
            sar   ##,%reg1            shr   ##,%reg1

          Move the shift to before the movx instruction if the shift value
          is not too large.
        }
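        { Worked example (illustrative): "movzbl %al,%eax; shrl $3,%eax"
          becomes "shrb $3,%al; movzbl %al,%eax" - shifting before the
          zero-extension is equivalent because only the low byte carries
          data. }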
        asml.Remove(hp1);
        asml.InsertBefore(hp1, p);

        taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;

        case taicpu(p).opsize of
          S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif}:
            taicpu(hp1).opsize := S_B;
          S_WL{$ifdef x86_64}, S_WQ{$endif}:
            taicpu(hp1).opsize := S_W;
{$ifdef x86_64}
          S_LQ:
            taicpu(hp1).opsize := S_L;
{$endif}
          else
            InternalError(2020112401);
        end;

        if (taicpu(hp1).opcode = A_SHR) then
          DebugMsg(SPeepholeOptimization + 'MovzShr2ShrMovz', hp1)
        else
          DebugMsg(SPeepholeOptimization + 'MovsSar2SarMovs', hp1);

        Result := True;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      (
        (taicpu(hp1).opcode = taicpu(p).opcode)
        or ((taicpu(p).opcode = A_MOVZX) and ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}))
{$ifdef x86_64}
        or ((taicpu(p).opcode = A_MOVSX) and (taicpu(hp1).opcode = A_MOVSXD))
{$endif x86_64}
      ) then
      begin
        if MatchOpType(taicpu(hp1), top_reg, top_reg) and
          (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[0]^.reg) and
          SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
          begin
            {
              For example:
                movzbw %al,%ax
                movzwl %ax,%eax
              Compress into:
                movzbl %al,%eax
            }
            RegUsed := False;
            case taicpu(p).opsize of
              S_BW:
                case taicpu(hp1).opsize of
                  S_WL:
                    begin
                      taicpu(p).opsize := S_BL;
                      RegUsed := True;
                    end;
{$ifdef x86_64}
                  S_WQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_BL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_BQ;
                      RegUsed := True;
                    end;
{$endif x86_64}
                  else
                    ;
                end;
{$ifdef x86_64}
              S_BL:
                case taicpu(hp1).opsize of
                  S_LQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_BL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_BQ;
                      RegUsed := True;
                    end;
                  else
                    ;
                end;
              S_WL:
                case taicpu(hp1).opsize of
                  S_LQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_WL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_WQ;
                      RegUsed := True;
                    end;
                  else
                    ;
                end;
{$endif x86_64}
              else
                ;
            end;

            if RegUsed then
              begin
                DebugMsg(SPeepholeOptimization + 'MovxMovx2Movx', p);
                taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
                RemoveInstruction(hp1);
                Result := True;
                Exit;
              end;
          end;

        if (taicpu(hp1).opsize = taicpu(p).opsize) and
          not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, [A_AND, A_OR, A_XOR, A_TEST], []) and
          (
            ((taicpu(hp2).opsize = S_W) and (taicpu(p).opsize = S_BW)) or
            ((taicpu(hp2).opsize = S_L) and (taicpu(p).opsize in [S_BL, S_WL]))
{$ifdef x86_64}
            or ((taicpu(hp2).opsize = S_Q) and (taicpu(p).opsize in [S_BL, S_BQ, S_WL, S_WQ, S_LQ]))
{$endif x86_64}
          ) and
          MatchOpType(taicpu(hp2), top_reg, top_reg) and
          (
            (
              (taicpu(hp2).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
              (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg)
            ) or
            (
              { Only allow the operands in reverse order for TEST instructions }
              (taicpu(hp2).opcode = A_TEST) and
              (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
              (taicpu(hp2).oper[1]^.reg = taicpu(hp1).oper[1]^.reg)
            )
          ) then
          begin
            {
              For example:
                movzbl %al,%eax
                movzbl (ref),%edx
                andl   %edx,%eax
                (%edx deallocated)
              Change to:
                andb   (ref),%al
                movzbl %al,%eax

              Rules are:
                - First two instructions have the same opcode and opsize
                - First instruction's operands are the same super-register
                - Second instruction operates on a different register
                - Third instruction is AND, OR, XOR or TEST
                - Third instruction's operands are the destination registers of the first two instructions
                - Third instruction writes to the destination register of the first instruction (except with TEST)
                - Second instruction's destination register is deallocated afterwards
            }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));

            if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs) then
              begin
                case taicpu(p).opsize of
                  S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                    NewSize := S_B;
                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                    NewSize := S_W;
{$ifdef x86_64}
                  S_LQ:
                    NewSize := S_L;
{$endif x86_64}
                  else
                    InternalError(2021120301);
                end;

                taicpu(hp2).loadoper(0, taicpu(hp1).oper[0]^);
                taicpu(hp2).loadreg(1, taicpu(p).oper[0]^.reg);
                taicpu(hp2).opsize := NewSize;
                RemoveInstruction(hp1);

                { With TEST, it's best to keep the MOVX instruction at the top }
                if (taicpu(hp2).opcode <> A_TEST) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovxMovxOp2OpMovx', p);
                    asml.Remove(p);
                    { If the third instruction uses the flags, the MOVX instruction won't modify them }
                    asml.InsertAfter(p, hp2);
                    p := hp2;
                  end
                else
                  DebugMsg(SPeepholeOptimization + 'MovxMovxTest2MovxTest', p);

                Result := True;
                Exit;
              end;
          end;
      end;

    if taicpu(p).opcode=A_MOVZX then
      begin
        { removes superfluous ANDs after movzx's }
        if reg_and_hp1_is_instr and
          (taicpu(hp1).opcode = A_AND) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          ((taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)
{$ifdef x86_64}
           { check for implicit extension to 64 bit }
           or
           ((taicpu(p).opsize in [S_BL,S_WL]) and
            (taicpu(hp1).opsize=S_Q) and
            SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg)
           )
{$endif x86_64}
          )
          then
          begin
            case taicpu(p).opsize Of
              S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz1',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz2',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
{$ifdef x86_64}
              S_LQ:
                if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz3',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
{$endif x86_64}
              else
                ;
            end;

            { we cannot get rid of the AND, but can we get rid of the MOVZX? }
            if SuperRegistersEqual(taicpu(p).oper[0]^.reg,taicpu(p).oper[1]^.reg) then
              begin
                case taicpu(p).opsize Of
                  S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val and $ff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And1',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val and $ffff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And2',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
{$ifdef x86_64}
                  S_LQ:
                    if (taicpu(hp1).oper[0]^.val and $ffffffff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And3',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
{$endif x86_64}
                  else
                    ;
                end;
              end;
          end;

        { changes some movzx constructs to faster synonyms (all examples
          are given with eax/ax, but are also valid for other registers) }
        if MatchOpType(taicpu(p),top_reg,top_reg) then
          begin
            case taicpu(p).opsize of
              { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
                (the machine code is equivalent to movzbl %al,%eax), but the
                code generator still generates that assembler instruction and
                it is silently converted.  This should probably be checked.
                [Kit] }
              S_BW:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                    (
                      not IsMOVZXAcceptable
                      { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
                      or (
                        (cs_opt_size in current_settings.optimizerswitches) and
                        (taicpu(p).oper[1]^.reg = NR_AX)
                      )
                    ) then
                    { Change "movzbw %al, %ax" to "andw $0x0ffh, %ax" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var7',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_W);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                    GetNextInstruction(p, hp1) and
                    (tai(hp1).typ = ait_instruction) and
                    (taicpu(hp1).opcode = A_AND) and
                    MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbw %reg1, %reg2; andw $const, %reg2"
                      to "movw %reg1, %reg2; andw $(const1 and $ff), %reg2" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var8',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_W);
                      setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax) }
              S_BL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var9',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                      (tai(hp1).typ = ait_instruction) and
                      (taicpu(hp1).opcode = A_AND) and
                      MatchOpType(taicpu(hp1),top_const,top_reg) and
                      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzbl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, %reg2; andl $(const1 and $ff), %reg2" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var10',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Result := True;
                      end;
                  end;
{$endif i8086}
              S_WL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var11',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ffff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                      (tai(hp1).typ = ait_instruction) and
                      (taicpu(hp1).opcode = A_AND) and
                      (taicpu(hp1).oper[0]^.typ = top_const) and
                      (taicpu(hp1).oper[1]^.typ = top_reg) and
                      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzwl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, %reg2; andl $(const1 and $ffff), %reg2" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var12',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Result := True;
                      end;
                  end;
              else
                InternalError(2017050705);
            end;
          end
        else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
          begin
            if GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              (taicpu(hp1).opcode = A_AND) and
              MatchOpType(taicpu(hp1),top_const,top_reg) and
              (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
              begin
                case taicpu(p).opsize Of
                  S_BL:
                    if (taicpu(hp1).opsize <> S_L) or
                      (taicpu(hp1).oper[0]^.val > $FF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var13',p);
                        taicpu(hp1).changeopsize(S_L);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  S_WL:
                    if (taicpu(hp1).opsize <> S_L) or
                      (taicpu(hp1).oper[0]^.val > $FFFF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var14',p);
                        taicpu(hp1).changeopsize(S_L);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  S_BW:
                    if (taicpu(hp1).opsize <> S_W) or
                      (taicpu(hp1).oper[0]^.val > $FF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var15',p);
                        taicpu(hp1).changeopsize(S_W);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  else
                    Internalerror(2017050704)
                end;
              end;
          end;
      end;
  end;


{$ifdef x86_64}
function TX86AsmOptimizer.DoZeroUpper32Opt(var mov_p: tai; var and_p: tai): Boolean;
  var
    hp1, old_hp1: tai;
    FullSourceReg, FullTargetReg: TRegister;
  begin
    if (mov_p.typ<>ait_instruction) or
      (taicpu(mov_p).opsize<>S_L) or
      not MatchOpType(taicpu(mov_p),top_reg,top_reg) then
      InternalError(2025062801);

    Result:=False;

    FullSourceReg:=taicpu(mov_p).oper[0]^.reg;
    setsubreg(FullSourceReg, R_SUBQ);
    FullTargetReg:=taicpu(mov_p).oper[1]^.reg;
    setsubreg(FullTargetReg, R_SUBQ);

    { Mark the registers in the MOV command as "used" }
    IncludeRegInUsedRegs(FullSourceReg,UsedRegs);
    IncludeRegInUsedRegs(FullTargetReg,UsedRegs);

    { This is a little hack to get DeepMOVOpt to replace the full 64-bit
      registers.  The MOV instruction will be put back as it was afterwards
      (unless it got removed). }
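    { Illustrative sketch (hypothetical registers): "movl %eax,%edx"
      implicitly zeroes the upper 32 bits of %rdx, so the operands are
      temporarily widened to %rax/%rdx, letting DeepMOVOpt rewrite later
      64-bit reads of %rdx as %rax. }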
  14273. taicpu(mov_p).oper[0]^.reg:=FullSourceReg;
  14274. taicpu(mov_p).oper[1]^.reg:=FullTargetReg;
  14275. { Start after the and_p otherwise that instruction will be considered
  14276. to have modified the source register }
  14277. old_hp1:=and_p;
  14278. while GetNextInstructionUsingReg(old_hp1,hp1,FullTargetReg) and
  14279. (hp1.typ=ait_instruction) do
  14280. begin
  14281. if RegReadByInstruction(FullTargetReg,hp1) and
  14282. not RegModifiedBetween(FullSourceReg,old_hp1,hp1) and
  14283. DeepMOVOpt(taicpu(mov_p),taicpu(hp1)) then
  14284. begin
  14285. { A change has occurred, just not in mov_p }
  14286. Include(OptsToCheck, aoc_ForceNewIteration);
  14287. TransferUsedRegs(TmpUsedRegs);
  14288. UpdateUsedRegsBetween(TmpUsedRegs,tai(mov_p.Next), hp1);
  14289. if not RegUsedAfterInstruction(FullTargetReg,hp1,TmpUsedRegs) and
  14290. { Just in case something didn't get modified (e.g. an
  14291. implicit register) }
  14292. not RegReadByInstruction(FullTargetReg,hp1) then
  14293. begin
  14294. { We can remove the original MOV }
  14295. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3d done',mov_p);
  14296. RemoveCurrentP(mov_p);
  14297. Result := True;
  14298. Exit;
  14299. end;
  14300. end
  14301. else
  14302. Break;
  14303. old_hp1:=hp1;
  14304. end;
  14305. { Put the MOV instruction back as it was }
  14306. setsubreg(taicpu(mov_p).oper[0]^.reg,R_SUBD);
  14307. setsubreg(taicpu(mov_p).oper[1]^.reg,R_SUBD);
  14308. end;
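
{ Rough illustrative sketch of DoZeroUpper32Opt (hypothetical registers, not
  taken from real generated code).  Given a pair such as:
      movl %eax,%edx
      andl $x,%edx
  the MOV operands are temporarily widened to %rax/%rdx so that DeepMOVOpt
  can rewrite later reads of %rdx to read %rax instead; if %rdx then falls
  out of use, the original MOV can be removed entirely. }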
{$endif x86_64}

function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
var
  hp1, hp2 : tai;
  MaskLength : Cardinal;
  MaskedBits : TCgInt;
  ActiveReg : TRegister;
begin
  Result:=false;
  { There are no optimisations for reference targets }
  if (taicpu(p).oper[1]^.typ <> top_reg) then
    Exit;
  { Saves on a bunch of dereferences }
  ActiveReg := taicpu(p).oper[1]^.reg;
  while GetNextInstruction(p, hp1) and
        (hp1.typ = ait_instruction) do
    begin
      if (taicpu(p).oper[0]^.typ = top_const) then
        begin
          case taicpu(hp1).opcode of
            A_AND:
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                 (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                 { the second register must contain the first one, so compare their subreg types }
                 (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
                 (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
                { change
                    and const1, reg
                    and const2, reg
                  to
                    and (const1 and const2), reg
                }
                begin
                  taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
                  DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
                  RemoveCurrentP(p, hp1);
                  Result:=true;
                  exit;
                end;
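            { Illustrative example of AndAnd2And (hypothetical registers and
              constants, not from real generated code):
                  andl $0x0F0F,%eax
                  andl $0x00FF,%eax
              becomes:
                  andl $0x000F,%eax
              since $0x0F0F and $0x00FF = $0x000F. }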
            A_CMP:
              if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
                 MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
                 MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
                 { Just check that the condition on the next instruction is compatible }
                 GetNextInstruction(hp1, hp2) and
                 (hp2.typ = ait_instruction) and
                 (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE]) then
                { change
                    and  2^n, reg
                    cmp  2^n, reg
                    j(c) / set(c) / cmov(c)  (c is equal or not equal)
                  to
                    and  2^n, reg
                    test reg, reg
                    j(~c) / set(~c) / cmov(~c)
                }
                begin
                  { Keep TEST instruction in, rather than remove it, because
                    it may trigger other optimisations such as MovAndTest2Test }
                  taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
                  taicpu(hp1).opcode := A_TEST;
                  DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
                  taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
                  Result := True;
                  Exit;
                end
              else if ((taicpu(p).oper[0]^.val=$ff) or (taicpu(p).oper[0]^.val=$ffff) or (taicpu(p).oper[0]^.val=$ffffffff)) and
                 MatchOpType(taicpu(hp1),top_const,top_reg) and
                 (taicpu(p).oper[0]^.val>=taicpu(hp1).oper[0]^.val) and
                 SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) then
                { change
                    and $ff/$ffff/$ffffffff, reg
                    cmp val<=$ff/val<=$ffff/val<=$ffffffff, reg
                    dealloc reg
                  to
                    cmp val<=$ff/val<=$ffff/val<=$ffffffff, resized reg
                }
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AND/CMP -> CMP', p);
                      case taicpu(p).oper[0]^.val of
                        $ff:
                          begin
                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBL);
                            taicpu(hp1).opsize:=S_B;
                          end;
                        $ffff:
                          begin
                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBW);
                            taicpu(hp1).opsize:=S_W;
                          end;
                        $ffffffff:
                          begin
                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                            taicpu(hp1).opsize:=S_L;
                          end;
                        else
                          Internalerror(2023030401);
                      end;
                      RemoveCurrentP(p);
                      Result := True;
                      Exit;
                    end;
                end;
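            { Illustrative example of the AND/CMP/J(c) transformation above
              (hypothetical register; the constant must be a power of two):
                  andl $4,%eax
                  cmpl $4,%eax
                  je   .L1
              becomes:
                  andl  $4,%eax
                  testl %eax,%eax
                  jne   .L1
              because after the AND, %eax is either 0 or 4, so "equal to 4"
              is the same as "not zero". }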
            A_MOVZX:
              if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                 SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
                 (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
                 (
                   (
                     (taicpu(p).opsize=S_W) and
                     (taicpu(hp1).opsize=S_BW)
                   ) or
                   (
                     (taicpu(p).opsize=S_L) and
                     (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
                   )
{$ifdef x86_64}
                   or
                   (
                     (taicpu(p).opsize=S_Q) and
                     (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
                   )
{$endif x86_64}
                 ) then
                begin
                  if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                      ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
                     ) or
                     (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                      ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val)) then
                    begin
                      { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                        32-bit register to a 64-bit register, or even a version called MOVZXD, so
                        code that tests for the presence of AND 0xffffffff followed by MOVZX is
                        wasted, and is indicative of a compiler bug if it were triggered. [Kit]

                        NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                      }
                      DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                      RemoveInstruction(hp1);
                      { See if there are other optimisations possible }
                      Continue;
                    end;
                end;
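            { Illustrative example of AndMovzToAnd (hypothetical register):
                  andl   $15,%eax
                  movzbl %al,%eax
              becomes:
                  andl   $15,%eax
              because the mask already guarantees that everything above the
              low byte is zero, making the zero-extension redundant. }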
            A_SHL:
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                 (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
                begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
                  { get length of potential and mask }
                  MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
                  { really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
                  if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
                     { unmasked part shifted out? }
                     ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
                      RemoveCurrentP(p, hp1);
                      Result:=true;
                      exit;
                    end;
                end;
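            { The intent of AndShlToShl, illustrated with hypothetical
              operands (whether it fires depends on the mask test above):
              in
                  andl $3,%eax
                  shll $30,%eax
              the AND keeps only the low two bits, and the shift then moves
              every bit the mask could have cleared out of the register
              anyway, so the AND can be dropped:
                  shll $30,%eax }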
            A_SHR:
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                 (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
                 (taicpu(hp1).oper[0]^.val <= 63) then
                begin
                  { Does SHR combined with the AND cover all the bits?
                    e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
                  MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
                  if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
                     ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
                     ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
                      RemoveCurrentP(p, hp1);
                      Result := True;
                      Exit;
                    end;
                end;
            A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
              if (taicpu(hp1).oper[0]^.typ = top_reg) and
                 SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
                begin
                  if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
                     (
                       (
                         (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                         ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
                       ) or (
                         (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                         ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
{$ifdef x86_64}
                       ) or (
                         (taicpu(hp1).opsize = S_LQ) and
                         ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
{$endif x86_64}
                       )
                     ) then
                    begin
                      if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
                        begin
                          DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                          RemoveInstruction(hp1);
                          { See if there are other optimisations possible }
                          Continue;
                        end;
                      { The super-registers are the same though.

                        Note that this change by itself doesn't improve
                        code speed, but it opens up other optimisations. }
{$ifdef x86_64}
                      { Convert 64-bit register to 32-bit }
                      case taicpu(hp1).opsize of
                        S_BQ:
                          begin
                            taicpu(hp1).opsize := S_BL;
                            taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
                          end;
                        S_WQ:
                          begin
                            taicpu(hp1).opsize := S_WL;
                            taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
                          end
                        else
                          ;
                      end;
{$endif x86_64}
                      DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
                      taicpu(hp1).opcode := A_MOVZX;
                      { See if there are other optimisations possible }
                      Continue;
                    end;
                end;
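            { Illustrative example of AndMovsxToAnd (hypothetical register):
                  andl   $0x7F,%eax
                  movsbl %al,%eax
              becomes:
                  andl   $0x7F,%eax
              because the mask clears bit 7, so the sign-extension cannot
              change anything.  When only the super-registers match, the
              MOVSX is instead turned into a MOVZX (AndMovsxToAndMovzx) to
              open up further optimisations. }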
            else
              ;
          end;
        end
      else if MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) and
         not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
        begin
{$ifdef x86_64}
          if (taicpu(p).opsize = S_Q) then
            begin
              { Never necessary }
              DebugMsg(SPeepholeOptimization + 'Andq2Nop', p);
              RemoveCurrentP(p, hp1);
              Result := True;
              Exit;
            end;
{$endif x86_64}
          { Forward check to determine necessity of and %reg,%reg }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          case taicpu(hp1).opcode of
            A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
              if (
                   (taicpu(hp1).oper[0]^.typ <> top_ref) or
                   not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                 ) and
                 (
                   (taicpu(hp1).opcode <> A_MOV) or
                   (taicpu(hp1).oper[1]^.typ <> top_ref) or
                   not RegInRef(ActiveReg, taicpu(hp1).oper[1]^.ref^)
                 ) and
                 not (
                   { If mov %reg,%reg is present, remove that instruction instead in OptPass1MOV }
                   (taicpu(hp1).opcode = A_MOV) and
                   MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) and
                   MatchOperand(taicpu(hp1).oper[1]^, ActiveReg)
                 ) and
                 (
                   (
                     (taicpu(hp1).oper[0]^.typ = top_reg) and
                     (taicpu(hp1).oper[0]^.reg = ActiveReg) and
                     SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg)
                   ) or
                   (
{$ifdef x86_64}
                     (
                       { If we read from the register, make sure it's not dependent on the upper 32 bits }
                       (taicpu(hp1).oper[0]^.typ <> top_reg) or
                       not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ActiveReg) or
                       (GetSubReg(taicpu(hp1).oper[0]^.reg) <> R_SUBQ)
                     ) and
{$endif x86_64}
                     not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs)
                   )
                 ) then
                begin
                  DebugMsg(SPeepholeOptimization + 'AndMovx2Movx', p);
                  RemoveCurrentP(p, hp1);
                  Result := True;
                  Exit;
                end;
            A_ADD,
            A_AND,
            A_BSF,
            A_BSR,
            A_BTC,
            A_BTR,
            A_BTS,
            A_OR,
            A_SUB,
            A_XOR:
              { Register is written to, so this will clear the upper 32 bits (2-operand instructions) }
              if (
                   (taicpu(hp1).oper[0]^.typ <> top_ref) or
                   not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                 ) and
                 MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) then
                begin
                  DebugMsg(SPeepholeOptimization + 'AndOp2Op 2', p);
                  RemoveCurrentP(p, hp1);
                  Result := True;
                  Exit;
                end;
            A_CMP,
            A_TEST:
              if (
                   (taicpu(hp1).oper[0]^.typ <> top_ref) or
                   not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                 ) and
                 MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) and
                 not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs) then
                begin
                  DebugMsg(SPeepholeOptimization + 'AND; CMP/TEST -> CMP/TEST', p);
                  RemoveCurrentP(p, hp1);
                  Result := True;
                  Exit;
                end;
            A_BSWAP,
            A_NEG,
            A_NOT:
              { Register is written to, so this will clear the upper 32 bits (1-operand instructions) }
              if MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) then
                begin
                  DebugMsg(SPeepholeOptimization + 'AndOp2Op 1', p);
                  RemoveCurrentP(p, hp1);
                  Result := True;
                  Exit;
                end;
            else
              ;
          end;
        end;
      if (taicpu(hp1).is_jmp) and
         (taicpu(hp1).opcode<>A_JMP) and
         not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
        begin
          { change
              and x, reg
              jxx
            to
              test x, reg
              jxx
            if reg is deallocated before the
            jump, but only if it's a conditional jump (PFV)
          }
          DebugMsg(SPeepholeOptimization + 'AndJcc2TestJcc', p);
          taicpu(p).opcode := A_TEST;
          Exit;
        end;
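      { Illustrative example of AndJcc2TestJcc (hypothetical operands;
        requires %eax to be deallocated before the jump):
            andl $3,%eax
            jne  .L1
        becomes:
            testl $3,%eax
            jne   .L1
        TEST sets the same flags as AND but does not write the register. }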
      Break;
    end;

  { Lone AND tests }
  if (taicpu(p).oper[0]^.typ = top_const) then
    begin
      {
        - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
        - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
        - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
      }
      if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
         ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
         ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
        begin
          taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
          if taicpu(p).opsize = S_L then
            begin
              Include(OptsToCheck,aoc_MovAnd2Mov_3);
              Result := True;
            end;
        end;
    end;
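  { Illustrative example of the conversion above (hypothetical register):
        andl $0xFFFFFFFF,%eax   becomes   andl %eax,%eax
    The register form has a much shorter encoding than the 32-bit
    immediate form while computing the same result and flags. }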
  { Backward check to determine necessity of and %reg,%reg }
  if (taicpu(p).oper[0]^.typ = top_reg) and
     (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
     not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
    begin
      hp2:=p;
      while GetLastInstruction(hp2, hp2) and
            (cs_opt_level3 in current_settings.optimizerswitches) and
            (hp2.typ=ait_instruction) and
            not RegModifiedByInstruction(ActiveReg,hp2) do
        { loop };
      if Assigned(hp2) and
         RegModifiedByInstruction(ActiveReg,hp2) and { Also checks if hp2 is an instruction }
         { Check size of instruction to determine if the AND is effectively
           a null operation }
         (
           (taicpu(p).opsize = taicpu(hp2).opsize) or
           { Note: Don't include S_Q }
           ((taicpu(p).opsize = S_L) and (taicpu(hp2).opsize in [S_BL, S_WL])) or
           ((taicpu(p).opsize = S_W) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_L])) or
           ((taicpu(p).opsize = S_B) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_W, S_L]))
         ) then
        begin
          { AND %reg,%reg is unnecessary to zero the upper 32 bits. }
          DebugMsg(SPeepholeOptimization + 'AND %reg,%reg proven unnecessary after backward search (And2Nop)', p);
          RemoveCurrentP(p, hp1);
          Result:=True;
          Exit;
        end;
    end;
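  { Illustrative example of And2Nop (hypothetical registers; the flags must
    be unused): in
        movzwl %cx,%eax
        ...
        andl   %eax,%eax
    the earlier instruction already wrote all 32 bits of %eax (implicitly
    zeroing the upper half on x86_64), so the AND changes nothing and can
    be removed. }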
end;

function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
var
  hp1, hp2: tai;
  NewRef: TReference;
  Distance: Cardinal;
  TempTracking: TAllUsedRegs;
  DoAddMov2Lea: Boolean;

  { This entire nested function is used in an if-statement below, but we
    want to avoid all the used reg transfers and GetNextInstruction calls
    until we really have to check }
  function MemRegisterNotUsedLater: Boolean; inline;
  var
    hp2: tai;
  begin
    TransferUsedRegs(TmpUsedRegs);
    if (cs_opt_level3 in current_settings.optimizerswitches) then
      UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
    else
      { p and hp1 will be adjacent }
      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
    Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
  end;

begin
  Result := False;
  DoAddMov2Lea:=false;
  if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
     (taicpu(p).oper[1]^.typ = top_reg) then
    begin
      Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
      if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
         (hp1.typ <> ait_instruction) or
         not
         (
           (cs_opt_level3 in current_settings.optimizerswitches) or
           { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
           RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
         ) then
        Exit;
      { Some of the MOV optimisations are much more in-depth.  For example, if we have:
          addq $x, %rax
          movq %rax, %rdx
          sarq $63, %rdx
          (%rax still in use)

        ...letting OptPass2ADD run its course (and without -Os) will produce:
          leaq $x(%rax),%rdx
          addq $x, %rax
          sarq $63, %rdx

        ...which is okay since it breaks the dependency chain between
        addq and movq, but if OptPass2MOV is called first:
          addq $x, %rax
          cqto

        ...which is better in all ways, taking only 2 cycles to execute
        and much smaller in code size.
      }
      { The extra register tracking is quite strenuous }
      if (cs_opt_level2 in current_settings.optimizerswitches) and
         MatchInstruction(hp1, A_MOV, []) then
        begin
          { Update the register tracking to the MOV instruction }
          CopyUsedRegs(TempTracking);
          if (cs_opt_level3 in current_settings.optimizerswitches) then
            UpdateUsedRegsBetween(UsedRegs, p, hp1)
          else
            { p and hp1 will be adjacent }
            UpdateUsedRegs(UsedRegs, tai(p.Next));
          hp2 := hp1;
          Include(OptsToCheck, aoc_MovlMovq2MovlMovl);
          if OptPass2MOV(hp1) then
            Include(OptsToCheck, aoc_ForceNewIteration);
          Exclude(OptsToCheck, aoc_MovlMovq2MovlMovl);
          { Reset the tracking to the current instruction }
          RestoreUsedRegs(TempTracking);
          ReleaseUsedRegs(TempTracking);
          { if hp1 <> hp2 after the call, then hp1 got removed, so let
            OptPass2ADD get called again }
          if (hp1 <> hp2) then
            begin
              Result := True;
              Exit;
            end;
        end;
      { Change:
          add %reg2,%reg1
          (%reg2 not modified in between)
          mov/s/z #(%reg1),%reg1  (%reg1 superregisters must be the same)
        To:
          mov/s/z #(%reg1,%reg2),%reg1
      }
      if (taicpu(p).oper[0]^.typ = top_reg) and
         MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
         MatchOpType(taicpu(hp1), top_ref, top_reg) and
         (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
         (
           (
             (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
             (taicpu(hp1).oper[0]^.ref^.index = NR_NO) and
             { r/esp cannot be an index }
             (taicpu(p).oper[0]^.reg<>NR_STACK_POINTER_REG)
           ) or (
             (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
             (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
           )
         ) and (
           Reg1WriteOverwritesReg2Entirely(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) or
           (
             { If the super registers ARE equal, then this MOV/S/Z does a partial write }
             not SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
             MemRegisterNotUsedLater
           )
         ) then
        begin
          if (
               { Instructions are guaranteed to be adjacent on -O2 and under }
               (cs_opt_level3 in current_settings.optimizerswitches) and
               RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
             ) then
            begin
              { If the other register is used in between, move the MOV
                instruction to right after the ADD instruction so a
                saving can still be made }
              Asml.Remove(hp1);
              Asml.InsertAfter(hp1, p);
              taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
              taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
              DebugMsg(SPeepholeOptimization + 'AddMov2Mov done (instruction moved)', p);
              RemoveCurrentp(p, hp1);
            end
          else
            begin
              AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
              taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
              taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
              DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                { hp1 may not be the immediate next instruction under -O3 }
                RemoveCurrentp(p)
              else
                RemoveCurrentp(p, hp1);
            end;
          Result := True;
          Exit;
        end;
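      { Illustrative example of AddMov2Mov (hypothetical registers):
            addq %rdx,%rax
            movq (%rax),%rax
        becomes:
            movq (%rax,%rdx),%rax
        folding the addition into the addressing mode. }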
      { Change:
          addl/q $x,%reg1
          movl/q %reg1,%reg2
        To:
          leal/q $x(%reg1),%reg2
          addl/q $x,%reg1  (can be removed if %reg1 or the flags are not used afterwards)

        Breaks the dependency chain.
      }
      if (taicpu(p).oper[0]^.typ = top_const) and
         MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
         (taicpu(hp1).oper[1]^.typ = top_reg) and
         MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
         (
           { Instructions are guaranteed to be adjacent on -O2 and under }
           not (cs_opt_level3 in current_settings.optimizerswitches) or
           (
             { If the flags are used, don't make the optimisation,
               otherwise they will be scrambled.  Fixes #41148 }
             (
               not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) or
               not RegUsedBetween(NR_DEFAULTFLAGS, p, hp1)
             ) and
             not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
           )
         ) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          if (cs_opt_level3 in current_settings.optimizerswitches) then
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
          else
            { p and hp1 will be adjacent }
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if (
               SetAndTest(
                 (
                   not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
                   not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
                 ),
                 DoAddMov2Lea
               ) or
               { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
               not (cs_opt_size in current_settings.optimizerswitches)
             ) then
            begin
              { Change the MOV instruction to a LEA instruction, and update the
                first operand }
              reference_reset(NewRef, 1, []);
              NewRef.base := taicpu(p).oper[1]^.reg;
              NewRef.scalefactor := 1;
              { if the destination reg is the same as the ADD register,
                and we keep the ADD instruction, do not add the offset
                in the LEA instruction, otherwise the reg gets increased
                by 2 times the offset value }
              if DoAddMov2Lea or not MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^.reg) then
                NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
              taicpu(hp1).opcode := A_LEA;
              taicpu(hp1).loadref(0, NewRef);
              if DoAddMov2Lea then
                begin
                  { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
                  DebugMsg(SPeepholeOptimization + 'AddMov2Lea', hp1);
                  if (cs_opt_level3 in current_settings.optimizerswitches) then
                    { hp1 may not be the immediate next instruction under -O3 }
                    RemoveCurrentp(p)
                  else
                    RemoveCurrentp(p, hp1);
                end
              else
                begin
                  hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
                  { Move what is now the LEA instruction to before the ADD instruction }
                  Asml.Remove(hp1);
                  Asml.InsertBefore(hp1, p);
                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
                  DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
                  p := hp1;
                end;
              Result := True;
            end;
        end;
    end;
end;

function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
var
  SubReg: TSubRegister;
  hp1, hp2: tai;
  CallJmp: Boolean;
begin
  Result := False;
  CallJmp := False;
  SubReg := getsubreg(taicpu(p).oper[1]^.reg);
  if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
    with taicpu(p).oper[0]^.ref^ do
      if not Assigned(symbol) and not Assigned(relsymbol) and (index <> NR_NO) then
        if (offset = 0) then
          begin
            if (scalefactor <= 1) and SuperRegistersEqual(base, taicpu(p).oper[1]^.reg) then
              begin
                taicpu(p).loadreg(0, newreg(R_INTREGISTER, getsupreg(index), SubReg));
                taicpu(p).opcode := A_ADD;
                DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
                Result := True;
              end
            else if SuperRegistersEqual(index, taicpu(p).oper[1]^.reg) then
              begin
                if (base <> NR_NO) then
                  begin
                    if (scalefactor <= 1) then
                      begin
                        taicpu(p).loadreg(0, newreg(R_INTREGISTER, getsupreg(base), SubReg));
                        taicpu(p).opcode := A_ADD;
                        DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
                        Result := True;
                      end;
                  end
                else
                  { Convert lea (%reg,2^x),%reg to shl x,%reg }
                  if (scalefactor in [2, 4, 8]) then
                    begin
                      { BsrByte is, in essence, the base-2 logarithm of the scale factor }
                      taicpu(p).loadconst(0, BsrByte(scalefactor));
                      taicpu(p).opcode := A_SHL;
                      DebugMsg(SPeepholeOptimization + 'Lea2Shl done',p);
                      Result := True;
                    end;
              end;
          end
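        { Illustrative examples of the three conversions above
          (hypothetical registers; the flags must not be in use):
              leaq (%rax,%rdx),%rax  ->  addq %rdx,%rax   (Lea2AddBase)
              leaq (%rdx,%rax),%rax  ->  addq %rdx,%rax   (Lea2AddIndex)
              leaq (,%rax,4),%rax    ->  shlq $2,%rax     (Lea2Shl)
        }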
        { lea x(%reg1,%reg2),%reg3 and lea x(symbol,%reg2),%reg3 have a
          lot of latency, so break off the offset if %reg3 is used soon
          afterwards }
        else if not (cs_opt_size in current_settings.optimizerswitches) and
           { If 3-component addresses don't have additional latency, don't
             perform this optimisation }
           not (CPUX86_HINT_FAST_3COMP_ADDR in cpu_optimization_hints[current_settings.optimizecputype]) and
           GetNextInstruction(p, hp1) and
           (hp1.typ = ait_instruction) and
           (
             (
               { Permit jumps and calls since they have a larger degree of overhead }
               (
                 not SetAndTest(is_calljmp(taicpu(hp1).opcode), CallJmp) or
                 (
                   { ... unless the register specifies the location }
                   (taicpu(hp1).ops > 0) and
                   RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
                 )
               ) and
               (
                 not CallJmp and { Use the Boolean result to avoid calling "is_calljmp" twice }
                 RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
               )
             )
             or
             (
               { Check up to two instructions ahead }
               GetNextInstruction(hp1, hp2) and
               (hp2.typ = ait_instruction) and
               (
                 not SetAndTest(is_calljmp(taicpu(hp2).opcode), CallJmp) or
                 (
                   { Same as above }
                   (taicpu(hp2).ops > 0) and
                   RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp2).oper[0]^)
                 )
               ) and
               (
                 not CallJmp and { Use the Boolean result to avoid calling "is_calljmp" twice }
                 RegInInstruction(taicpu(p).oper[1]^.reg, hp2)
               )
             )
           ) then
          begin
            { Offset will be a 32-bit signed integer, so it's safe to use in the 64-bit version of ADD }
            hp2 := taicpu.op_const_reg(A_ADD, taicpu(p).opsize, offset, taicpu(p).oper[1]^.reg);
            taicpu(hp2).fileinfo := taicpu(p).fileinfo;
            offset := 0;
            if Assigned(symbol) or Assigned(relsymbol) then
              DebugMsg(SPeepholeOptimization + 'lea x(sym,%reg1),%reg2 -> lea(sym,%reg1),%reg2; add $x,%reg2 to minimise instruction latency (Lea2LeaAdd)', p)
            else
              DebugMsg(SPeepholeOptimization + 'lea x(%reg1,%reg2),%reg3 -> lea(%reg1,%reg2),%reg3; add $x,%reg3 to minimise instruction latency (Lea2LeaAdd)', p);
            { Inserting before the next instruction rather than after the
              current instruction gives more accurate register tracking }
            asml.InsertBefore(hp2, hp1);
            AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp2, UsedRegs);
            Result := True;
          end;
end;

function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
var
  hp1, hp2: tai;
  NewRef: TReference;
  Distance: Cardinal;
  TempTracking: TAllUsedRegs;
  DoSubMov2Lea: Boolean;
begin
  Result := False;
  DoSubMov2Lea:=false;
  if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
     MatchOpType(taicpu(p),top_const,top_reg) then
    begin
      Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
      if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
         (hp1.typ <> ait_instruction) or
         not
         (
           (cs_opt_level3 in current_settings.optimizerswitches) or
           { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
           RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
         ) then
        Exit;
      { Some of the MOV optimisations are much more in-depth.  For example, if we have:
          subq $x, %rax
          movq %rax, %rdx
          sarq $63, %rdx
          (%rax still in use)

        ...letting OptPass2SUB run its course (and without -Os) will produce:
          leaq $-x(%rax),%rdx
          subq $x, %rax
          sarq $63, %rdx

        ...which is okay since it breaks the dependency chain between
        subq and movq, but if OptPass2MOV is called first:
          subq $x, %rax
          cqto

        ...which is better in all ways, taking only 2 cycles to execute
        and much smaller in code size.
      }
      { The extra register tracking is quite strenuous }
      if (cs_opt_level2 in current_settings.optimizerswitches) and
         MatchInstruction(hp1, A_MOV, []) then
        begin
          { Update the register tracking to the MOV instruction }
          CopyUsedRegs(TempTracking);
          if (cs_opt_level3 in current_settings.optimizerswitches) then
            UpdateUsedRegsBetween(UsedRegs, p, hp1)
          else
            { p and hp1 will be adjacent }
            UpdateUsedRegs(UsedRegs, tai(p.Next));
          hp2 := hp1;
          Include(OptsToCheck, aoc_MovlMovq2MovlMovl);
          if OptPass2MOV(hp1) then
            Include(OptsToCheck, aoc_ForceNewIteration);
          Exclude(OptsToCheck, aoc_MovlMovq2MovlMovl);
          { Reset the tracking to the current instruction }
          RestoreUsedRegs(TempTracking);
          ReleaseUsedRegs(TempTracking);
          { if hp1 <> hp2 after the call, then hp1 got removed, so let
            OptPass2SUB get called again }
          if (hp1 <> hp2) then
            begin
              Result := True;
              Exit;
            end;
        end;
      { Change:
          subl/q $x,%reg1
          movl/q %reg1,%reg2
        To:
          leal/q $-x(%reg1),%reg2
          subl/q $x,%reg1  (can be removed if %reg1 or the flags are not used afterwards)

        Breaks the dependency chain and potentially permits the removal of
        a CMP instruction if one follows.
      }
      if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
         (taicpu(hp1).oper[1]^.typ = top_reg) and
         MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
         (
           { Instructions are guaranteed to be adjacent on -O2 and under }
           not (cs_opt_level3 in current_settings.optimizerswitches) or
           (
             { If the flags are used, don't make the optimisation,
               otherwise they will be scrambled.  Fixes #41148 }
             (
               not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) or
               not RegUsedBetween(NR_DEFAULTFLAGS, p, hp1)
             ) and
             not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
           )
         ) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          if (cs_opt_level3 in current_settings.optimizerswitches) then
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
          else
            { p and hp1 will be adjacent }
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if (
               SetAndTest(
                 (
                   not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
                   not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
                 ),
                 DoSubMov2Lea
               ) or
               { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
               not (cs_opt_size in current_settings.optimizerswitches)
             ) then
            begin
              { Change the MOV instruction to a LEA instruction, and update the
                first operand }
              reference_reset(NewRef, 1, []);
              NewRef.base := taicpu(p).oper[1]^.reg;
              NewRef.scalefactor := 1;
              { if the destination reg is the same as the SUB register,
                and we keep the SUB instruction, do not subtract the offset
                in the LEA instruction, otherwise the reg gets decreased
                by 2 times the offset value }
              if DoSubMov2Lea or not MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^.reg) then
                NewRef.offset := -taicpu(p).oper[0]^.val;
              taicpu(hp1).opcode := A_LEA;
              taicpu(hp1).loadref(0, NewRef);
              if DoSubMov2Lea then
                begin
                  { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
                  DebugMsg(SPeepholeOptimization + 'SubMov2Lea', hp1);
                  if (cs_opt_level3 in current_settings.optimizerswitches) then
                    { hp1 may not be the immediate next instruction under -O3 }
                    RemoveCurrentp(p)
                  else
                    RemoveCurrentp(p, hp1);
                end
              else
                begin
                  hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
                  { Move what is now the LEA instruction to before the SUB instruction }
                  Asml.Remove(hp1);
                  Asml.InsertBefore(hp1, p);
                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
                  DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
                  p := hp1;
                end;
              Result := True;
            end;
        end;
    end;
end;

function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
begin
  { we can skip all instructions not messing with the stack pointer }
  while assigned(hp1) and {MatchInstruction(hp1,[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
        A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
        A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
        A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
        ({(taicpu(hp1).ops=0) or }
         ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
           (MatchOpType(taicpu(hp1),top_ref,top_reg))
          ) and }
          not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
         )
        ) do
    GetNextInstruction(hp1,hp1);
  Result:=assigned(hp1);
end;

function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
var
  hp1, hp2, hp3, hp4, hp5, hp6, hp7, hp8: tai;
begin
  Result:=false;
{$ifdef x86_64}
  { Change:
      lea x(%reg1d,%reg2d),%reg3d
    To:
      lea x(%reg1q,%reg2q),%reg3d

    Reduces the number of bytes of machine code
  }
  if (getsubreg(taicpu(p).oper[1]^.reg)=R_SUBD) and
     (
       (getsubreg(taicpu(p).oper[0]^.ref^.base)=R_SUBD) or
       (getsubreg(taicpu(p).oper[0]^.ref^.index)=R_SUBD)
     ) then
    begin
      DebugMsg(SPeepholeOptimization + 'Changed 32-bit registers in reference to 64-bit (reduces instruction size)', p);
      if (getsubreg(taicpu(p).oper[0]^.ref^.base)=R_SUBD) then
        setsubreg(taicpu(p).oper[0]^.ref^.base,R_SUBQ);
      if (getsubreg(taicpu(p).oper[0]^.ref^.index)=R_SUBD) then
        setsubreg(taicpu(p).oper[0]^.ref^.index,R_SUBQ);
      { No reason to set Result to true }
    end;
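  { Illustrative example (hypothetical registers):
        leal 4(%eax,%edx),%ecx  ->  leal 4(%rax,%rdx),%ecx
    Only the low 32 bits of the computed address are written, so the
    result is identical, but the 64-bit form avoids the 0x67 address-size
    prefix, saving a byte. }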
{$endif x86_64}
  hp5:=nil;
  hp6:=nil;
  hp7:=nil;
  hp8:=nil;
  { replace
      leal(q) x(<stackpointer>),<stackpointer>
      <optional .seh_stackalloc ...>
      <optional .seh_endprologue ...>
      call procname
      <optional NOP>
      leal(q) -x(<stackpointer>),<stackpointer>
      <optional VZEROUPPER>
      ret
    by
      jmp procname

    but do it only on level 4 because it destroys stack back traces
  }
  if (cs_opt_level4 in current_settings.optimizerswitches) and
     (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
     (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
     (taicpu(p).oper[0]^.ref^.index=NR_NO) and
     { the -8, -24, -40 are not required, but bail out early if possible,
       higher values are unlikely }
     ((taicpu(p).oper[0]^.ref^.offset=-8) or
      (taicpu(p).oper[0]^.ref^.offset=-24) or
      (taicpu(p).oper[0]^.ref^.offset=-40)) and
     (taicpu(p).oper[0]^.ref^.symbol=nil) and
     (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
     GetNextInstruction(p, hp1) and
     { Take a copy of hp1 }
     SetAndTest(hp1, hp4) and
     { trick to skip label }
     ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp7) and GetNextInstruction(hp1, hp1))) and
     { skip directives, .seh_stackalloc and .seh_endprologue on windows
     ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp7) and GetNextInstruction(hp1, hp1))) and
     ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp8) and GetNextInstruction(hp1, hp1))) and }
     SkipSimpleInstructions(hp1) and
     MatchInstruction(hp1,A_CALL,[S_NO]) and
     GetNextInstruction(hp1, hp2) and
     (MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) or
      { skip nop instruction on win64 }
      (MatchInstruction(hp2,A_NOP,[S_NO]) and
       SetAndTest(hp2,hp6) and
       GetNextInstruction(hp2,hp2) and
       MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]))
     ) and
     (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
     (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
     (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
     (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
     (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
     (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
     { Segment register will be NR_NO }
     GetNextInstruction(hp2, hp3) and
     { trick to skip label }
     ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
     (MatchInstruction(hp3,A_RET,[S_NO]) or
      (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
       SetAndTest(hp3,hp5) and
       GetNextInstruction(hp3,hp3) and
       MatchInstruction(hp3,A_RET,[S_NO])
      )
     ) and
     (taicpu(hp3).ops=0) then
    begin
      taicpu(hp1).opcode := A_JMP;
      taicpu(hp1).is_jmp := true;
      DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
      { search for the stackalloc directive and remove it }
      hp7:=tai(p.next);
      while assigned(hp7) and (tai(hp7).typ<>ait_instruction) do
        begin
          if (hp7.typ=ait_seh_directive) and (tai_seh_directive(hp7).kind=ash_stackalloc) then
            begin
              { sanity check }
              if taicpu(p).oper[0]^.ref^.offset<>-tai_seh_directive(hp7).data.offset then
                Internalerror(2024012201);
              hp8:=tai(hp7.next);
              RemoveInstruction(tai(hp7));
              hp7:=hp8;
              break;
            end
          else
            hp7:=tai(hp7.next);
        end;
      RemoveCurrentP(p, hp4);
      RemoveInstruction(hp2);
      RemoveInstruction(hp3);
      { if there is a vzeroupper instruction then move it before the jmp }
      if Assigned(hp5) then
        begin
          AsmL.Remove(hp5);
          ASmL.InsertBefore(hp5,hp1)
        end;
      { remove nop on win64 }
      if Assigned(hp6) then
        RemoveInstruction(hp6);
      Result:=true;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
{$ifdef x86_64}
var
  hp1, hp2, hp3, hp4, hp5: tai;
{$endif x86_64}
begin
  Result:=false;
{$ifdef x86_64}
  hp5:=nil;
  { replace
      push %rax
      call procname
      pop %rcx
      ret
    by
      jmp procname

    but do it only on level 4 because it destroys stack back traces.
    It depends on the fact that the sequence push rax/pop rcx is used
    for stack alignment, as rcx is volatile for all supported calling
    conventions }
  if (cs_opt_level4 in current_settings.optimizerswitches) and
     MatchOpType(taicpu(p),top_reg) and
     (taicpu(p).oper[0]^.reg=NR_RAX) and
     GetNextInstruction(p, hp1) and
     { Take a copy of hp1 }
     SetAndTest(hp1, hp4) and
     { trick to skip label }
     ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
     SkipSimpleInstructions(hp1) and
     MatchInstruction(hp1,A_CALL,[S_NO]) and
     GetNextInstruction(hp1, hp2) and
     MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
     MatchOpType(taicpu(hp2),top_reg) and
     (taicpu(hp2).oper[0]^.reg=NR_RCX) and
     GetNextInstruction(hp2, hp3) and
     { trick to skip label }
     ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
     (MatchInstruction(hp3,A_RET,[S_NO]) or
      (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
       SetAndTest(hp3,hp5) and
       GetNextInstruction(hp3,hp3) and
       MatchInstruction(hp3,A_RET,[S_NO])
      )
     ) and
     (taicpu(hp3).ops=0) then
    begin
      taicpu(hp1).opcode := A_JMP;
      taicpu(hp1).is_jmp := true;
      DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
      RemoveCurrentP(p, hp4);
      RemoveInstruction(hp2);
      RemoveInstruction(hp3);
      if Assigned(hp5) then
        begin
          AsmL.Remove(hp5);
          ASmL.InsertBefore(hp5,hp1)
        end;
      Result:=true;
    end;
{$endif x86_64}
end;

function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
var
  Value, RegName: string;
  hp1: tai;
begin
  Result:=false;
  if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
    begin
      case taicpu(p).oper[0]^.val of
        0:
          { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
          if not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs) or
             (
               { See if we can still convert the instruction }
               GetNextInstructionUsingReg(p, hp1, NR_DEFAULTFLAGS) and
               RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1)
             ) then
            begin
              { change "mov $0,%reg" into "xor %reg,%reg" }
              taicpu(p).opcode := A_XOR;
              taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
              Result := True;
{$ifdef x86_64}
            end
          else if (taicpu(p).opsize = S_Q) then
            begin
              RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
              { The actual optimization }
              setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
              taicpu(p).changeopsize(S_L);
              DebugMsg(SPeepholeOptimization + 'movq $0,' + RegName + ' -> movl $0,' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
              Result := True;
            end;
        $1..$FFFFFFFF:
          begin
            { Code size reduction by J. Gareth "Kit" Moreton }
            { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
            case taicpu(p).opsize of
              S_Q:
                begin
                  RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                  Value := debug_tostr(taicpu(p).oper[0]^.val);
                  { The actual optimization }
                  setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                  taicpu(p).changeopsize(S_L);
                  DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                  Result := True;
                end;
              else
                { Do nothing };
            end;
{$endif x86_64}
          end;
        -1:
          { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
          if (cs_opt_size in current_settings.optimizerswitches) and
             (taicpu(p).opsize <> S_B) and
             (
               not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs) or
               (
                 { See if we can still convert the instruction }
                 GetNextInstructionUsingReg(p, hp1, NR_DEFAULTFLAGS) and
                 RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1)
               )
             ) then
            begin
              { change "mov $-1,%reg" into "or $-1,%reg" }
              { NOTES:
                - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                - This operation creates a false dependency on the register, so only do it when optimising for size
                - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
              }
              taicpu(p).opcode := A_OR;
              DebugMsg(SPeepholeOptimization + 'Mov-12Or-1',p);
              Result := True;
            end;
        else
          { Do nothing };
      end;
    end;
end;

{ Returns true if the given logic instruction can be converted into a BTx instruction (BT not included) }
class function TX86AsmOptimizer.IsBTXAcceptable(p : tai) : boolean;
begin
  Result := False;
  if not (CPUX86_HAS_BTX in cpu_capabilities[current_settings.optimizecputype]) then
    Exit;
  { For sizes less than S_L, the byte size is equal or larger with BTx,
    so don't bother optimising }
  if not MatchInstruction(p, A_AND, A_OR, A_XOR, [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
    Exit;
  if (taicpu(p).oper[0]^.typ <> top_const) or
     { If the value can fit into an 8-bit signed integer, a smaller
       instruction can be encoded with AND/OR/XOR, so don't optimise if it
       falls within this range }
     (
       (taicpu(p).oper[0]^.val > -128) and
       (taicpu(p).oper[0]^.val <= 127)
     ) then
    Exit;
  { If we're optimising for size, this is acceptable }
  if (cs_opt_size in current_settings.optimizerswitches) then
    Exit(True);
  if (taicpu(p).oper[1]^.typ = top_reg) and
     (CPUX86_HINT_FAST_BTX_REG_IMM in cpu_optimization_hints[current_settings.optimizecputype]) then
    Exit(True);
  if (taicpu(p).oper[1]^.typ <> top_reg) and
     (CPUX86_HINT_FAST_BTX_MEM_IMM in cpu_optimization_hints[current_settings.optimizecputype]) then
    Exit(True);
end;

function TX86AsmOptimizer.PostPeepholeOptAnd(var p : tai) : boolean;
var
  hp1: tai;
  Value: TCGInt;
begin
  Result := False;
  if MatchOpType(taicpu(p), top_const, top_reg) then
    begin
      { Detect:
          andw   x, %ax  (0 <= x < $8000)
          ...
          movzwl %ax,%eax

        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
      }
      if (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
         ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val) and
         GetNextInstructionUsingReg(p, hp1, NR_EAX) and
         MatchInstruction(hp1, A_MOVZX, [S_WL]) and
         MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
         MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
        begin
          DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via AndMovz2AndCwtl)', hp1);
          taicpu(hp1).opcode := A_CWDE;
          taicpu(hp1).clearop(0);
          taicpu(hp1).clearop(1);
          taicpu(hp1).ops := 0;
          { A change was made, but not with p, so don't set Result, but
            notify the compiler that a change was made }
          Include(OptsToCheck, aoc_ForceNewIteration);
          Exit; { and -> btr won't happen because an opsize of S_W won't be optimised anyway }
        end;
    end;
  { If "not x" is a power of 2 (popcnt = 1), change:
      and $x, %reg/ref
    To:
      btr lb(x), %reg/ref
  }
  if IsBTXAcceptable(p) and
     (
       { Make sure a TEST doesn't follow that plays with the register }
       not GetNextInstruction(p, hp1) or
       not MatchInstruction(hp1, A_TEST, A_CMP, [taicpu(p).opsize]) or
       not MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg)
     ) then
    begin
{$push}{$R-}{$Q-}
      { Value is a sign-extended 32-bit integer - just correct it
        if it's represented as an unsigned value.  Also, IsBTXAcceptable
        checks to see if this operand is an immediate. }
      Value := not taicpu(p).oper[0]^.val;
{$pop}
{$ifdef x86_64}
      if taicpu(p).opsize = S_L then
{$endif x86_64}
        Value := Value and $FFFFFFFF;
      if (PopCnt(QWord(Value)) = 1) then
        begin
          DebugMsg(SPeepholeOptimization + 'Changed AND (not $' + debug_hexstr(taicpu(p).oper[0]^.val) + ') to BTR $' + debug_tostr(BsrQWord(Value)) + ' to shrink instruction size (And2Btr)', p);
          taicpu(p).opcode := A_BTR;
          taicpu(p).oper[0]^.val := BsrQWord(Value); { Essentially the base 2 logarithm }
          Result := True;
          Exit;
        end;
    end;
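  { Illustrative example of And2Btr (hypothetical operands): clearing a
    single bit whose mask does not fit in a signed byte, e.g.
        andl $0xFFEFFFFF,%eax   ->   btrl $20,%eax
    trades a 4-byte immediate for a 1-byte bit index. }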
end;

function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
begin
  Result := False;
  if not MatchOpType(taicpu(p), top_reg, top_reg) then
    Exit;
  { Convert:
      movswl %ax,%eax  -> cwtl
      movslq %eax,%rax -> cdqe

    NOTE: Don't convert movsbw %al,%ax to cbw, because cbw and cwde
      refer to the same opcode and depend only on the assembler's
      current operand-size attribute. [Kit]
  }
  with taicpu(p) do
    case opsize of
      S_WL:
        if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
          begin
            DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
            opcode := A_CWDE;
            clearop(0);
            clearop(1);
            ops := 0;
            Result := True;
          end;
{$ifdef x86_64}
      S_LQ:
        if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
          begin
            DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
            opcode := A_CDQE;
            clearop(0);
            clearop(1);
            ops := 0;
            Result := True;
          end;
{$endif x86_64}
      else
        ;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
var
  hp1: tai;
begin
  Result := False;
  { All these optimisations work on "shr const,%reg" }
  if not MatchOpType(taicpu(p), top_const, top_reg) then
    Exit;
  if HandleSHRMerge(p, True) then
    begin
      Result := True;
      Exit;
    end;
  { Detect the following (looking backwards):
      shr %cl,%reg
      shr x,  %reg

    Swap the two SHR instructions to minimise a pipeline stall.
  }
  if GetLastInstruction(p, hp1) and
     MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
     MatchOpType(taicpu(hp1), top_reg, top_reg) and
     { First operand will be %cl }
     (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
     { Just to be sure }
     (getsupreg(taicpu(hp1).oper[1]^.reg) <> RS_ECX) then
    begin
      DebugMsg(SPeepholeOptimization + 'Swapped variable and constant SHR instructions to minimise pipeline stall (ShrShr2ShrShr)', hp1);
      { Moving the entries this way ensures the register tracking remains correct }
      Asml.Remove(p);
      Asml.InsertBefore(p, hp1);
      p := hp1;
      { Don't set Result to True because the current instruction is now
        "shr %cl,%reg" and there's nothing more we can do with it }
    end;
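  { Illustrative result of ShrShr2ShrShr (hypothetical register):
        shrl %cl,%edx          shrl $3,%edx
        shrl $3,%edx    ->     shrl %cl,%edx
    The constant shift no longer has to wait for the variable shift, and
    the final value is unchanged because logical right shifts commute. }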
  15662. end;
  15663. function TX86AsmOptimizer.PostPeepholeOptADDSUB(var p : tai) : boolean;
  15664. var
  15665. hp1, hp2: tai;
  15666. Opposite, SecondOpposite: TAsmOp;
  15667. NewCond: TAsmCond;
  15668. begin
  15669. Result := False;
  15670. { Change:
  15671. add/sub 128,(dest)
  15672. To:
  15673. sub/add -128,(dest)
  15674. This generaally takes fewer bytes to encode because -128 can be stored
  15675. in a signed byte, whereas +128 cannot.
  15676. }
  15677. if (taicpu(p).opsize <> S_B) and MatchOperand(taicpu(p).oper[0]^, 128) then
  15678. begin
  15679. if taicpu(p).opcode = A_ADD then
  15680. Opposite := A_SUB
  15681. else
  15682. Opposite := A_ADD;
  15683. { Be careful if the flags are in use, because the CF flag inverts
  15684. when changing from ADD to SUB and vice versa }
  15685. if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
  15686. GetNextInstruction(p, hp1) then
  15687. begin
  15688. TransferUsedRegs(TmpUsedRegs);
  15689. TmpUsedRegs[R_SPECIALREGISTER].Update(tai(p.Next), True);
  15690. hp2 := hp1;
  15691. { Scan ahead to check if everything's safe }
  15692. while Assigned(hp1) and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) do
  15693. begin
  15694. if (hp1.typ <> ait_instruction) then
  15695. { Probably unsafe since the flags are still in use }
  15696. Exit;
  15697. if MatchInstruction(hp1, A_CALL, A_JMP, A_RET, []) then
  15698. { Stop searching at an unconditional jump }
  15699. Break;
  15700. if not
  15701. (
  15702. MatchInstruction(hp1, A_ADC, A_SBB, []) and
  15703. (taicpu(hp1).oper[0]^.typ = top_const) { We need to be able to invert a constant }
  15704. ) and
  15705. (taicpu(hp1).condition = C_None) and RegInInstruction(NR_DEFAULTFLAGS, hp1) then
  15706. { Instruction depends on FLAGS (and is not ADC or SBB); break out }
  15707. Exit;
  15708. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  15709. TmpUsedRegs[R_SPECIALREGISTER].Update(tai(hp1.Next), True);
  15710. { Move to the next instruction }
  15711. GetNextInstruction(hp1, hp1);
  15712. end;
  15713. while Assigned(hp2) and (hp2 <> hp1) do
  15714. begin
  15715. NewCond := C_None;
  15716. case taicpu(hp2).condition of
  15717. C_A, C_NBE:
  15718. NewCond := C_BE;
  15719. C_B, C_C, C_NAE:
  15720. NewCond := C_AE;
  15721. C_AE, C_NB, C_NC:
  15722. NewCond := C_B;
  15723. C_BE, C_NA:
  15724. NewCond := C_A;
  15725. else
  15726. { No change needed };
  15727. end;
              if NewCond <> C_None then
                begin
                  DebugMsg(SPeepholeOptimization + 'Condition changed from ' + cond2str[taicpu(hp2).condition] + ' to ' + cond2str[NewCond] +
                    ' to accommodate ' + debug_op2str(taicpu(p).opcode) + ' -> ' + debug_op2str(Opposite) + ' above', hp2);
                  taicpu(hp2).condition := NewCond;
                end
              else if MatchInstruction(hp2, A_ADC, A_SBB, []) then
                begin
                  { Because of the flipping of the carry bit, to ensure
                    the operation remains equivalent, ADC becomes SBB
                    and vice versa, and the constant is bitwise-inverted
                    (NOT). If multiple ADCs or SBBs appear in a row, each
                    one changed causes the carry bit to invert, so they
                    all need to be flipped }
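                  { Worked example (illustrative): with the carry now inverted
                    (C' = 1 - C), "adcl $5,%edx" (dest + 5 + C) becomes
                    "sbbl $-6,%edx", i.e. dest - (-6) - C' = dest + 6 - (1 - C)
                    = dest + 5 + C, since not(5) = -6 in two's complement }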
                  if taicpu(hp2).opcode = A_ADC then
                    SecondOpposite := A_SBB
                  else
                    SecondOpposite := A_ADC;
                  if taicpu(hp2).oper[0]^.typ <> top_const then
                    { Should have broken out of this optimisation already }
                    InternalError(2021112901);
                  DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(hp2).opcode) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' -> ' +
                    debug_op2str(SecondOpposite) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(not taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' to accommodate inverted carry bit', hp2);
                  { Bit-invert the constant (effectively equivalent to "-1 - val") }
                  taicpu(hp2).opcode := SecondOpposite;
                  taicpu(hp2).oper[0]^.val := not taicpu(hp2).oper[0]^.val;
                end;
              { Move to the next instruction }
              GetNextInstruction(hp2, hp2);
            end;
          if (hp2 <> hp1) then
            InternalError(2021111501);
        end;
      DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(p).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $128,' + debug_operstr(taicpu(p).oper[1]^) + ' changed to ' +
        debug_op2str(Opposite) + debug_opsize2str(taicpu(p).opsize) + ' $-128,' + debug_operstr(taicpu(p).oper[1]^) + ' to reduce instruction size', p);
      taicpu(p).opcode := Opposite;
      taicpu(p).oper[0]^.val := -128;
      { No further optimisations can be made on this instruction, so move
        on to the next one to save time }
      p := tai(p.Next);
      UpdateUsedRegs(p);
      Result := True;
      Exit;
    end;
  { Detect:
      add/sub %reg2,(dest)
      add/sub x, (dest)
    (dest can be a register or a reference)
    Swap the instructions to minimise a pipeline stall. This reverses the
    "Add swap" and "Sub swap" optimisations done in pass 1 if no new
    optimisations could be made.
  }
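  { Illustrative (arbitrary registers):
      addl %edx,%eax            addl $4,%eax
      addl $4,%eax    becomes   addl %edx,%eax
    so the constant addition need not wait for %edx to be computed }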
  if (taicpu(p).oper[0]^.typ = top_reg) and
    not RegInOp(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^) and
    (
      (
        (taicpu(p).oper[1]^.typ = top_reg) and
        { We can try searching further ahead if we're writing to a register }
        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg)
      ) or
      (
        (taicpu(p).oper[1]^.typ = top_ref) and
        GetNextInstruction(p, hp1)
      )
    ) and
    MatchInstruction(hp1, A_ADD, A_SUB, [taicpu(p).opsize]) and
    (taicpu(hp1).oper[0]^.typ = top_const) and
    MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) then
    begin
      { Make doubly sure the flags aren't in use because the order of additions may affect them }
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      hp2 := p;
      while not (cs_opt_level3 in current_settings.optimizerswitches) and
        GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
        UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
      if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
        begin
          asml.remove(hp1);
          asml.InsertBefore(hp1, p);
          DebugMsg(SPeepholeOptimization + 'Add/Sub swap 2 done', hp1);
          Result := True;
        end;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
var
  hp1: tai;
begin
  Result:=false;
  { Final check to see if CMP/MOV pairs can be changed to MOV/CMP }
  while GetNextInstruction(p, hp1) and
    TrySwapMovCmp(p, hp1) do
    begin
      if MatchInstruction(hp1, A_MOV, []) then
        begin
          if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
            begin
              { A little hacky, but since CMP doesn't read the flags, only
                modifies them, it's safe if they get scrambled by MOV -> XOR }
              ExcludeRegFromUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
              Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
              if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                { Used to shrink instruction size }
                PostPeepholeOptXor(hp1);
{$endif x86_64}
              IncludeRegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
            end
          else
            begin
              Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
              if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                { Used to shrink instruction size }
                PostPeepholeOptXor(hp1);
{$endif x86_64}
            end;
        end;
      { Enabling this flag is actually a null operation, but it marks
        the code as 'modified' during this pass }
      Include(OptsToCheck, aoc_ForceNewIteration);
    end;
  { change "cmp $0, %reg" to "test %reg, %reg" }
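  { TEST sets ZF and SF the same way as a comparison with zero, but needs
    no immediate operand: e.g. "testl %eax,%eax" (2 bytes) versus
    "cmpl $0,%eax" (3 bytes) }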
  if MatchOpType(taicpu(p),top_const,top_reg) and
    (taicpu(p).oper[0]^.val = 0) then
    begin
      taicpu(p).opcode := A_TEST;
      taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
      DebugMsg(SPeepholeOptimization + 'Cmp2Test', p);
      Result:=true;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
var
  IsTestConstX, IsValid : Boolean;
  hp1,hp2 : tai;
begin
  Result:=false;
  { Final check to see if TEST/MOV pairs can be changed to MOV/TEST }
  if (taicpu(p).opcode = A_TEST) then
    while GetNextInstruction(p, hp1) and
      TrySwapMovCmp(p, hp1) do
      begin
        if MatchInstruction(hp1, A_MOV, []) then
          begin
            if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
              begin
                { A little hacky, but since TEST doesn't read the flags, only
                  modifies them, it's safe if they get scrambled by MOV -> XOR }
                ExcludeRegFromUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
                Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                  { Used to shrink instruction size }
                  PostPeepholeOptXor(hp1);
{$endif x86_64}
                IncludeRegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
              end
            else
              begin
                Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                  { Used to shrink instruction size }
                  PostPeepholeOptXor(hp1);
{$endif x86_64}
              end;
          end;
        { Enabling this flag is actually a null operation, but it marks
          the code as 'modified' during this pass }
        Include(OptsToCheck, aoc_ForceNewIteration);
      end;
  { If x is a power of 2 (popcnt = 1), change:
      or $x, %reg/ref
    To:
      bts lb(x), %reg/ref
  }
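  { e.g. "orl $0x1000,%eax" (32-bit immediate, 5 bytes) can become
    "btsl $12,%eax" (8-bit immediate, 4 bytes); lb(x) denotes the base-2
    logarithm of x. Sizes are indicative and depend on the operands }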
  if (taicpu(p).opcode = A_OR) and
    IsBTXAcceptable(p) and
    { IsBTXAcceptable checks to see if oper[0] is an immediate }
    (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
    (
      { Don't optimise if a test instruction follows }
      not GetNextInstruction(p, hp1) or
      not MatchInstruction(hp1, A_TEST, [taicpu(p).opsize])
    ) then
    begin
      DebugMsg(SPeepholeOptimization + 'Changed OR $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BTS $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Or2Bts)', p);
      taicpu(p).opcode := A_BTS;
      taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
      Result := True;
      Exit;
    end;
  { If x is a power of 2 (popcnt = 1), change:
      test $x, %reg/ref
      je / sete / cmove (or jne / setne)
    To:
      bt lb(x), %reg/ref
      jnc / setnc / cmovnc (or jc / setc / cmovc)
  }
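  { e.g.:
      testl $0x100,%eax            btl $8,%eax
      jne .Lbl           becomes   jc .Lbl
    since BT copies the tested bit into the carry flag }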
  if (taicpu(p).opcode = A_TEST) and
    (CPUX86_HAS_BTX in cpu_capabilities[current_settings.optimizecputype]) and
    (taicpu(p).oper[0]^.typ = top_const) and
    (
      (cs_opt_size in current_settings.optimizerswitches) or
      (
        (taicpu(p).oper[1]^.typ = top_reg) and
        (CPUX86_HINT_FAST_BT_REG_IMM in cpu_optimization_hints[current_settings.optimizecputype])
      ) or
      (
        (taicpu(p).oper[1]^.typ <> top_reg) and
        (CPUX86_HINT_FAST_BT_MEM_IMM in cpu_optimization_hints[current_settings.optimizecputype])
      )
    ) and
    (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
    { For sizes less than S_L, the byte size is equal or larger with BT,
      so don't bother optimising }
    (taicpu(p).opsize >= S_L) then
    begin
      IsValid := True;
      { Check the next set of instructions, watching the FLAGS register
        and the conditions used }
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
      hp1 := p;
      hp2 := nil;
      while GetNextInstruction(hp1, hp1) do
        begin
          if not Assigned(hp2) then
            { The first instruction after TEST }
            hp2 := hp1;
          if (hp1.typ <> ait_instruction) then
            begin
              { If the flags are no longer in use, everything is fine }
              if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                IsValid := False;
              Break;
            end;
          case taicpu(hp1).condition of
            C_None:
              begin
                if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                  not RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1) then
                  { Something is not quite normal, so play safe and don't change }
                  IsValid := False;
                Break;
              end;
            C_E, C_Z, C_NE, C_NZ:
              { This is fine };
            else
              begin
                { Unsupported condition }
                IsValid := False;
                Break;
              end;
          end;
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
        end;
      if IsValid then
        begin
          while hp2 <> hp1 do
            begin
              case taicpu(hp2).condition of
                C_Z, C_E:
                  taicpu(hp2).condition := C_NC;
                C_NZ, C_NE:
                  taicpu(hp2).condition := C_C;
                else
                  { Should not get here by this point }
                  InternalError(2022110701);
              end;
              GetNextInstruction(hp2, hp2);
            end;
          DebugMsg(SPeepholeOptimization + 'Changed TEST $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BT $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Test2Bt)', p);
          taicpu(p).opcode := A_BT;
          taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
          Result := True;
          Exit;
        end;
    end;
  { removes the line marked with (x) from the sequence
      and/or/xor/add/sub/... $x, %y
      test/or %y, %y | test $-1, %y  (x)
      j(n)z _Label
    as the first instruction already adjusts the ZF;
    %y operand may also be a reference }
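  { e.g.:
      andl $3,%eax              andl $3,%eax
      testl %eax,%eax  becomes  jne .Lbl
      jne .Lbl
    since the AND already set ZF according to its result }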
  IsTestConstX:=(taicpu(p).opcode=A_TEST) and
    MatchOperand(taicpu(p).oper[0]^,-1);
  if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
    GetLastInstruction(p, hp1) and
    (tai(hp1).typ = ait_instruction) and
    GetNextInstruction(p,hp2) and
    MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
    case taicpu(hp1).opcode of
      A_ADD, A_SUB, A_OR, A_XOR, A_AND,
      { These two instructions set the zero flag if the result is zero }
      A_POPCNT, A_LZCNT:
        begin
          if (
            { With POPCNT, an input of zero will set the zero flag
              because the population count of zero is zero }
            (taicpu(hp1).opcode = A_POPCNT) and
            (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) and
            (
              OpsEqual(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^) or
              { Faster than going through the second half of the 'or'
                condition below }
              OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^)
            )
          ) or (
            OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) and
            { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
            { and in case of carry for A(E)/B(E)/C/NC }
            (
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
              (
                (taicpu(hp1).opcode <> A_ADD) and
                (taicpu(hp1).opcode <> A_SUB) and
                (taicpu(hp1).opcode <> A_LZCNT)
              )
            )
          ) then
            begin
              DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (2-op) done', hp1);
              RemoveCurrentP(p, hp2);
              Result:=true;
              Exit;
            end;
        end;
      A_SHL, A_SAL, A_SHR, A_SAR:
        begin
          if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
            { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
            { therefore, it's only safe to do this optimization for }
            { shifts by a (nonzero) constant }
            (taicpu(hp1).oper[0]^.typ = top_const) and
            (taicpu(hp1).oper[0]^.val <> 0) and
            { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
            { and in case of carry for A(E)/B(E)/C/NC }
            (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
            begin
              DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (shift) done', hp1);
              RemoveCurrentP(p, hp2);
              Result:=true;
              Exit;
            end;
        end;
      A_DEC, A_INC, A_NEG:
        begin
          if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
            { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
            { and in case of carry for A(E)/B(E)/C/NC }
            (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
            begin
              DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (1-op) done', hp1);
              RemoveCurrentP(p, hp2);
              Result:=true;
              Exit;
            end;
        end;
      A_ANDN, A_BZHI:
        begin
          if OpsEqual(taicpu(hp1).oper[2]^,taicpu(p).oper[1]^) and
            { Only the zero and sign flags are consistent with what the result is }
            (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE,C_S,C_NS]) then
            begin
              DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (ANDN/BZHI) done', hp1);
              RemoveCurrentP(p, hp2);
              Result:=true;
              Exit;
            end;
        end;
      A_BEXTR:
        begin
          if OpsEqual(taicpu(hp1).oper[2]^,taicpu(p).oper[1]^) and
            { Only the zero flag is set }
            (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
            begin
              DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (BEXTR) done', hp1);
              RemoveCurrentP(p, hp2);
              Result:=true;
              Exit;
            end;
        end;
      else
        ;
    end; { case }
  { change "test $-1,%reg" into "test %reg,%reg" }
  if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
    taicpu(p).loadoper(0,taicpu(p).oper[1]^);
  { Change "or %reg,%reg" to "test %reg,%reg" as OR generates a false dependency }
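  { OR writes its destination even though the value is unchanged, so later
    reads of the register must wait for it; TEST only reads its operands }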
  if MatchInstruction(p, A_OR, []) and
    { Can only match if they're both registers }
    MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) then
    begin
      DebugMsg(SPeepholeOptimization + 'or %reg,%reg -> test %reg,%reg to remove false dependency (Or2Test)', p);
      taicpu(p).opcode := A_TEST;
      { No need to set Result to True, as we've done all the optimisations we can }
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
var
  hp1,hp3 : tai;
{$ifndef x86_64}
  hp2 : taicpu;
{$endif x86_64}
begin
  Result:=false;
  hp3:=nil;
{$ifndef x86_64}
  { don't do this on modern CPUs, as it really hurts them due to
    broken call/ret pairing }
  if (current_settings.optimizecputype < cpu_Pentium2) and
    not(cs_create_pic in current_settings.moduleswitches) and
    GetNextInstruction(p, hp1) and
    MatchInstruction(hp1,A_JMP,[S_NO]) and
    MatchOpType(taicpu(hp1),top_ref) and
    (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
    begin
      hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
      taicpu(hp2).fileinfo := taicpu(p).fileinfo;
      InsertLLItem(p.previous, p, hp2);
      taicpu(p).opcode := A_JMP;
      taicpu(p).is_jmp := true;
      RemoveInstruction(hp1);
      Result:=true;
    end
  else
{$endif x86_64}
  { replace
      call procname
      ret
    by
      jmp procname
    but do it only on level 4 because it destroys stack back traces;
    otherwise, if the subroutine is marked as no-return, remove the RET
  }
  if ((cs_opt_level4 in current_settings.optimizerswitches) or
    (po_noreturn in current_procinfo.procdef.procoptions)) and
    GetNextInstruction(p, hp1) and
    (MatchInstruction(hp1,A_RET,[S_NO]) or
      (MatchInstruction(hp1,A_VZEROUPPER,[S_NO]) and
        SetAndTest(hp1,hp3) and
        GetNextInstruction(hp1,hp1) and
        MatchInstruction(hp1,A_RET,[S_NO])
      )
    ) and
    (taicpu(hp1).ops=0) then
    begin
      if (cs_opt_level4 in current_settings.optimizerswitches) and
        { we might destroy stack alignment here if we do not do a call }
        (target_info.stackalign<=sizeof(SizeUInt)) then
        begin
          taicpu(p).opcode := A_JMP;
          taicpu(p).is_jmp := true;
          DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
        end
      else
        DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
      RemoveInstruction(hp1);
      if Assigned(hp3) then
        begin
          AsmL.Remove(hp3);
          AsmL.InsertBefore(hp3,p)
        end;
      Result:=true;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;

  function ConstInRange(const Val: TCGInt; const OpSize: TOpSize): Boolean;
  begin
    case OpSize of
      S_B, S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
        Result := (Val <= $FF) and (Val >= -128);
      S_W, S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
        Result := (Val <= $FFFF) and (Val >= -32768);
      S_L{$ifdef x86_64}, S_LQ{$endif x86_64}:
        Result := (Val <= $FFFFFFFF) and (Val >= -2147483648);
      else
        Result := True;
    end;
  end;

var
  hp1, hp2 : tai;
  SizeChange: Boolean;
  PreMessage: string;
begin
  Result := False;
  if (taicpu(p).oper[0]^.typ = top_reg) and
    SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
    GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) then
    begin
      { Change (using movzbl %al,%eax as an example):
          movzbl %al, %eax     movzbl %al, %eax
          cmpl   x,   %eax     testl  %eax,%eax
        To:
          cmpb   x,   %al      testb  %al, %al    (Move one back to avoid a false dependency)
          movzbl %al, %eax     movzbl %al, %eax
        Smaller instruction and minimises pipeline stall as the CPU
        doesn't have to wait for the register to get zero-extended. [Kit]
        Also allow if the smaller of the two registers is being checked,
        as this still removes the false dependency.
      }
      if
        (
          (
            (taicpu(hp1).opcode = A_CMP) and MatchOpType(taicpu(hp1), top_const, top_reg) and
            ConstInRange(taicpu(hp1).oper[0]^.val, taicpu(p).opsize)
          ) or (
            { If MatchOperand returns True, they must both be registers }
            (taicpu(hp1).opcode = A_TEST) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)
          )
        ) and
        (reg2opsize(taicpu(hp1).oper[1]^.reg) <= reg2opsize(taicpu(p).oper[1]^.reg)) and
        SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) then
        begin
          PreMessage := debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' -> ' + debug_op2str(taicpu(hp1).opcode);
          asml.Remove(hp1);
          asml.InsertBefore(hp1, p);
          { Swap instructions in the case of cmp 0,%reg or test %reg,%reg }
          if (taicpu(hp1).opcode = A_TEST) or (taicpu(hp1).oper[0]^.val = 0) then
            begin
              taicpu(hp1).opcode := A_TEST;
              taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
            end;
          taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;
          case taicpu(p).opsize of
            S_BW, S_BL:
              begin
                SizeChange := taicpu(hp1).opsize <> S_B;
                taicpu(hp1).changeopsize(S_B);
              end;
            S_WL:
              begin
                SizeChange := taicpu(hp1).opsize <> S_W;
                taicpu(hp1).changeopsize(S_W);
              end
            else
              InternalError(2020112701);
          end;
          UpdateUsedRegs(tai(p.Next));
          { Check if the register is used afterwards - if not, we can
            remove the movzx instruction completely }
          if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
            begin
              { Hp1 is a better position than p for debugging purposes }
              DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4a', hp1);
              RemoveCurrentp(p, hp1);
              Result := True;
            end;
          if SizeChange then
            DebugMsg(SPeepholeOptimization + PreMessage +
              debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (smaller and minimises pipeline stall - MovzxCmp2CmpMovzx)', hp1)
          else
            DebugMsg(SPeepholeOptimization + 'MovzxCmp2CmpMovzx', hp1);
          Exit;
        end;
      { Change (using movzwl %ax,%eax as an example):
          movzwl %ax, %eax
          movb   %al, (dest)    (Register is smaller than read register in movz)
        To:
          movb   %al, (dest)    (Move one back to avoid a false dependency)
          movzwl %ax, %eax
      }
      if (taicpu(hp1).opcode = A_MOV) and
        (taicpu(hp1).oper[0]^.typ = top_reg) and
        not RegInOp(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^) and
        SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
        (reg2opsize(taicpu(hp1).oper[0]^.reg) <= reg2opsize(taicpu(p).oper[0]^.reg)) then
        begin
          DebugMsg(SPeepholeOptimization + 'MovzxMov2MovMovzx', hp1);
          hp2 := tai(hp1.Previous); { Effectively the old position of hp1 }
          asml.Remove(hp1);
          asml.InsertBefore(hp1, p);
          if taicpu(hp1).oper[1]^.typ = top_reg then
            AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
          { Check if the register is used afterwards - if not, we can
            remove the movzx instruction completely }
          if not RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg, p, UsedRegs) then
            begin
              { Hp1 is a better position than p for debugging purposes }
              DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4b', hp1);
              RemoveCurrentp(p, hp1);
              Result := True;
            end;
          Exit;
        end;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
var
  hp1: tai;
{$ifdef x86_64}
  PreMessage, RegName: string;
{$endif x86_64}
begin
  Result := False;
  { If x is a power of 2 (popcnt = 1), change:
      xor $x, %reg/ref
    To:
      btc lb(x), %reg/ref
  }
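  { e.g. "xorl $0x80000,%eax" (32-bit immediate, 5 bytes) can become
    "btcl $19,%eax" (8-bit immediate, 4 bytes), which toggles bit 19
    directly. Sizes are indicative and depend on the operands }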
  if IsBTXAcceptable(p) and
    { IsBTXAcceptable checks to see if oper[0] is an immediate }
    (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
    (
      { Don't optimise if a test instruction follows }
      not GetNextInstruction(p, hp1) or
      not MatchInstruction(hp1, A_TEST, [taicpu(p).opsize])
    ) then
    begin
      DebugMsg(SPeepholeOptimization + 'Changed XOR $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BTC $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Xor2Btc)', p);
      taicpu(p).opcode := A_BTC;
      taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
      Result := True;
      Exit;
    end;
{$ifdef x86_64}
  { Code size reduction by J. Gareth "Kit" Moreton }
  { Change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
    as this removes the REX prefix }
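  { e.g. "xorq %rax,%rax" (48 31 C0 - 3 bytes) becomes "xorl %eax,%eax"
    (31 C0 - 2 bytes); writing to a 32-bit register zero-extends into the
    full 64-bit register, so the result is identical }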
  if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
    Exit;
  if taicpu(p).oper[0]^.typ <> top_reg then
    { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
    InternalError(2018011500);
  case taicpu(p).opsize of
    S_Q:
      begin
        RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
        PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
        { The actual optimization }
        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
        setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
        taicpu(p).changeopsize(S_L);
        RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
        DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (32-bit register recommended when zeroing 64-bit counterpart)', p);
      end;
    else
      ;
  end;
{$endif x86_64}
end;

function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean;
var
  XReg: TRegister;
begin
  Result := False;
  { Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" into "vpxor %xmmreg2,%xmmreg2,%xmmreg1".
    Smaller encoding and slightly faster on some platforms (also works for
    ZMM-sized registers) }
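  { This is safe because VEX-encoded 128-bit instructions zero bits 128
    and above of the destination, so e.g. "vpxor %ymm1,%ymm1,%ymm0" and
    "vpxor %xmm1,%xmm1,%xmm0" both clear all of %ymm0 }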
  if (taicpu(p).opsize in [S_YMM, S_ZMM]) and
    MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then
    begin
      XReg := taicpu(p).oper[0]^.reg;
      if (taicpu(p).oper[1]^.reg = XReg) then
        begin
          taicpu(p).changeopsize(S_XMM);
          setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX);
          if (cs_opt_size in current_settings.optimizerswitches) then
            begin
              { Change input registers to %xmm0 to reduce size. Note that
                there's a risk of a false dependency doing this, so only
                optimise for size here }
              XReg := NR_XMM0;
              DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p);
            end
          else
            begin
              setsubreg(XReg, R_SUBMMX);
              DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p);
            end;
          taicpu(p).oper[0]^.reg := XReg;
          taicpu(p).oper[1]^.reg := XReg;
          Result := True;
        end;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptRET(var p: tai): Boolean;
var
  hp1, p_new: tai;
begin
  Result := False;
  { Check for:
      ret
    .Lbl:
      ret
    Remove first 'ret'
  }
  if GetNextInstruction(p, hp1) and
    { Remember where the label is }
    SetAndTest(hp1, p_new) and
    (hp1.typ in [ait_align, ait_label]) and
    SkipLabels(hp1, hp1) and
    MatchInstruction(hp1, A_RET, []) and
    { To be safe, make sure the RET instructions are identical }
    (taicpu(p).ops = taicpu(hp1).ops) and
    (
      (taicpu(p).ops = 0) or
      (
        (taicpu(p).ops = 1) and
        MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^)
      )
    ) then
    begin
      DebugMsg(SPeepholeOptimization + 'Removed superfluous RET', p);
      UpdateUsedRegs(tai(p.Next));
      RemoveCurrentP(p, p_new);
      Result := True;
      Exit;
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptRORX(var p: tai): Boolean;
begin
  Result := False;
  { Change:                To:
      rorx $x,%reg,%reg      ror $x,%reg
    (Smaller instruction size)
  }
  if MatchOperand(taicpu(p).oper[1]^,taicpu(p).oper[2]^.reg) and
    not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
    begin
      taicpu(p).opcode:=A_ROR;
      taicpu(p).ops:=2;
      taicpu(p).clearop(2);
    end;
end;

function TX86AsmOptimizer.PostPeepholeOptSARXSHLXSHRX(var p: tai): Boolean;
begin
  Result := False;
  { Change:                  To:
      shlx %ecx,%reg,%reg      shl %cl,%reg
    (Smaller instruction size)
    Same with SARX and SHRX (and when using %rcx for 64-bit)
  }
  if (getsupreg(taicpu(p).oper[0]^.reg)=RS_ECX) and
    MatchOperand(taicpu(p).oper[1]^,taicpu(p).oper[2]^.reg) and
    not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
    begin
      case taicpu(p).opcode of
        A_SARX: taicpu(p).opcode:=A_SAR;
        A_SHLX: taicpu(p).opcode:=A_SHL;
        A_SHRX: taicpu(p).opcode:=A_SHR;
        else
          InternalError(2025090501);
      end;
      setsubreg(taicpu(p).oper[0]^.reg, R_SUBL);
      taicpu(p).ops:=2;
      taicpu(p).clearop(2);
    end;
end;

class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
var
  OperIdx: Integer;
begin
  for OperIdx := 0 to p.ops - 1 do
    if p.oper[OperIdx]^.typ = top_ref then
      optimize_ref(p.oper[OperIdx]^.ref^, False);
end;

end.
  16489. end.