aoptx86.pas — 782 KB, 14,308 lines

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
78641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088180891809018091180921809318094180951809618097180981809918100181011810218103181041810518106181071810818109181101811118112181131811418115181161811718118181191812018121181221812318124181251812618127181281812918130181311813218133181341813518136181371813818139181401814118142181431814418145181461814718148181491815018151181521815318154181551815618157181581815918160181611816218163181641816518166181671816818169181701817118172181731817418175181761817718178181791818018181181821818318184181851818618187181881818918190181911819218193181941819518196181971819818199182001820118202182031820418205182061820718208182091821018211182121821318214182151821618217182181821918220182211822218223182241822518226182271822818229182301823118232
{
    Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe

    This unit contains the peephole optimizer.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit aoptx86;

{$i fpcdefs.inc}

{ $define DEBUG_AOPTCPU}

{$ifdef EXTDEBUG}
{$define DEBUG_AOPTCPU}
{$endif EXTDEBUG}

interface

uses
  globtype,cclasses,
  cpubase,
  aasmtai,aasmcpu,
  cgbase,cgutils,
  aopt,aoptobj;

type
  TOptsToCheck = (
    aoc_MovAnd2Mov_3,
    aoc_ForceNewIteration,
    aoc_DoPass2JccOpts,
    aoc_MovlMovq2MovlMovl
  );
  TX86AsmOptimizer = class(TAsmOptimizer)
    { Some optimizations are very expensive to check, so the pre-opt pass
      can be used to set some flags depending on the instructions found,
      indicating whether it is worth checking a certain optimization. }
    OptsToCheck : set of TOptsToCheck;
    function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
    function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
    class function RegReadByInstruction(reg : TRegister; hp : tai) : boolean; static;
    function RegInInstruction(Reg: TRegister; p1: tai): Boolean; override;
    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;

    { Identical to GetNextInstructionUsingReg, but returns a value indicating
      how many instructions away Next is from Current.
      0 = failure, equivalent to False in GetNextInstructionUsingReg }
    function GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
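    { An illustrative sketch (the exact stopping conditions are in the
      implementation below): with reg = NR_EAX and the sequence

          movl $1,%eax      <- Current
          addl %ebx,%ecx
          movl %eax,%edx    <- Next stops here (first use of %eax)

      the function would be expected to set Next to the final MOV and
      return 2, as it is two instructions away from Current. }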
    { This version of GetNextInstructionUsingReg will look across conditional jumps,
      potentially allowing further optimisation (although it might need to know if
      it crossed a conditional jump). }
    function GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;
    {
      In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse
      tracks the use of a register through allocs/deallocs, so it can ignore calls.

      In the following example, GetNextInstructionUsingReg will return the second movq;
      GetNextInstructionUsingRegTrackingUse won't.

          movq %rdi,%rax
          # Register rdi released
          # Register rdi allocated
          movq %rax,%rdi

      While in this example:

          movq %rdi,%rax
          call proc
          movq %rdi,%rax

      GetNextInstructionUsingRegTrackingUse will return the second instruction,
      while GetNextInstructionUsingReg won't.
    }
    function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;

    { Returns True if any of the registers in ref are modified by any
      instruction between p1 and p2, or if those instructions write to the
      reference }
    function RefModifiedBetween(Ref: TReference; RefSize: ASizeInt; p1, p2: tai): Boolean;
  private
    function SkipSimpleInstructions(var hp1: tai): Boolean;
  protected
    class function IsMOVZXAcceptable: Boolean; static; inline;
    function CheckMovMov2MovMov2(const p, hp1: tai): Boolean;

    { Attempts to allocate a volatile integer register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;

    { Attempts to allocate a volatile MM register for use between p and hp,
      using AUsedRegs for the current register usage information. Returns NR_NO
      if no free register could be found }
    function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;

    { Checks whether loading a new value into reg1 overwrites the entirety of reg2 }
    class function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean; static;
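    { For example, writing to EAX overwrites AX entirely (so the result would
      be True for reg1 = NR_EAX, reg2 = NR_AX), whereas writing to AL leaves
      the upper bits of EAX intact (False for reg1 = NR_AL, reg2 = NR_EAX). }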
    { Checks whether reading the value in reg1 depends on the value of reg2. This
      is very similar to SuperRegisterEquals, except it takes into account that
      R_SUBH and R_SUBL are independent (e.g. reading from AL does not
      depend on the value in AH). }
    class function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean; static;
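    { So, for instance, Reg1ReadDependsOnReg2(NR_AL, NR_AH) would be expected
      to return False, while Reg1ReadDependsOnReg2(NR_AX, NR_AH) returns True,
      since AX contains AH. }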
    { Replaces all references to AOldReg in a memory reference with ANewReg }
    class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Replaces all references to AOldReg in an operand with ANewReg }
    class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Replaces all references to AOldReg in an instruction with ANewReg,
      except where the register is being written }
    class function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean; static;

    { Returns True if the reference only refers to ESP or EBP (or their 64-bit equivalents),
      or writes to a global symbol }
    class function IsRefSafe(const ref: PReference): Boolean; static;

    { Returns True if the given MOV instruction can be safely converted to CMOV }
    class function CanBeCMOV(p, cond_p: tai; var RefModified: Boolean) : boolean; static;

    { Like UpdateUsedRegs, but ignores deallocations }
    class procedure UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai); static;

    { Returns True if the given logic instruction can be converted into a BTx instruction (BT itself not included) }
    class function IsBTXAcceptable(p : tai) : boolean; static;

    { Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
      conversion was successful }
    function ConvertLEA(const p : taicpu): Boolean;
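    { For instance, "leal 1(%eax),%eax" can become "incl %eax" and
      "leal -8(%eax),%eax" can become "subl $8,%eax" (a sketch of the intended
      transformations; the exact applicability conditions are checked in the
      implementation). }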
    function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
    function FuncMov2Func(var p: tai; const hp1: tai): Boolean;
{$ifdef x86_64}
    { If a "mov %reg1d,%reg2d; and %reg1d,%reg1d" is found, we can possibly
      replace %reg2q with %reg1q in later instructions }
    function DoZeroUpper32Opt(var mov_p: tai; var and_p: tai): Boolean;
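    { Sketch of the idea: in

          movl %edi,%eax
          andl %edi,%edi
          ...
          movq %rax,%rdx

      both 32-bit operations zero the upper halves of %rax and %rdi, so the
      two registers hold identical 64-bit values and the final instruction
      could potentially become "movq %rdi,%rdx" (illustrative only; the
      function validates the actual pattern). }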
{$endif x86_64}
    procedure DebugMsg(const s : string; p : tai); inline;
    class function IsExitCode(p : tai) : boolean; static;
    class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
    class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
    procedure RemoveLastDeallocForFuncRes(p : tai);
    function DoArithCombineOpt(var p : tai) : Boolean;
    function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
    function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
    function HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
    function PrePeepholeOptSxx(var p : tai) : boolean;
    function PrePeepholeOptIMUL(var p : tai) : boolean;
    function PrePeepholeOptAND(var p : tai) : boolean;
    function OptPass1Test(var p: tai): boolean;
    function OptPass1Add(var p: tai): boolean;
    function OptPass1AND(var p : tai) : boolean;
    function OptPass1CMOVcc(var p: tai): Boolean;
    function OptPass1_V_MOVAP(var p : tai) : boolean;
    function OptPass1VOP(var p : tai) : boolean;
    function OptPass1MOV(var p : tai) : boolean;
    function OptPass1MOVD(var p : tai) : boolean;
    function OptPass1Movx(var p : tai) : boolean;
    function OptPass1MOVXX(var p : tai) : boolean;
{$ifndef i8086}
    function OptPass1NOT(var p : tai) : boolean;
{$endif not i8086}
    function OptPass1OP(var p : tai) : boolean;
    function OptPass1LEA(var p : tai) : boolean;
    function OptPass1Sub(var p : tai) : boolean;
    function OptPass1SHLSAL(var p : tai) : boolean;
    function OptPass1SHR(var p : tai) : boolean;
    function OptPass1FSTP(var p : tai) : boolean;
    function OptPass1FLD(var p : tai) : boolean;
    function OptPass1Cmp(var p : tai) : boolean;
    function OptPass1PXor(var p : tai) : boolean;
    function OptPass1VPXor(var p: tai): boolean;
    function OptPass1Imul(var p : tai) : boolean;
    function OptPass1Jcc(var p : tai) : boolean;
    function OptPass1SHXX(var p: tai): boolean;
    function OptPass1VMOVDQ(var p: tai): Boolean;
    function OptPass1_V_Cvtss2sd(var p: tai): boolean;
    function OptPass1STCCLC(var p: tai): Boolean;
    function OptPass2STCCLC(var p: tai): Boolean;
    function OptPass2CMOVcc(var p: tai): Boolean;
    function OptPass2Movx(var p : tai): Boolean;
    function OptPass2MOV(var p : tai) : boolean;
    function OptPass2Imul(var p : tai) : boolean;
    function OptPass2Jmp(var p : tai) : boolean;
    function OptPass2Jcc(var p : tai) : boolean;
    function OptPass2Lea(var p: tai): Boolean;
    function OptPass2SUB(var p: tai): Boolean;
    function OptPass2ADD(var p : tai): Boolean;
    function OptPass2SETcc(var p : tai) : boolean;
    function OptPass2Cmp(var p: tai): Boolean;
    function OptPass2Test(var p: tai): Boolean;
    function CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
    function PostPeepholeOptMov(var p : tai) : Boolean;
    function PostPeepholeOptMovzx(var p : tai) : Boolean;
    function PostPeepholeOptXor(var p : tai) : Boolean;
    function PostPeepholeOptAnd(var p : tai) : boolean;
    function PostPeepholeOptMOVSX(var p : tai) : boolean;
    function PostPeepholeOptCmp(var p : tai) : Boolean;
    function PostPeepholeOptTestOr(var p : tai) : Boolean;
    function PostPeepholeOptCall(var p : tai) : Boolean;
    function PostPeepholeOptLea(var p : tai) : Boolean;
    function PostPeepholeOptPush(var p: tai): Boolean;
    function PostPeepholeOptShr(var p : tai) : boolean;
    function PostPeepholeOptADDSUB(var p : tai) : Boolean;
    function PostPeepholeOptVPXOR(var p: tai): Boolean;
    function PostPeepholeOptRET(var p: tai): Boolean;
    function PostPeepholeOptRORX(var p: tai): Boolean;
    function PostPeepholeOptSARXSHLXSHRX(var p: tai): Boolean;
    procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
    function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
    function TrySwapMovOp(var p, hp1: tai): Boolean;
    function TrySwapMovCmp(var p, hp1: tai): Boolean;
    function TryCmpCMovOpts(var p, hp1: tai) : Boolean;
    function TryJccStcClcOpt(var p, hp1: tai): Boolean;

    { Processor-dependent reference optimisation }
    class procedure OptimizeRefs(var p: taicpu); static;
  end;
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;

function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{$if max_operands>2}
function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
{$endif max_operands>2}

function RefsEqual(const r1, r2: treference): boolean;

{ Like RefsEqual, but doesn't compare the offsets }
function RefsAlmostEqual(const r1, r2: treference): boolean;

{ Note that Result is set to True if the references COULD overlap but the
  compiler cannot be sure (e.g. "(%reg1)" and "4(%reg2)" with a range of 4
  might still overlap, because %reg2 could be equal to %reg1-4). }
function RefsMightOverlap(const r1, r2: treference; const Range: asizeint): boolean;

function MatchReference(const ref : treference; base,index : TRegister) : Boolean;

{ Returns True if ref is a reference using only the registers passed as base
  and index, and having an offset }
function MatchReferenceWithOffset(const ref : treference; base,index : TRegister) : Boolean;
implementation

uses
  cutils,verbose,
  systems,
  globals,
  cpuinfo,
  procinfo,
  paramgr,
  aasmbase,
  aoptbase,aoptutils,
  symconst,symsym,
  cgx86,
  itcpugas;
{$ifndef i8086}
const
  MAX_CMOV_INSTRUCTIONS = 4;
  MAX_CMOV_REGISTERS = 8;

type
  TCMovTrackingState = (tsInvalid, tsSimple, tsDetour, tsBranching,
    tsDouble, tsDoubleBranchSame, tsDoubleBranchDifferent, tsDoubleSecondBranching,
    tsProcessed);

  { For OptPass2Jcc }
  TCMOVTracking = object
  private
    CMOVScore, ConstCount: LongInt;
    RegWrites: array[0..MAX_CMOV_INSTRUCTIONS*2 - 1] of TRegister;
    ConstRegs: array[0..MAX_CMOV_REGISTERS - 1] of TRegister;
    ConstVals: array[0..MAX_CMOV_REGISTERS - 1] of TCGInt;
    ConstSizes: array[0..MAX_CMOV_REGISTERS - 1] of TSubRegister; { May not match ConstRegs if one is shared over multiple CMOVs. }
    ConstMovs: array[0..MAX_CMOV_REGISTERS - 1] of tai; { Location of the initialisation instruction }
    ConstWriteSizes: array[0..first_int_imreg - 1] of TSubRegister; { Largest size of register written. }
    fOptimizer: TX86AsmOptimizer;
    fLabel: TAsmSymbol;
    fInsertionPoint,
    fCondition,
    fInitialJump,
    fFirstMovBlock,
    fFirstMovBlockStop,
    fSecondJump,
    fThirdJump,
    fSecondMovBlock,
    fSecondMovBlockStop,
    fMidLabel,
    fEndLabel,
    fAllocationRange: tai;
    fState: TCMovTrackingState;
    function TryCMOVConst(p, start, stop: tai; var Count: LongInt): Boolean;
    function InitialiseBlock(BlockStart, OneBeforeBlock: tai; out BlockStop: tai; out EndJump: tai): Boolean;
    function AnalyseMOVBlock(BlockStart, BlockStop, SearchStart: tai): LongInt;
  public
    RegisterTracking: TAllUsedRegs;
    constructor Init(Optimizer: TX86AsmOptimizer; var p_initialjump, p_initialmov: tai; var AFirstLabel: TAsmLabel);
    destructor Done;
    procedure Process(out new_p: tai);
    property State: TCMovTrackingState read fState;
  end;

  PCMOVTracking = ^TCMOVTracking;
{$endif i8086}
{$ifdef DEBUG_AOPTCPU}
const
  SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
{ Empty strings help the optimizer to remove string concatenations that won't
  ever appear to the user on release builds. [Kit] }
const
  SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
  LIST_STEP_SIZE = 4;
type
  TJumpTrackingItem = class(TLinkedListItem)
  private
    FSymbol: TAsmSymbol;
    FRefs: LongInt;
  public
    constructor Create(ASymbol: TAsmSymbol);
    procedure IncRefs; {$ifdef USEINLINE}inline;{$endif USEINLINE}
    property Symbol: TAsmSymbol read FSymbol;
    property Refs: LongInt read FRefs;
  end;

constructor TJumpTrackingItem.Create(ASymbol: TAsmSymbol);
begin
  inherited Create;
  FSymbol := ASymbol;
  FRefs := 0;
end;

procedure TJumpTrackingItem.IncRefs; {$ifdef USEINLINE}inline;{$endif USEINLINE}
begin
  Inc(FRefs);
end;
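{ The tracking items effectively act as per-label reference counters: each
  conditional jump to a label bumps Refs for that label's entry, and a label
  is considered fully accounted for once the recorded Refs equals the
  symbol's total reference count (see LabelAccountedFor inside
  GetNextInstructionUsingRegCond below). A rough sketch:

      jne .L1     <- TrackJump(.L1): Refs becomes 1
      ...
    .L1:          <- LabelAccountedFor(.L1) succeeds if .L1 has exactly
                     one reference, so tracking may continue past it }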
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    (taicpu(instr).opcode = op) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;

function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    ((taicpu(instr).opcode = op1) or
     (taicpu(instr).opcode = op2)
    ) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;

function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
begin
  result :=
    (instr.typ = ait_instruction) and
    ((taicpu(instr).opcode = op1) or
     (taicpu(instr).opcode = op2) or
     (taicpu(instr).opcode = op3)
    ) and
    ((opsize = []) or (taicpu(instr).opsize in opsize));
end;

function MatchInstruction(const instr : tai; const ops : array of TAsmOp;
  const opsize : topsizes) : boolean;
var
  op : TAsmOp;
begin
  result:=false;
  if (instr.typ <> ait_instruction) or
     ((opsize <> []) and not(taicpu(instr).opsize in opsize)) then
    exit;
  for op in ops do
    begin
      if taicpu(instr).opcode = op then
        begin
          result:=true;
          exit;
        end;
    end;
end;
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
begin
  result := (oper.typ = top_reg) and (oper.reg = reg);
end;

function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
begin
  result := (oper.typ = top_const) and (oper.val = a);
end;

function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
begin
  result := oper1.typ = oper2.typ;
  if result then
    case oper1.typ of
      top_const:
        Result:=oper1.val = oper2.val;
      top_reg:
        Result:=oper1.reg = oper2.reg;
      top_ref:
        Result:=RefsEqual(oper1.ref^, oper2.ref^);
      else
        internalerror(2013102801);
    end
end;

function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
begin
  result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);
  if result then
    case oper1.typ of
      top_const:
        Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
      top_reg:
        Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
      top_ref:
        Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
      else
        internalerror(2020052401);
    end
end;
function RefsEqual(const r1, r2: treference): boolean;
begin
  RefsEqual :=
    (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
    (r1.relsymbol = r2.relsymbol) and
    (r1.segment = r2.segment) and (r1.base = r2.base) and
    (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
    (r1.offset = r2.offset) and
    (r1.volatility + r2.volatility = []);
end;

function RefsAlmostEqual(const r1, r2: treference): boolean;
begin
  RefsAlmostEqual :=
    (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
    (r1.relsymbol = r2.relsymbol) and
    (r1.segment = r2.segment) and (r1.base = r2.base) and
    (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
    { Don't compare the offsets }
    (r1.volatility + r2.volatility = []);
end;

function RefsMightOverlap(const r1, r2: treference; const Range: asizeint): boolean;
begin
  if (r1.symbol<>r2.symbol) then
    { If the index registers are different, there's a chance one could
      be set so it equals the other symbol }
    Exit((r1.index<>r2.index) or (r1.scalefactor<>r2.scalefactor));
  if (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
     (r1.relsymbol = r2.relsymbol) and
     (r1.segment = r2.segment) and (r1.base = r2.base) and
     (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
     (r1.volatility + r2.volatility = []) then
    { In this case, it all depends on the offsets }
    Exit(abs(r1.offset - r2.offset) < Range);
  { There's a chance things MIGHT overlap, so take no chances }
  Result := True;
end;
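{ Worked example: for "4(%rsp)" and "12(%rsp)" with Range = 4, every field
  except the offset matches, so the result is abs(4 - 12) < 4, i.e. False
  (no overlap); with Range = 16 the same pair yields True, since the two
  accessed blocks could overlap. }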
function MatchReference(const ref : treference; base,index : TRegister) : Boolean;
begin
  Result:=(ref.offset=0) and
    (ref.scalefactor in [0,1]) and
    (ref.segment=NR_NO) and
    (ref.symbol=nil) and
    (ref.relsymbol=nil) and
    ((base=NR_INVALID) or
     (ref.base=base)) and
    ((index=NR_INVALID) or
     (ref.index=index)) and
    (ref.volatility=[]);
end;

function MatchReferenceWithOffset(const ref : treference; base,index : TRegister) : Boolean;
begin
  Result:=(ref.scalefactor in [0,1]) and
    (ref.segment=NR_NO) and
    (ref.symbol=nil) and
    (ref.relsymbol=nil) and
    ((base=NR_INVALID) or
     (ref.base=base)) and
    ((index=NR_INVALID) or
     (ref.index=index)) and
    (ref.volatility=[]);
end;
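{ Example: for ref = "8(%ebx,%esi)", MatchReferenceWithOffset(ref, NR_EBX, NR_ESI)
  returns True, while MatchReference returns False because of the non-zero
  offset. Passing NR_INVALID for base or index acts as a wildcard for that
  field in both functions. }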
function InstrReadsFlags(p: tai): boolean;
begin
  InstrReadsFlags := true;
  case p.typ of
    ait_instruction:
      if InsProp[taicpu(p).opcode].Ch*
         [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
          Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
          Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
        exit;
    ait_label:
      exit;
    else
      ;
  end;
  InstrReadsFlags := false;
end;
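{ For instance, ADC and SBB read the carry flag, and conditional instructions
  read the flags for their condition (Ch_RFLAGScc), so all of them leave the
  initial True in place, while a plain MOV falls through to the final False.
  Labels also return True, since control flow may reach them from elsewhere
  with the flags still live. }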
function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
begin
  Next:=Current;
  repeat
    Result:=GetNextInstruction(Next,Next);
  until not (Result) or
    not(cs_opt_level3 in current_settings.optimizerswitches) or
    (Next.typ<>ait_instruction) or
    RegInInstruction(reg,Next) or
    is_calljmp(taicpu(Next).opcode);
end;

function TX86AsmOptimizer.GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
var
  GetNextResult: Boolean;
begin
  Result:=0;
  Next:=Current;
  repeat
    GetNextResult := GetNextInstruction(Next,Next);
    if GetNextResult then
      Inc(Result)
    else
      { Must return zero upon hitting the end of the linked list without a match }
      Result := 0;
  until not (GetNextResult) or
    not(cs_opt_level3 in current_settings.optimizerswitches) or
    (Next.typ<>ait_instruction) or
    RegInInstruction(reg,Next) or
    is_calljmp(taicpu(Next).opcode);
end;
function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;

  procedure TrackJump(Symbol: TAsmSymbol);
  var
    Search: TJumpTrackingItem;
  begin
    { See if an entry already exists in our jump tracking list
      (faster to search backwards due to the higher chance of
      matching destinations) }
    Search := TJumpTrackingItem(JumpTracking.Last);
    while Assigned(Search) do
      begin
        if Search.Symbol = Symbol then
          begin
            { Found it - remove it so it can be pushed to the front }
            JumpTracking.Remove(Search);
            Break;
          end;
        Search := TJumpTrackingItem(Search.Previous);
      end;
    if not Assigned(Search) then
      Search := TJumpTrackingItem.Create(Symbol);
    JumpTracking.Concat(Search);
    Search.IncRefs;
  end;
  function LabelAccountedFor(Symbol: TAsmSymbol): Boolean;
  var
    Search: TJumpTrackingItem;
  begin
    Result := False;
    { See if this label appears in the tracking list }
    Search := TJumpTrackingItem(JumpTracking.Last);
    while Assigned(Search) do
      begin
        if Search.Symbol = Symbol then
          begin
            { Found it - let's see what we can discover }
            if Search.Symbol.getrefs = Search.Refs then
              begin
                { Success - all the references are accounted for }
                JumpTracking.Remove(Search);
                Search.Free;
                { It is logically impossible for CrossJump to be False here,
                  because we must have run into a conditional jump for
                  this label at some point }
                if not CrossJump then
                  InternalError(2022041710);
                if JumpTracking.First = nil then
                  { Tracking list is now empty - no more cross jumps }
                  CrossJump := False;
                Result := True;
                Exit;
              end;
            { If the references don't match, it's possible to enter
              this label through other means, so drop out }
            Exit;
          end;
        Search := TJumpTrackingItem(Search.Previous);
      end;
  end;
var
  Next_Label: tai;
begin
  { Note: CrossJump keeps its input value if a conditional jump is not found - it doesn't get set to False }
  Next := Current;
  repeat
    Result := GetNextInstruction(Next,Next);
    if not Result then
      Break;
    if (Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) then
      if is_calljmpuncondret(taicpu(Next).opcode) then
        begin
          if (taicpu(Next).opcode = A_JMP) and
            { Remove dead code now to save time }
            RemoveDeadCodeAfterJump(taicpu(Next)) then
            { A jump was removed, but not the current instruction, and
              Result doesn't necessarily translate into an optimisation
              routine's Result, so use the "Force New Iteration" flag to
              mark a new pass }
            Include(OptsToCheck, aoc_ForceNewIteration);
          if not Assigned(JumpTracking) then
            begin
              { Cross-label optimisations often cause other optimisations
                to perform worse because they're not given the chance to
                optimise locally. In this case, don't do the cross-label
                optimisations yet, but flag them as a possibility for the
                next iteration of Pass 1 }
              if not NotFirstIteration then
                Include(OptsToCheck, aoc_ForceNewIteration);
            end
          else if IsJumpToLabel(taicpu(Next)) and
            GetNextInstruction(Next, Next_Label) then
            begin
              { If we have JMP .lbl, and the label after it has all of its
                references tracked, then this is probably an if-else style of
                block and we can keep tracking. If the label for this jump
                then appears later and is fully tracked, then it's the end
                of the if-else blocks and the code paths converge (thus
                marking the end of the cross-jump) }
              if (Next_Label.typ = ait_label) then
                begin
                  if LabelAccountedFor(tai_label(Next_Label).labsym) then
                    begin
                      TrackJump(JumpTargetOp(taicpu(Next))^.ref^.symbol);
                      Next := Next_Label;
                      { CrossJump gets set to False by LabelAccountedFor if the
                        list is completely emptied (as it indicates that all
                        code paths have converged). We could avoid this nuance
                        by moving the TrackJump call to before the
                        LabelAccountedFor call, but this is slower in situations
                        where LabelAccountedFor would return False, due to the
                        creation of a new object that is not used and is
                        destroyed soon after. }
                      CrossJump := True;
                      Continue;
                    end;
                end
              else if (Next_Label.typ <> ait_marker) then
                { We just did a RemoveDeadCodeAfterJump, so we either find
                  a label, the end of the procedure or some kind of marker }
                InternalError(2022041720);
            end;
          Result := False;
          Exit;
        end
      else
        begin
          if not Assigned(JumpTracking) then
            begin
              { Cross-label optimisations often cause other optimisations
                to perform worse because they're not given the chance to
                optimise locally. In this case, don't do the cross-label
                optimisations yet, but flag them as a possibility for the
                next iteration of Pass 1 }
              if not NotFirstIteration then
                Include(OptsToCheck, aoc_ForceNewIteration);
            end
          else if IsJumpToLabel(taicpu(Next)) then
            TrackJump(JumpTargetOp(taicpu(Next))^.ref^.symbol)
          else
            { Conditional jumps should always be a jump to a label }
            InternalError(2022041701);
          CrossJump := True;
          Continue;
        end;
    if Next.typ = ait_label then
      begin
        if not Assigned(JumpTracking) then
          begin
            { Cross-label optimisations often cause other optimisations
              to perform worse because they're not given the chance to
              optimise locally. In this case, don't do the cross-label
              optimisations yet, but flag them as a possibility for the
              next iteration of Pass 1 }
            if not NotFirstIteration then
              Include(OptsToCheck, aoc_ForceNewIteration);
          end
        else if LabelAccountedFor(tai_label(Next).labsym) then
          Continue;
        { If we reach here, we're at a label that hasn't been seen before
          (or JumpTracking was nil) }
        Break;
      end;
  until not Result or
    not (cs_opt_level3 in current_settings.optimizerswitches) or
    not (Next.typ in [ait_label, ait_instruction]) or
    RegInInstruction(reg,Next);
end;
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
begin
  if not(cs_opt_level3 in current_settings.optimizerswitches) then
    begin
      Result:=GetNextInstruction(Current,Next);
      exit;
    end;
  Next:=tai(Current.Next);
  Result:=false;
  while assigned(Next) do
    begin
      if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
         ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
         ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
        exit
      else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
        begin
          Result:=true;
          exit;
        end;
      Next:=tai(Next.Next);
    end;
end;
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;
begin
  Result:=RegReadByInstruction(reg,hp);
end;
class function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
var
  p: taicpu;
  opcount: longint;
begin
  RegReadByInstruction := false;
  if hp.typ <> ait_instruction then
    exit;
  p := taicpu(hp);
  case p.opcode of
    A_CALL:
      regreadbyinstruction := true;
    A_IMUL:
      case p.ops of
        1:
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
             ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
             ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
            );
        2,3:
          regReadByInstruction :=
            reginop(reg,p.oper[0]^) or
            reginop(reg,p.oper[1]^);
        else
          InternalError(2019112801);
      end;
    A_MUL:
      begin
        regReadByInstruction := RegInOp(reg,p.oper[0]^) or
          (
           ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
           ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
          );
      end;
    A_IDIV,A_DIV:
      begin
        regReadByInstruction := RegInOp(reg,p.oper[0]^) or
          (
           (getregtype(reg)=R_INTREGISTER) and
           (
            (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
           )
          );
      end;
    else
      begin
        if (p.opcode=A_LEA) and is_segment_reg(reg) then
          begin
            RegReadByInstruction := false;
            exit;
          end;
        for opcount := 0 to p.ops-1 do
          if (p.oper[opCount]^.typ = top_ref) and
             RegInRef(reg,p.oper[opcount]^.ref^) then
            begin
              RegReadByInstruction := true;
              exit
            end;
        { special handling for SSE MOVSD }
        if (p.opcode=A_MOVSD) and (p.ops>0) then
          begin
            if p.ops<>2 then
              internalerror(2017042702);
            regReadByInstruction := reginop(reg,p.oper[0]^) or
              (
               (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
              );
            exit;
          end;
        with insprop[p.opcode] do
          begin
            case getregtype(reg) of
              R_INTREGISTER:
                begin
                  case getsupreg(reg) of
                    RS_EAX:
                      if [Ch_REAX,Ch_RWEAX,Ch_MEAX,Ch_WRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ECX:
                      if [Ch_RECX,Ch_RWECX,Ch_MECX,Ch_WRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDX:
                      if [Ch_REDX,Ch_RWEDX,Ch_MEDX,Ch_WRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBX:
                      if [Ch_REBX,Ch_RWEBX,Ch_MEBX,Ch_WRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESP:
                      if [Ch_RESP,Ch_RWESP,Ch_MESP,Ch_WRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBP:
                      if [Ch_REBP,Ch_RWEBP,Ch_MEBP,Ch_WRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESI:
                      if [Ch_RESI,Ch_RWESI,Ch_MESI,Ch_WRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDI:
                      if [Ch_REDI,Ch_RWEDI,Ch_MEDI,Ch_WRDI,Ch_RWRDI,Ch_MRDI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                  end;
                end;
              R_MMREGISTER:
                begin
                  case getsupreg(reg) of
                    RS_XMM0:
                      if [Ch_RXMM0,Ch_RWXMM0,Ch_MXMM0]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                  end;
                end;
              else
                ;
            end;
            if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
              begin
                if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                  begin
                    case p.condition of
                      C_A,C_NBE,      { CF=0 and ZF=0 }
                      C_BE,C_NA:      { CF=1 or ZF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                      C_AE,C_NB,C_NC, { CF=0 }
                      C_B,C_NAE,C_C:  { CF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                      C_NE,C_NZ,      { ZF=0 }
                      C_E,C_Z:        { ZF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                      C_G,C_NLE,      { ZF=0 and SF=OF }
                      C_LE,C_NG:      { ZF=1 or SF<>OF }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                      C_GE,C_NL,      { SF=OF }
                      C_L,C_NGE:      { SF<>OF }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                      C_NO,           { OF=0 }
                      C_O:            { OF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                      C_NP,C_PO,      { PF=0 }
                      C_P,C_PE:       { PF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                      C_NS,           { SF=0 }
                      C_S:            { SF=1 }
                        RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                      else
                        internalerror(2017042701);
                    end;
                    if RegReadByInstruction then
                      exit;
                  end;
                case getsubreg(reg) of
                  R_SUBW,R_SUBD,R_SUBQ:
                    RegReadByInstruction :=
                      [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                       Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                       Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                  R_SUBFLAGCARRY:
                    RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                  R_SUBFLAGPARITY:
                    RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                  R_SUBFLAGAUXILIARY:
                    RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                  R_SUBFLAGZERO:
  885. RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
  886. R_SUBFLAGSIGN:
  887. RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
  888. R_SUBFLAGOVERFLOW:
  889. RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
  890. R_SUBFLAGINTERRUPT:
  891. RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
  892. R_SUBFLAGDIRECTION:
  893. RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
  894. else
  895. internalerror(2017042601);
  896. end;
  897. exit;
  898. end;
  899. if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
  900. (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
  901. (p.oper[0]^.reg=p.oper[1]^.reg) then
  902. exit;
  903. if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
  904. begin
  905. RegReadByInstruction := true;
  906. exit
  907. end;
  908. if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
  909. begin
  910. RegReadByInstruction := true;
  911. exit
  912. end;
  913. if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
  914. begin
  915. RegReadByInstruction := true;
  916. exit
  917. end;
  918. if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
  919. begin
  920. RegReadByInstruction := true;
  921. exit
  922. end;
  923. end;
  924. end;
  925. end;
  926. end;
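{ Returns True if reg is read, written or modified by p1, either as an
  explicit operand or implicitly: EAX/EDX for MUL, DIV, IDIV and
  single-operand IMUL, XMM0, the individual flag bits, or the FPU register
  stack for FPU instructions. }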
  927. function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
  928. begin
  929. result:=false;
  930. if p1.typ<>ait_instruction then
  931. exit;
  932. if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
  933. exit(true);
  934. if (getregtype(reg)=R_INTREGISTER) and
935. { the change information for xmm movsd is not correct }
  936. ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
  937. begin
  938. { Handle instructions that behave differently depending on the size and operand count }
  939. case taicpu(p1).opcode of
  940. A_MUL, A_DIV, A_IDIV:
  941. if taicpu(p1).opsize = S_B then
  942. Result := (getsupreg(Reg) = RS_EAX)
  943. else
  944. Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
  945. A_IMUL:
  946. if taicpu(p1).ops = 1 then
  947. begin
  948. if taicpu(p1).opsize = S_B then
  949. Result := (getsupreg(Reg) = RS_EAX)
  950. else
  951. Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
  952. end;
953. { IMUL with more than one operand is handled by the inherited method at the end }
  954. else
  955. case getsupreg(reg) of
  956. { RS_EAX = RS_RAX on x86-64 }
  957. RS_EAX:
  958. result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
  959. RS_ECX:
  960. result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
  961. RS_EDX:
  962. result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
  963. RS_EBX:
  964. result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
  965. RS_ESP:
  966. result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
  967. RS_EBP:
  968. result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
  969. RS_ESI:
  970. result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
  971. RS_EDI:
  972. result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
  973. else
  974. ;
  975. end;
  976. end;
  977. if result then
  978. exit;
  979. end
  980. else if getregtype(reg)=R_MMREGISTER then
  981. begin
  982. case getsupreg(reg) of
  983. RS_XMM0:
  984. result:=([Ch_RXMM0,Ch_WXMM0,Ch_RWXMM0,Ch_MXMM0]*insprop[taicpu(p1).opcode].Ch)<>[];
  985. else
  986. ;
  987. end;
  988. if result then
  989. exit;
  990. end
  991. else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
  992. begin
  993. if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
  994. exit(true);
  995. case getsubreg(reg) of
  996. R_SUBFLAGCARRY:
  997. Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  998. R_SUBFLAGPARITY:
  999. Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  1000. R_SUBFLAGAUXILIARY:
  1001. Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  1002. R_SUBFLAGZERO:
  1003. Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  1004. R_SUBFLAGSIGN:
  1005. Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  1006. R_SUBFLAGOVERFLOW:
  1007. Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
  1008. R_SUBFLAGINTERRUPT:
  1009. Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
  1010. R_SUBFLAGDIRECTION:
  1011. Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
  1012. R_SUBW,R_SUBD,R_SUBQ:
1013. { Everything except the direction and interrupt bits }
  1014. Result:=
  1015. ([Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
  1016. Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
  1017. Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
  1018. Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
  1019. Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
  1020. Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag
  1021. ]*insprop[taicpu(p1).opcode].Ch)<>[];
  1022. else
  1023. ;
  1024. end;
  1025. if result then
  1026. exit;
  1027. end
  1028. else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
  1029. exit(true);
  1030. Result:=inherited RegInInstruction(Reg, p1);
  1031. end;
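{ Returns True if p1 potentially writes to any part of reg; CALL is always
  assumed to modify it, and the implicit outputs of MUL/IMUL/DIV/IDIV and
  the register forms of MOVSD/VMOVSS/VMOVSD are handled explicitly. }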
  1032. function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
  1033. const
  1034. WriteOps: array[0..3] of set of TInsChange =
  1035. ([CH_RWOP1,CH_WOP1,CH_MOP1],
  1036. [Ch_RWOP2,Ch_WOP2,Ch_MOP2],
  1037. [Ch_RWOP3,Ch_WOP3,Ch_MOP3],
  1038. [Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
  1039. var
  1040. OperIdx: Integer;
  1041. begin
  1042. Result := False;
  1043. if p1.typ <> ait_instruction then
  1044. exit;
  1045. with insprop[taicpu(p1).opcode] do
  1046. if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
  1047. begin
  1048. case getsubreg(reg) of
  1049. R_SUBW,R_SUBD,R_SUBQ:
  1050. Result :=
  1051. [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
  1052. Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
  1053. Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
  1054. Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
  1055. Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
  1056. Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1057. R_SUBFLAGCARRY:
  1058. Result:=[Ch_WCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WUCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1059. R_SUBFLAGPARITY:
  1060. Result:=[Ch_WParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WUParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1061. R_SUBFLAGAUXILIARY:
  1062. Result:=[Ch_WAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1063. R_SUBFLAGZERO:
  1064. Result:=[Ch_WZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WUZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1065. R_SUBFLAGSIGN:
  1066. Result:=[Ch_WSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WUSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1067. R_SUBFLAGOVERFLOW:
  1068. Result:=[Ch_WOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WUOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1069. R_SUBFLAGINTERRUPT:
  1070. Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1071. R_SUBFLAGDIRECTION:
  1072. Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
  1073. else
  1074. internalerror(2017042602);
  1075. end;
  1076. exit;
  1077. end;
  1078. case taicpu(p1).opcode of
  1079. A_CALL:
  1080. { We could potentially set Result to False if the register in
  1081. question is non-volatile for the subroutine's calling convention,
  1082. but this would require detecting the calling convention in use and
  1083. also assuming that the routine doesn't contain malformed assembly
  1084. language, for example... so it could only be done under -O4 as it
  1085. would be considered a side-effect. [Kit] }
  1086. Result := True;
  1087. A_MOVSD:
  1088. { special handling for SSE MOVSD }
  1089. if (taicpu(p1).ops>0) then
  1090. begin
  1091. if taicpu(p1).ops<>2 then
  1092. internalerror(2017042703);
  1093. Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
  1094. end;
1095. { VMOVSS and VMOVSD have two- and three-operand flavours; this cannot be
1096. modelled by x86ins.dat, so fix it here (FK)
1097. }
  1098. A_VMOVSS,
  1099. A_VMOVSD:
  1100. begin
  1101. Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
  1102. exit;
  1103. end;
  1104. A_MUL, A_DIV, A_IDIV:
  1105. begin
  1106. if taicpu(p1).opsize = S_B then
  1107. Result := (getsupreg(Reg) = RS_EAX)
  1108. else
  1109. Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
  1110. end;
  1111. A_IMUL:
  1112. begin
  1113. if taicpu(p1).ops = 1 then
  1114. begin
  1115. Result := (getsupreg(Reg) in [RS_EAX, RS_EDX]);
  1116. end
  1117. else
  1118. Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
  1119. Exit;
  1120. end;
  1121. else
  1122. ;
  1123. end;
  1124. if Result then
  1125. exit;
  1126. with insprop[taicpu(p1).opcode] do
  1127. begin
  1128. if getregtype(reg)=R_INTREGISTER then
  1129. begin
  1130. case getsupreg(reg) of
  1131. RS_EAX:
  1132. if [Ch_WEAX,Ch_RWEAX,Ch_MEAX,Ch_WRAX,Ch_RWRAX,Ch_MRAX]*Ch<>[] then
  1133. begin
  1134. Result := True;
  1135. exit
  1136. end;
  1137. RS_ECX:
  1138. if [Ch_WECX,Ch_RWECX,Ch_MECX,Ch_WRCX,Ch_RWRCX,Ch_MRCX]*Ch<>[] then
  1139. begin
  1140. Result := True;
  1141. exit
  1142. end;
  1143. RS_EDX:
  1144. if [Ch_WEDX,Ch_RWEDX,Ch_MEDX,Ch_WRDX,Ch_RWRDX,Ch_MRDX]*Ch<>[] then
  1145. begin
  1146. Result := True;
  1147. exit
  1148. end;
  1149. RS_EBX:
  1150. if [Ch_WEBX,Ch_RWEBX,Ch_MEBX,Ch_WRBX,Ch_RWRBX,Ch_MRBX]*Ch<>[] then
  1151. begin
  1152. Result := True;
  1153. exit
  1154. end;
  1155. RS_ESP:
  1156. if [Ch_WESP,Ch_RWESP,Ch_MESP,Ch_WRSP,Ch_RWRSP,Ch_MRSP]*Ch<>[] then
  1157. begin
  1158. Result := True;
  1159. exit
  1160. end;
  1161. RS_EBP:
  1162. if [Ch_WEBP,Ch_RWEBP,Ch_MEBP,Ch_WRBP,Ch_RWRBP,Ch_MRBP]*Ch<>[] then
  1163. begin
  1164. Result := True;
  1165. exit
  1166. end;
  1167. RS_ESI:
  1168. if [Ch_WESI,Ch_RWESI,Ch_MESI,Ch_WRSI,Ch_RWRSI,Ch_MRSI]*Ch<>[] then
  1169. begin
  1170. Result := True;
  1171. exit
  1172. end;
  1173. RS_EDI:
  1174. if [Ch_WEDI,Ch_RWEDI,Ch_MEDI,Ch_WRDI,Ch_RWRDI,Ch_MRDI]*Ch<>[] then
  1175. begin
  1176. Result := True;
  1177. exit
  1178. end;
  1179. end;
  1180. end;
  1181. for OperIdx := 0 to taicpu(p1).ops - 1 do
  1182. if (WriteOps[OperIdx]*Ch<>[]) and
  1183. { The register doesn't get modified inside a reference }
  1184. (taicpu(p1).oper[OperIdx]^.typ = top_reg) and
  1185. SuperRegistersEqual(reg,taicpu(p1).oper[OperIdx]^.reg) then
  1186. begin
  1187. Result := true;
  1188. exit
  1189. end;
  1190. end;
  1191. end;
  1192. function TX86AsmOptimizer.RefModifiedBetween(Ref: TReference; RefSize: ASizeInt; p1, p2: tai): Boolean;
  1193. const
  1194. WriteOps: array[0..3] of set of TInsChange =
  1195. ([CH_RWOP1,CH_WOP1,CH_MOP1],
  1196. [Ch_RWOP2,Ch_WOP2,Ch_MOP2],
  1197. [Ch_RWOP3,Ch_WOP3,Ch_MOP3],
  1198. [Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
  1199. var
  1200. X: Integer;
  1201. CurrentP1Size: asizeint;
  1202. begin
  1203. Result := (
  1204. (Ref.base <> NR_NO) and
  1205. {$ifdef x86_64}
  1206. (Ref.base <> NR_RIP) and
  1207. {$endif x86_64}
  1208. RegModifiedBetween(Ref.base, p1, p2)
  1209. ) or
  1210. (
  1211. (Ref.index <> NR_NO) and
  1212. (Ref.index <> Ref.base) and
  1213. RegModifiedBetween(Ref.index, p1, p2)
  1214. );
  1215. { Now check to see if the memory itself is written to }
  1216. if not Result then
  1217. begin
  1218. while assigned(p1) and assigned(p2) and GetNextInstruction(p1,p1) and (p1<>p2) do
  1219. if p1.typ = ait_instruction then
  1220. begin
  1221. CurrentP1Size := topsize2memsize[taicpu(p1).opsize] shr 3; { Convert to bytes }
  1222. with insprop[taicpu(p1).opcode] do
  1223. for X := 0 to taicpu(p1).ops - 1 do
  1224. if (taicpu(p1).oper[X]^.typ = top_ref) and
  1225. RefsAlmostEqual(Ref, taicpu(p1).oper[X]^.ref^) and
  1226. { Catch any potential overlaps }
  1227. (
  1228. (RefSize = 0) or
  1229. ((taicpu(p1).oper[X]^.ref^.offset - Ref.offset) < RefSize)
  1230. ) and
  1231. (
  1232. (CurrentP1Size = 0) or
  1233. ((Ref.offset - taicpu(p1).oper[X]^.ref^.offset) < CurrentP1Size)
  1234. ) and
  1235. { Reference is used, but does the instruction write to it? }
  1236. (
  1237. (Ch_All in Ch) or
  1238. ((WriteOps[X] * Ch) <> [])
  1239. ) then
  1240. begin
  1241. Result := True;
  1242. Break;
  1243. end;
  1244. end;
  1245. end;
  1246. end;
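{ Worked example with hypothetical offsets: for Ref.offset=8 and RefSize=4,
  a 4-byte store to an almost-equal reference at offset 10 counts as a
  modification (10-8 < 4 and 8-10 < 4), while one at offset 12 does not
  (12-8 >= 4), so only genuinely overlapping writes set Result. }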
  1247. {$ifdef DEBUG_AOPTCPU}
  1248. procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
  1249. begin
  1250. asml.insertbefore(tai_comment.Create(strpnew(s)), p);
  1251. end;
  1252. function debug_tostr(i: tcgint): string; inline;
  1253. begin
  1254. Result := tostr(i);
  1255. end;
  1256. function debug_hexstr(i: tcgint): string;
  1257. begin
  1258. Result := '0x';
  1259. case i of
  1260. 0..$FF:
  1261. Result := Result + hexstr(i, 2);
  1262. $100..$FFFF:
  1263. Result := Result + hexstr(i, 4);
  1264. $10000..$FFFFFF:
  1265. Result := Result + hexstr(i, 6);
  1266. $1000000..$FFFFFFFF:
  1267. Result := Result + hexstr(i, 8);
  1268. else
  1269. Result := Result + hexstr(i, 16);
  1270. end;
  1271. end;
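{ For example, debug_hexstr(300) yields '0x012C': values in $100..$FFFF are
  padded to four hex digits, and larger values to 6, 8 or 16 digits. }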
  1272. function debug_regname(r: TRegister): string; inline;
  1273. begin
  1274. Result := '%' + std_regname(r);
  1275. end;
  1276. { Debug output function - creates a string representation of an operator }
  1277. function debug_operstr(oper: TOper): string;
  1278. begin
  1279. case oper.typ of
  1280. top_const:
  1281. Result := '$' + debug_tostr(oper.val);
  1282. top_reg:
  1283. Result := debug_regname(oper.reg);
  1284. top_ref:
  1285. begin
  1286. if oper.ref^.offset <> 0 then
  1287. Result := debug_tostr(oper.ref^.offset) + '('
  1288. else
  1289. Result := '(';
  1290. if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
  1291. begin
  1292. Result := Result + debug_regname(oper.ref^.base);
  1293. if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
  1294. Result := Result + ',' + debug_regname(oper.ref^.index);
  1295. end
  1296. else
  1297. if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
  1298. Result := Result + debug_regname(oper.ref^.index);
  1299. if (oper.ref^.scalefactor > 1) then
  1300. Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
  1301. else
  1302. Result := Result + ')';
  1303. end;
  1304. else
  1305. Result := '[UNKNOWN]';
  1306. end;
  1307. end;
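{ For example, a reference with offset 8, base EBX, index ESI and scale
  factor 4 is rendered AT&T-style as 8(%ebx,%esi,4); constants are prefixed
  with '$' and registers with '%'. }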
  1308. function debug_op2str(opcode: tasmop): string; inline;
  1309. begin
  1310. Result := std_op2str[opcode];
  1311. end;
  1312. function debug_opsize2str(opsize: topsize): string; inline;
  1313. begin
  1314. Result := gas_opsize2str[opsize];
  1315. end;
  1316. {$else DEBUG_AOPTCPU}
  1317. procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
  1318. begin
  1319. end;
  1320. function debug_tostr(i: tcgint): string; inline;
  1321. begin
  1322. Result := '';
  1323. end;
  1324. function debug_hexstr(i: tcgint): string; inline;
  1325. begin
  1326. Result := '';
  1327. end;
  1328. function debug_regname(r: TRegister): string; inline;
  1329. begin
  1330. Result := '';
  1331. end;
  1332. function debug_operstr(oper: TOper): string; inline;
  1333. begin
  1334. Result := '';
  1335. end;
  1336. function debug_op2str(opcode: tasmop): string; inline;
  1337. begin
  1338. Result := '';
  1339. end;
  1340. function debug_opsize2str(opsize: topsize): string; inline;
  1341. begin
  1342. Result := '';
  1343. end;
  1344. {$endif DEBUG_AOPTCPU}
  1345. class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
  1346. begin
  1347. {$ifdef x86_64}
  1348. { Always fine on x86-64 }
  1349. Result := True;
  1350. {$else x86_64}
  1351. Result :=
  1352. {$ifdef i8086}
  1353. (current_settings.cputype >= cpu_386) and
  1354. {$endif i8086}
  1355. (
  1356. { Always accept if optimising for size }
  1357. (cs_opt_size in current_settings.optimizerswitches) or
  1358. { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
  1359. (current_settings.optimizecputype >= cpu_Pentium2)
  1360. );
  1361. {$endif x86_64}
  1362. end;
  1363. { Attempts to allocate a volatile integer register for use between p and hp,
  1364. using AUsedRegs for the current register usage information. Returns NR_NO
  1365. if no free register could be found }
  1366. function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
  1367. var
  1368. RegSet: TCPURegisterSet;
  1369. CurrentSuperReg: Integer;
  1370. CurrentReg: TRegister;
  1371. Currentp: tai;
  1372. Breakout: Boolean;
  1373. begin
  1374. Result := NR_NO;
  1375. RegSet :=
  1376. paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption) +
  1377. current_procinfo.saved_regs_int;
  1378. (*
  1379. { Don't use the frame register unless explicitly allowed (fixes i40111) }
  1380. if ([cs_useebp, cs_userbp] * current_settings.optimizerswitches) = [] then
  1381. Exclude(RegSet, RS_FRAME_POINTER_REG);
  1382. *)
  1383. for CurrentSuperReg in RegSet do
  1384. begin
  1385. CurrentReg := newreg(R_INTREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
  1386. if not AUsedRegs[R_INTREGISTER].IsUsed(CurrentReg)
  1387. {$if defined(i386) or defined(i8086)}
  1388. { If the target size is 8-bit, make sure we can actually encode it }
  1389. and (
  1390. (RegSize >= R_SUBW) or { Not R_SUBL or R_SUBH }
  1391. (GetSupReg(CurrentReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
  1392. )
  1393. {$endif i386 or i8086}
  1394. then
  1395. begin
  1396. Currentp := p;
  1397. Breakout := False;
  1398. while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
  1399. begin
  1400. case Currentp.typ of
  1401. ait_instruction:
  1402. begin
  1403. if RegInInstruction(CurrentReg, Currentp) then
  1404. begin
  1405. Breakout := True;
  1406. Break;
  1407. end;
  1408. { Cannot allocate across an unconditional jump }
  1409. if is_calljmpuncondret(taicpu(Currentp).opcode) then
  1410. Exit;
  1411. end;
  1412. ait_marker:
  1413. { Don't try anything more if a marker is hit }
  1414. Exit;
  1415. ait_regalloc:
  1416. if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
  1417. begin
  1418. Breakout := True;
  1419. Break;
  1420. end;
  1421. else
  1422. ;
  1423. end;
  1424. end;
  1425. if Breakout then
  1426. { Try the next register }
  1427. Continue;
  1428. { We have a free register available }
  1429. Result := CurrentReg;
  1430. if not DontAlloc then
  1431. AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
  1432. Exit;
  1433. end;
  1434. end;
  1435. end;
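{ A hypothetical call site, to illustrate the contract (the names p, hp1 and
  TmpUsedRegs stand for whatever the caller has at hand):
    NewReg := GetIntRegisterBetween(R_SUBD, TmpUsedRegs, p, hp1);
    if NewReg <> NR_NO then
      ... it is safe to emit instructions using NewReg between p and hp1 ...
  With DontAlloc=True the register is only probed and no allocation
  information is inserted, leaving that to the caller. }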
  1436. { Attempts to allocate a volatile MM register for use between p and hp,
  1437. using AUsedRegs for the current register usage information. Returns NR_NO
  1438. if no free register could be found }
  1439. function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
  1440. var
  1441. RegSet: TCPURegisterSet;
  1442. CurrentSuperReg: Integer;
  1443. CurrentReg: TRegister;
  1444. Currentp: tai;
  1445. Breakout: Boolean;
  1446. begin
  1447. Result := NR_NO;
  1448. RegSet :=
  1449. paramanager.get_volatile_registers_mm(current_procinfo.procdef.proccalloption) +
  1450. current_procinfo.saved_regs_mm;
  1451. for CurrentSuperReg in RegSet do
  1452. begin
  1453. CurrentReg := newreg(R_MMREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
  1454. if not AUsedRegs[R_MMREGISTER].IsUsed(CurrentReg) then
  1455. begin
  1456. Currentp := p;
  1457. Breakout := False;
  1458. while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
  1459. begin
  1460. case Currentp.typ of
  1461. ait_instruction:
  1462. begin
  1463. if RegInInstruction(CurrentReg, Currentp) then
  1464. begin
  1465. Breakout := True;
  1466. Break;
  1467. end;
  1468. { Cannot allocate across an unconditional jump }
  1469. if is_calljmpuncondret(taicpu(Currentp).opcode) then
  1470. Exit;
  1471. end;
  1472. ait_marker:
  1473. { Don't try anything more if a marker is hit }
  1474. Exit;
  1475. ait_regalloc:
  1476. if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
  1477. begin
  1478. Breakout := True;
  1479. Break;
  1480. end;
  1481. else
  1482. ;
  1483. end;
  1484. end;
  1485. if Breakout then
  1486. { Try the next register }
  1487. Continue;
  1488. { We have a free register available }
  1489. Result := CurrentReg;
  1490. if not DontAlloc then
  1491. AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
  1492. Exit;
  1493. end;
  1494. end;
  1495. end;
  1496. class function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  1497. begin
  1498. if not SuperRegistersEqual(reg1,reg2) then
  1499. exit(false);
  1500. if getregtype(reg1)<>R_INTREGISTER then
1501. exit(true); {because SuperRegistersEqual is true}
  1502. case getsubreg(reg1) of
  1503. { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
  1504. higher, it preserves the high bits, so the new value depends on
  1505. reg2's previous value. In other words, it is equivalent to doing:
  1506. reg2 := (reg2 and $ffffff00) or byte(reg1); }
  1507. R_SUBL:
  1508. exit(getsubreg(reg2)=R_SUBL);
  1509. { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
  1510. higher, it actually does a:
  1511. reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
  1512. R_SUBH:
  1513. exit(getsubreg(reg2)=R_SUBH);
  1514. { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
  1515. bits of reg2:
  1516. reg2 := (reg2 and $ffff0000) or word(reg1); }
  1517. R_SUBW:
  1518. exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
  1519. { a write to R_SUBD always overwrites every other subregister,
  1520. because it clears the high 32 bits of R_SUBQ on x86_64 }
  1521. R_SUBD,
  1522. R_SUBQ:
  1523. exit(true);
  1524. else
  1525. internalerror(2017042801);
  1526. end;
  1527. end;
  1528. class function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  1529. begin
  1530. if not SuperRegistersEqual(reg1,reg2) then
  1531. exit(false);
  1532. if getregtype(reg1)<>R_INTREGISTER then
1533. exit(true); {because SuperRegistersEqual is true}
  1534. case getsubreg(reg1) of
  1535. R_SUBL:
  1536. exit(getsubreg(reg2)<>R_SUBH);
  1537. R_SUBH:
  1538. exit(getsubreg(reg2)<>R_SUBL);
  1539. R_SUBW,
  1540. R_SUBD,
  1541. R_SUBQ:
  1542. exit(true);
  1543. else
  1544. internalerror(2017042802);
  1545. end;
  1546. end;
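{ For example, a read of AL depends on an earlier write to AL, AX, EAX or
  RAX, but not on a write to AH, because AL and AH occupy disjoint bits of
  AX. }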
  1547. function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  1548. var
  1549. hp1 : tai;
  1550. l : TCGInt;
  1551. begin
  1552. result:=false;
  1553. if not(GetNextInstruction(p, hp1)) then
  1554. exit;
1555. { changes the code sequence
1556. shr/sar const1, x
1557. shl const2, x
1558. to
1559. either "shr/sar + and", "and + shl" or just "and", depending on const1 and const2 }
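{ Worked example for the equal-constant case: "shrl $3,%eax; shll $3,%eax"
  clears the low three bits, so the pair collapses to a single
  "andl $0xFFFFFFF8,%eax", the mask being not((1 shl 3)-1) truncated to the
  operand size. }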
  1560. if (taicpu(p).oper[0]^.typ = top_const) and
  1561. MatchInstruction(hp1,A_SHL,[]) and
  1562. (taicpu(hp1).oper[0]^.typ = top_const) and
  1563. (taicpu(hp1).opsize = taicpu(p).opsize) and
  1564. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
  1565. OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
  1566. begin
  1567. if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
  1568. not(cs_opt_size in current_settings.optimizerswitches)
  1569. {$ifdef x86_64}
  1570. and (
  1571. (taicpu(p).opsize <> S_Q) or
  1572. { 64-bit AND can only store signed 32-bit immediates }
  1573. (taicpu(p).oper[0]^.val < 32)
  1574. )
  1575. {$endif x86_64}
  1576. then
  1577. begin
  1578. { shr/sar const1, %reg
  1579. shl const2, %reg
  1580. with const1 > const2 }
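{ For example, "shrl $4,%eax; shll $2,%eax" becomes "shrl $2,%eax" followed
  by "andl $0xFFFFFFFC,%eax". }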
  1581. DebugMsg(SPeepholeOptimization + 'SxrShl2SxrAnd 1 done',p);
  1582. taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
  1583. taicpu(hp1).opcode := A_AND;
  1584. l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
  1585. case taicpu(p).opsize Of
  1586. S_B: taicpu(hp1).loadConst(0,l Xor $ff);
  1587. S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
  1588. S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
  1589. S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
  1590. else
  1591. Internalerror(2017050703)
  1592. end;
  1593. end
  1594. else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
  1595. not(cs_opt_size in current_settings.optimizerswitches)
  1596. {$ifdef x86_64}
  1597. and (
  1598. (taicpu(p).opsize <> S_Q) or
  1599. { 64-bit AND can only store signed 32-bit immediates }
  1600. (taicpu(p).oper[0]^.val < 32)
  1601. )
  1602. {$endif x86_64}
  1603. then
  1604. begin
  1605. { shr/sar const1, %reg
  1606. shl const2, %reg
  1607. with const1 < const2 }
  1608. DebugMsg(SPeepholeOptimization + 'SxrShl2SxrAnd 2 done',p);
  1609. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
  1610. taicpu(p).opcode := A_AND;
  1611. l := (1 shl (taicpu(p).oper[0]^.val))-1;
  1612. case taicpu(p).opsize Of
  1613. S_B: taicpu(p).loadConst(0,l Xor $ff);
  1614. S_W: taicpu(p).loadConst(0,l Xor $ffff);
  1615. S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
  1616. S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
  1617. else
  1618. Internalerror(2017050702)
  1619. end;
  1620. end
  1621. else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val)
  1622. {$ifdef x86_64}
  1623. and (
  1624. (taicpu(p).opsize <> S_Q) or
  1625. { 64-bit AND can only store signed 32-bit immediates }
  1626. (taicpu(p).oper[0]^.val < 32)
  1627. )
  1628. {$endif x86_64}
  1629. then
  1630. begin
  1631. { shr/sar const1, %reg
  1632. shl const2, %reg
  1633. with const1 = const2 }
  1634. DebugMsg(SPeepholeOptimization + 'SxrShl2And done',p);
  1635. taicpu(p).opcode := A_AND;
  1636. l := (1 shl (taicpu(p).oper[0]^.val))-1;
  1637. case taicpu(p).opsize Of
  1638. S_B: taicpu(p).loadConst(0,l Xor $ff);
  1639. S_W: taicpu(p).loadConst(0,l Xor $ffff);
  1640. S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
  1641. S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
  1642. else
  1643. Internalerror(2017050701)
  1644. end;
  1645. RemoveInstruction(hp1);
  1646. end;
  1647. end;
  1648. end;
  1649. function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  1650. var
  1651. opsize : topsize;
  1652. hp1, hp2 : tai;
  1653. tmpref : treference;
  1654. ShiftValue : Cardinal;
  1655. BaseValue : TCGInt;
  1656. begin
  1657. result:=false;
  1658. opsize:=taicpu(p).opsize;
  1659. { changes certain "imul const, %reg"'s to lea sequences }
  1660. if (MatchOpType(taicpu(p),top_const,top_reg) or
  1661. MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
  1662. (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
  1663. if (taicpu(p).oper[0]^.val = 1) then
  1664. if (taicpu(p).ops = 2) then
  1665. { remove "imul $1, reg" }
  1666. begin
  1667. DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
  1668. Result := RemoveCurrentP(p);
  1669. end
  1670. else
  1671. { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
  1672. begin
  1673. hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
  1674. taicpu(hp1).fileinfo := taicpu(p).fileinfo;
  1675. asml.InsertAfter(hp1, p);
  1676. DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
  1677. RemoveCurrentP(p, hp1);
  1678. Result := True;
  1679. end
  1680. else if ((taicpu(p).ops <= 2) or
  1681. (taicpu(p).oper[2]^.typ = Top_Reg)) and
  1682. not(cs_opt_size in current_settings.optimizerswitches) and
  1683. (not(GetNextInstruction(p, hp1)) or
  1684. not((tai(hp1).typ = ait_instruction) and
  1685. ((taicpu(hp1).opcode=A_Jcc) and
  1686. (taicpu(hp1).condition in [C_O,C_NO])))) then
  1687. begin
  1688. {
  1689. imul X, reg1, reg2 to
  1690. lea (reg1,reg1,Y), reg2
  1691. shl ZZ,reg2
  1692. imul XX, reg1 to
  1693. lea (reg1,reg1,YY), reg1
  1694. shl ZZ,reg2
  1695. This optimziation makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
  1696. it does not exist as a separate optimization target in FPC though.
  1697. This optimziation can be applied as long as only two bits are set in the constant and those two bits are separated by
  1698. at most two zeros
  1699. }
  1700. reference_reset(tmpref,1,[]);
  1701. if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
  1702. begin
  1703. ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
  1704. BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
  1705. TmpRef.base := taicpu(p).oper[1]^.reg;
  1706. TmpRef.index := taicpu(p).oper[1]^.reg;
  1707. if not(BaseValue in [3,5,9]) then
  1708. Internalerror(2018110101);
  1709. TmpRef.ScaleFactor := BaseValue-1;
  1710. if (taicpu(p).ops = 2) then
  1711. hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
  1712. else
  1713. hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
  1714. AsmL.InsertAfter(hp1,p);
  1715. DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
  1716. taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
  1717. RemoveCurrentP(p, hp1);
  1718. if ShiftValue>0 then
  1719. begin
  1720. hp2 := taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg);
  1721. AsmL.InsertAfter(hp2,hp1);
  1722. taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
  1723. end;
  1724. Result := True;
  1725. end;
  1726. end;
  1727. end;
  1728. function TX86AsmOptimizer.PrePeepholeOptAND(var p : tai) : boolean;
  1729. begin
  1730. Result := False;
  1731. if MatchOperand(taicpu(p).oper[0]^, 0) and
  1732. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
  1733. begin
  1734. DebugMsg(SPeepholeOptimization + 'AND 0 -> MOV 0', p);
  1735. taicpu(p).opcode := A_MOV;
  1736. Result := True;
  1737. end;
  1738. end;
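{ For example, "andl $0,%eax" becomes "movl $0,%eax" when the flags are not
  live afterwards, which breaks the dependency on the previous value of
  %eax. }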
  1739. function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  1740. var
  1741. p: taicpu absolute hp; { Implicit typecast }
  1742. i: Integer;
  1743. begin
  1744. Result := False;
  1745. if not assigned(hp) or
  1746. (hp.typ <> ait_instruction) then
  1747. Exit;
  1748. Prefetch(insprop[p.opcode]);
  1749. if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
  1750. with insprop[p.opcode] do
  1751. begin
  1752. case getsubreg(reg) of
  1753. R_SUBW,R_SUBD,R_SUBQ:
  1754. Result:=
1755. { ZF, CF, OF, SF, PF and AF must all be set in some way (ordered so that the
1756. most uncommon flags are checked first) }
  1757. ([Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags] * Ch <> []) and
  1758. ([Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch <> []) and
  1759. ([Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch <> []) and
  1760. ([Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch <> []) and
  1761. ([Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch <> []) and
  1762. ([Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch <> []);
  1763. R_SUBFLAGCARRY:
  1764. Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
  1765. R_SUBFLAGPARITY:
  1766. Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
  1767. R_SUBFLAGAUXILIARY:
  1768. Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
  1769. R_SUBFLAGZERO:
  1770. Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
  1771. R_SUBFLAGSIGN:
  1772. Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
  1773. R_SUBFLAGOVERFLOW:
  1774. Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
  1775. R_SUBFLAGINTERRUPT:
  1776. Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
  1777. R_SUBFLAGDIRECTION:
  1778. Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
  1779. else
  1780. internalerror(2017050501);
  1781. end;
  1782. exit;
  1783. end;
  1784. { Handle special cases first }
  1785. case p.opcode of
  1786. A_MOV, A_MOVZX, A_MOVSX, A_LEA, A_VMOVSS, A_VMOVSD, A_VMOVAPD,
  1787. A_VMOVAPS, A_VMOVQ, A_MOVSS, A_MOVSD, A_MOVQ, A_MOVAPD, A_MOVAPS:
  1788. begin
  1789. Result :=
  1790. (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
  1791. (p.oper[1]^.typ = top_reg) and
  1792. (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
  1793. (
  1794. (p.oper[0]^.typ = top_const) or
  1795. (
  1796. (p.oper[0]^.typ = top_reg) and
  1797. not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))
  1798. ) or (
  1799. (p.oper[0]^.typ = top_ref) and
  1800. not RegInRef(reg,p.oper[0]^.ref^)
  1801. )
  1802. );
  1803. end;
  1804. A_MUL, A_IMUL:
  1805. Result :=
  1806. (
  1807. (p.ops=3) and { IMUL only }
  1808. (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
  1809. (
  1810. (
  1811. (p.oper[1]^.typ=top_reg) and
  1812. not Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg)
  1813. ) or (
  1814. (p.oper[1]^.typ=top_ref) and
  1815. not RegInRef(reg,p.oper[1]^.ref^)
  1816. )
  1817. )
  1818. ) or (
  1819. (
  1820. (p.ops=1) and
  1821. (
  1822. (
  1823. (
  1824. (p.oper[0]^.typ=top_reg) and
  1825. not Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg)
  1826. )
  1827. ) or (
  1828. (p.oper[0]^.typ=top_ref) and
  1829. not RegInRef(reg,p.oper[0]^.ref^)
  1830. )
  1831. ) and (
  1832. (
  1833. (p.opsize=S_B) and
  1834. Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and
  1835. not Reg1ReadDependsOnReg2(NR_AL,reg)
  1836. ) or (
  1837. (p.opsize=S_W) and
  1838. Reg1WriteOverwritesReg2Entirely(NR_DX,reg)
  1839. ) or (
  1840. (p.opsize=S_L) and
  1841. Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)
  1842. {$ifdef x86_64}
  1843. ) or (
  1844. (p.opsize=S_Q) and
  1845. Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)
  1846. {$endif x86_64}
  1847. )
  1848. )
  1849. )
  1850. );
  1851. A_CBW:
  1852. Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg));
  1853. {$ifndef x86_64}
  1854. A_LDS:
  1855. Result := (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1856. A_LES:
  1857. Result := (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^));
  1858. {$endif not x86_64}
  1859. A_LFS:
  1860. Result := (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1861. A_LGS:
  1862. Result := (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1863. A_LSS:
  1864. Result := (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^));
  1865. A_LAHF{$ifndef x86_64}, A_AAM{$endif not x86_64}:
  1866. Result := Reg1WriteOverwritesReg2Entirely(NR_AH,reg);
  1867. A_LODSB:
  1868. Result := Reg1WriteOverwritesReg2Entirely(NR_AL,reg);
  1869. A_LODSW:
  1870. Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg);
  1871. {$ifdef x86_64}
  1872. A_LODSQ:
  1873. Result := Reg1WriteOverwritesReg2Entirely(NR_RAX,reg);
  1874. {$endif x86_64}
  1875. A_LODSD:
  1876. Result := Reg1WriteOverwritesReg2Entirely(NR_EAX,reg);
  1877. A_FSTSW, A_FNSTSW:
  1878. Result := (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg);
  1879. else
  1880. begin
  1881. with insprop[p.opcode] do
  1882. begin
  1883. if (
  1884. { xor %reg,%reg etc. is classed as a new value }
  1885. (([Ch_NoReadIfEqualRegs]*Ch)<>[]) and
  1886. MatchOpType(p, top_reg, top_reg) and
  1887. (p.oper[0]^.reg = p.oper[1]^.reg) and
  1888. Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)
  1889. ) then
  1890. begin
  1891. Result := True;
  1892. Exit;
  1893. end;
  1894. { Make sure the entire register is overwritten }
  1895. if (getregtype(reg) = R_INTREGISTER) then
  1896. begin
  1897. if (p.ops > 0) then
  1898. begin
  1899. if RegInOp(reg, p.oper[0]^) then
  1900. begin
  1901. if (p.oper[0]^.typ = top_ref) then
  1902. begin
  1903. if RegInRef(reg, p.oper[0]^.ref^) then
  1904. begin
  1905. Result := False;
  1906. Exit;
  1907. end;
  1908. end
  1909. else if (p.oper[0]^.typ = top_reg) then
  1910. begin
  1911. if ([Ch_ROp1, Ch_RWOp1, Ch_MOp1]*Ch<>[]) then
  1912. begin
  1913. Result := False;
  1914. Exit;
  1915. end
  1916. else if ([Ch_WOp1]*Ch<>[]) then
  1917. begin
  1918. if Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg, reg) then
  1919. Result := True
  1920. else
  1921. begin
  1922. Result := False;
  1923. Exit;
  1924. end;
  1925. end;
  1926. end;
  1927. end;
  1928. if (p.ops > 1) then
  1929. begin
  1930. if RegInOp(reg, p.oper[1]^) then
  1931. begin
  1932. if (p.oper[1]^.typ = top_ref) then
  1933. begin
  1934. if RegInRef(reg, p.oper[1]^.ref^) then
  1935. begin
  1936. Result := False;
  1937. Exit;
  1938. end;
  1939. end
  1940. else if (p.oper[1]^.typ = top_reg) then
  1941. begin
  1942. if ([Ch_ROp2, Ch_RWOp2, Ch_MOp2]*Ch<>[]) then
  1943. begin
  1944. Result := False;
  1945. Exit;
  1946. end
  1947. else if ([Ch_WOp2]*Ch<>[]) then
  1948. begin
  1949. if Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg, reg) then
  1950. Result := True
  1951. else
  1952. begin
  1953. Result := False;
  1954. Exit;
  1955. end;
  1956. end;
  1957. end;
  1958. end;
  1959. if (p.ops > 2) then
  1960. begin
  1961. if RegInOp(reg, p.oper[2]^) then
  1962. begin
  1963. if (p.oper[2]^.typ = top_ref) then
  1964. begin
  1965. if RegInRef(reg, p.oper[2]^.ref^) then
  1966. begin
  1967. Result := False;
  1968. Exit;
  1969. end;
  1970. end
  1971. else if (p.oper[2]^.typ = top_reg) then
  1972. begin
  1973. if ([Ch_ROp3, Ch_RWOp3, Ch_MOp3]*Ch<>[]) then
  1974. begin
  1975. Result := False;
  1976. Exit;
  1977. end
  1978. else if ([Ch_WOp3]*Ch<>[]) then
  1979. begin
  1980. if Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg, reg) then
  1981. Result := True
  1982. else
  1983. begin
  1984. Result := False;
  1985. Exit;
  1986. end;
  1987. end;
  1988. end;
  1989. end;
  1990. if (p.ops > 3) and RegInOp(reg, p.oper[3]^) then
  1991. begin
  1992. if (p.oper[3]^.typ = top_ref) then
  1993. begin
  1994. if RegInRef(reg, p.oper[3]^.ref^) then
  1995. begin
  1996. Result := False;
  1997. Exit;
  1998. end;
  1999. end
  2000. else if (p.oper[3]^.typ = top_reg) then
  2001. begin
  2002. if ([Ch_ROp4, Ch_RWOp4, Ch_MOp4]*Ch<>[]) then
  2003. begin
  2004. Result := False;
  2005. Exit;
  2006. end
  2007. else if ([Ch_WOp4]*Ch<>[]) then
  2008. begin
  2009. if Reg1WriteOverwritesReg2Entirely(p.oper[3]^.reg, reg) then
  2010. Result := True
  2011. else
  2012. begin
  2013. Result := False;
  2014. Exit;
  2015. end;
  2016. end;
  2017. end;
  2018. end;
  2019. end;
  2020. end;
  2021. end;
  2022. { Don't do these ones first in case an input operand is equal to an explicit output register }
  2023. case getsupreg(reg) of
  2024. RS_EAX:
  2025. if ([Ch_WEAX{$ifdef x86_64},Ch_WRAX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EAX, reg) then
  2026. begin
  2027. Result := True;
  2028. Exit;
  2029. end;
  2030. RS_ECX:
  2031. if ([Ch_WECX{$ifdef x86_64},Ch_WRCX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ECX, reg) then
  2032. begin
  2033. Result := True;
  2034. Exit;
  2035. end;
  2036. RS_EDX:
2037. if ([Ch_WEDX{$ifdef x86_64},Ch_WRDX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDX, reg) then
  2038. begin
  2039. Result := True;
  2040. Exit;
  2041. end;
  2042. RS_EBX:
  2043. if ([Ch_WEBX{$ifdef x86_64},Ch_WRBX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBX, reg) then
  2044. begin
  2045. Result := True;
  2046. Exit;
  2047. end;
  2048. RS_ESP:
  2049. if ([Ch_WESP{$ifdef x86_64},Ch_WRSP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESP, reg) then
  2050. begin
  2051. Result := True;
  2052. Exit;
  2053. end;
  2054. RS_EBP:
  2055. if ([Ch_WEBP{$ifdef x86_64},Ch_WRBP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBP, reg) then
  2056. begin
  2057. Result := True;
  2058. Exit;
  2059. end;
  2060. RS_ESI:
  2061. if ([Ch_WESI{$ifdef x86_64},Ch_WRSI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESI, reg) then
  2062. begin
  2063. Result := True;
  2064. Exit;
  2065. end;
  2066. RS_EDI:
  2067. if ([Ch_WEDI{$ifdef x86_64},Ch_WRDI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDI, reg) then
  2068. begin
  2069. Result := True;
  2070. Exit;
  2071. end;
  2072. else
  2073. ;
  2074. end;
  2075. end;
  2076. end;
  2077. end;
  2078. end;
  2079. end;
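{ Returns True if p begins a recognised function epilogue: a plain RET,
  LEAVE followed by RET, a stack-pointer-adjusting LEA followed by RET, or
  a frame-pointer restore (MOV or LEA plus POP of the frame pointer)
  followed by RET. }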
  2080. class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
  2081. var
  2082. hp2,hp3 : tai;
  2083. begin
2084. { some x86-64 targets issue a NOP before the real exit code }
  2085. if MatchInstruction(p,A_NOP,[]) then
  2086. GetNextInstruction(p,p);
  2087. result:=assigned(p) and (p.typ=ait_instruction) and
  2088. ((taicpu(p).opcode = A_RET) or
  2089. ((taicpu(p).opcode=A_LEAVE) and
  2090. GetNextInstruction(p,hp2) and
  2091. MatchInstruction(hp2,A_RET,[S_NO])
  2092. ) or
  2093. (((taicpu(p).opcode=A_LEA) and
  2094. MatchOpType(taicpu(p),top_ref,top_reg) and
  2095. (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
  2096. (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
  2097. ) and
  2098. GetNextInstruction(p,hp2) and
  2099. MatchInstruction(hp2,A_RET,[S_NO])
  2100. ) or
  2101. ((((taicpu(p).opcode=A_MOV) and
  2102. MatchOpType(taicpu(p),top_reg,top_reg) and
  2103. (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
  2104. (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
  2105. ((taicpu(p).opcode=A_LEA) and
  2106. MatchOpType(taicpu(p),top_ref,top_reg) and
  2107. (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
  2108. (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
  2109. )
  2110. ) and
  2111. GetNextInstruction(p,hp2) and
  2112. MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
  2113. MatchOpType(taicpu(hp2),top_reg) and
  2114. (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
  2115. GetNextInstruction(hp2,hp3) and
  2116. MatchInstruction(hp3,A_RET,[S_NO])
  2117. )
  2118. );
  2119. end;
  2120. class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  2121. begin
  2122. isFoldableArithOp := False;
  2123. case hp1.opcode of
  2124. A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
  2125. isFoldableArithOp :=
  2126. ((taicpu(hp1).oper[0]^.typ = top_const) or
  2127. ((taicpu(hp1).oper[0]^.typ = top_reg) and
  2128. (taicpu(hp1).oper[0]^.reg <> reg))) and
  2129. (taicpu(hp1).oper[1]^.typ = top_reg) and
  2130. (taicpu(hp1).oper[1]^.reg = reg);
  2131. A_INC,A_DEC,A_NEG,A_NOT:
  2132. isFoldableArithOp :=
  2133. (taicpu(hp1).oper[0]^.typ = top_reg) and
  2134. (taicpu(hp1).oper[0]^.reg = reg);
  2135. else
  2136. ;
  2137. end;
  2138. end;
  2139. procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);
  2140. procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
  2141. var
  2142. hp2: tai;
  2143. begin
  2144. hp2 := p;
  2145. repeat
  2146. hp2 := tai(hp2.previous);
  2147. if assigned(hp2) and
  2148. (hp2.typ = ait_regalloc) and
  2149. (tai_regalloc(hp2).ratype=ra_dealloc) and
  2150. (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
  2151. (getsupreg(tai_regalloc(hp2).reg) = supreg) then
  2152. begin
  2153. RemoveInstruction(hp2);
  2154. break;
  2155. end;
  2156. until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
  2157. end;
  2158. begin
  2159. case current_procinfo.procdef.returndef.typ of
  2160. arraydef,recorddef,pointerdef,
  2161. stringdef,enumdef,procdef,objectdef,errordef,
  2162. filedef,setdef,procvardef,
  2163. classrefdef,forwarddef:
  2164. DoRemoveLastDeallocForFuncRes(RS_EAX);
  2165. orddef:
  2166. if current_procinfo.procdef.returndef.size <> 0 then
  2167. begin
  2168. DoRemoveLastDeallocForFuncRes(RS_EAX);
  2169. { for int64/qword }
  2170. if current_procinfo.procdef.returndef.size = 8 then
  2171. DoRemoveLastDeallocForFuncRes(RS_EDX);
  2172. end;
  2173. else
  2174. ;
  2175. end;
  2176. end;
  2177. function TX86AsmOptimizer.OptPass1CMOVcc(var p: tai): Boolean;
  2178. var
  2179. hp1: tai;
  2180. operswap: poper;
  2181. begin
  2182. Result := False;
  2183. { Optimise:
  2184. cmov(c) %reg1,%reg2
  2185. mov %reg2,%reg1
  2186. (%reg2 dealloc.)
  2187. To:
  2188. cmov(~c) %reg2,%reg1
  2189. }
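{ For example (with hypothetical registers), "cmovel %edx,%eax" followed by
  "movl %eax,%edx", with %eax deallocated afterwards, becomes
  "cmovnel %eax,%edx". }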
  2190. if (taicpu(p).oper[0]^.typ = top_reg) then
  2191. while GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) and
  2192. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  2193. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
  2194. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) do
  2195. begin
  2196. TransferUsedRegs(TmpUsedRegs);
  2197. UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
  2198. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
  2199. begin
  2200. DebugMsg(SPeepholeOptimization + 'CMOV(c) %reg1,%reg2; MOV %reg2,%reg1 -> CMOV(~c) %reg2,%reg1 (CMovMov2CMov)', p);
  2201. { Save time by swapping the pointers (they're both registers, so
  2202. we don't need to worry about reference counts) }
  2203. operswap := taicpu(p).oper[0];
  2204. taicpu(p).oper[0] := taicpu(p).oper[1];
  2205. taicpu(p).oper[1] := operswap;
  2206. taicpu(p).condition := inverse_cond(taicpu(p).condition);
  2207. RemoveInstruction(hp1);
  2208. { It's still a CMOV, so we can look further ahead }
  2209. Include(OptsToCheck, aoc_ForceNewIteration);
  2210. { But first, let's see if this will get optimised again
  2211. (probably won't happen, but best to be sure) }
  2212. Continue;
  2213. end;
  2214. Break;
  2215. end;
  2216. end;
  2217. function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
  2218. var
  2219. hp1,hp2 : tai;
  2220. begin
  2221. result:=false;
  2222. if MatchOpType(taicpu(p),top_reg,top_reg) then
  2223. begin
  2224. { vmova* reg1,reg1
  2225. =>
  2226. <nop> }
  2227. if taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg then
  2228. begin
  2229. RemoveCurrentP(p);
  2230. result:=true;
  2231. exit;
  2232. end;
  2233. if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
  2234. (hp1.typ = ait_instruction) and
  2235. (
  2236. { Under -O2 and below, the instructions are always adjacent }
  2237. not (cs_opt_level3 in current_settings.optimizerswitches) or
  2238. (taicpu(hp1).ops <= 1) or
  2239. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
  2240. { If reg1 = reg3, reg1 must not be modified in between }
  2241. not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
  2242. ) then
  2243. begin
  2244. if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
  2245. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2246. begin
  2247. { vmova* reg1,reg2
  2248. ...
  2249. vmova* reg2,reg3
  2250. dealloc reg2
  2251. =>
  2252. vmova* reg1,reg3 }
  2253. TransferUsedRegs(TmpUsedRegs);
  2254. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2255. if MatchOpType(taicpu(hp1),top_reg,top_reg) and
  2256. not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1) and
  2257. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2258. begin
  2259. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
  2260. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2261. TransferUsedRegs(TmpUsedRegs);
  2262. AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, TmpUsedRegs);
  2263. RemoveInstruction(hp1);
  2264. result:=true;
  2265. exit;
  2266. end;
  2267. { special case:
  2268. vmova* reg1,<op>
  2269. ...
  2270. vmova* <op>,reg1
  2271. =>
  2272. vmova* reg1,<op> }
  2273. if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
  2274. ((taicpu(p).oper[0]^.typ<>top_ref) or
  2275. (not(vol_read in taicpu(p).oper[0]^.ref^.volatility))
  2276. ) then
  2277. begin
  2278. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
  2279. RemoveInstruction(hp1);
  2280. result:=true;
  2281. exit;
  2282. end
  2283. end
  2284. else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
  2285. MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
  2286. ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
  2287. MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
  2288. ) and
  2289. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2290. begin
  2291. { vmova* reg1,reg2
  2292. ...
  2293. vmovs* reg2,<op>
  2294. dealloc reg2
  2295. =>
  2296. vmovs* reg1,<op> }
  2297. TransferUsedRegs(TmpUsedRegs);
  2298. UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);
  2299. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2300. begin
  2301. DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
  2302. taicpu(p).opcode:=taicpu(hp1).opcode;
  2303. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2304. TransferUsedRegs(TmpUsedRegs);
  2305. AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, TmpUsedRegs);
  2306. RemoveInstruction(hp1);
  2307. result:=true;
  2308. exit;
  2309. end
  2310. end;
  2311. if MatchInstruction(hp1,[A_VFMADDPD,
  2312. A_VFMADD132PD,
  2313. A_VFMADD132PS,
  2314. A_VFMADD132SD,
  2315. A_VFMADD132SS,
  2316. A_VFMADD213PD,
  2317. A_VFMADD213PS,
  2318. A_VFMADD213SD,
  2319. A_VFMADD213SS,
  2320. A_VFMADD231PD,
  2321. A_VFMADD231PS,
  2322. A_VFMADD231SD,
  2323. A_VFMADD231SS,
  2324. A_VFMADDSUB132PD,
  2325. A_VFMADDSUB132PS,
  2326. A_VFMADDSUB213PD,
  2327. A_VFMADDSUB213PS,
  2328. A_VFMADDSUB231PD,
  2329. A_VFMADDSUB231PS,
  2330. A_VFMSUB132PD,
  2331. A_VFMSUB132PS,
  2332. A_VFMSUB132SD,
  2333. A_VFMSUB132SS,
  2334. A_VFMSUB213PD,
  2335. A_VFMSUB213PS,
  2336. A_VFMSUB213SD,
  2337. A_VFMSUB213SS,
  2338. A_VFMSUB231PD,
  2339. A_VFMSUB231PS,
  2340. A_VFMSUB231SD,
  2341. A_VFMSUB231SS,
  2342. A_VFMSUBADD132PD,
  2343. A_VFMSUBADD132PS,
  2344. A_VFMSUBADD213PD,
  2345. A_VFMSUBADD213PS,
  2346. A_VFMSUBADD231PD,
  2347. A_VFMSUBADD231PS,
  2348. A_VFNMADD132PD,
  2349. A_VFNMADD132PS,
  2350. A_VFNMADD132SD,
  2351. A_VFNMADD132SS,
  2352. A_VFNMADD213PD,
  2353. A_VFNMADD213PS,
  2354. A_VFNMADD213SD,
  2355. A_VFNMADD213SS,
  2356. A_VFNMADD231PD,
  2357. A_VFNMADD231PS,
  2358. A_VFNMADD231SD,
  2359. A_VFNMADD231SS,
  2360. A_VFNMSUB132PD,
  2361. A_VFNMSUB132PS,
  2362. A_VFNMSUB132SD,
  2363. A_VFNMSUB132SS,
  2364. A_VFNMSUB213PD,
  2365. A_VFNMSUB213PS,
  2366. A_VFNMSUB213SD,
  2367. A_VFNMSUB213SS,
  2368. A_VFNMSUB231PD,
  2369. A_VFNMSUB231PS,
  2370. A_VFNMSUB231SD,
  2371. A_VFNMSUB231SS],[S_NO]) and
2372. { we mix single and double operations here because we assume that the compiler
2373. generates vmovapd only after double operations and vmovaps only after single operations }
  2374. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^.reg) and
  2375. GetNextInstructionUsingReg(hp1, hp2, taicpu(hp1).oper[2]^.reg) and
  2376. MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
  2377. MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
  2378. begin
  2379. TransferUsedRegs(TmpUsedRegs);
  2380. UpdateUsedRegsBetween(TmpUsedRegs, p, hp2);
  2381. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2382. begin
  2383. taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
  2384. if (cs_opt_level3 in current_settings.optimizerswitches) then
  2385. RemoveCurrentP(p)
  2386. else
  2387. RemoveCurrentP(p, hp1); // hp1 is guaranteed to be the immediate next instruction in this case.
  2388. RemoveInstruction(hp2);
  2389. end;
              end
            else if (hp1.typ = ait_instruction) and
                    (((taicpu(p).opcode=A_MOVAPS) and
                      ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                       (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
                     ((taicpu(p).opcode=A_MOVAPD) and
                      ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                       (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
                    ) and
                    GetNextInstructionUsingReg(hp1, hp2, taicpu(hp1).oper[1]^.reg) and
                    MatchInstruction(hp2,taicpu(p).opcode,[]) and
                    OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
                    MatchOpType(taicpu(hp2),top_reg,top_reg) and
                    MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) then
              { change
                  movapX           reg,reg2
                  addsX/subsX/...  reg3,reg2
                  movapX           reg2,reg
                to
                  addsX/subsX/...  reg3,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp2);
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                      debug_op2str(taicpu(p).opcode)+' '+
                      debug_op2str(taicpu(hp1).opcode)+' '+
                      debug_op2str(taicpu(hp2).opcode)+') done',p);
                    { we cannot eliminate the first move if
                      the operation uses the same register for source and dest }
                    if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                      { Remember that hp1 is not necessarily the immediate
                        next instruction }
                      RemoveCurrentP(p);
                    taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                    RemoveInstruction(hp2);
                    result:=true;
                  end;
              end
            else if (hp1.typ = ait_instruction) and
                    (((taicpu(p).opcode=A_VMOVAPD) and
                      (taicpu(hp1).opcode=A_VCOMISD)) or
                     ((taicpu(p).opcode=A_VMOVAPS) and
                      (taicpu(hp1).opcode=A_VCOMISS))
                    ) and
                    not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
              { change
                  movapX   reg,reg1
                  vcomisX  reg1,reg1
                to
                  vcomisX  reg,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXComisX2ComisX2 ('+
                      debug_op2str(taicpu(p).opcode)+' '+
                      debug_op2str(taicpu(hp1).opcode)+') done',p);
                    if OpsEqual(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                      taicpu(hp1).loadoper(0, taicpu(p).oper[0]^);
                    if OpsEqual(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
                      taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
                    RemoveCurrentP(p);
                    result:=true;
                    exit;
                  end;
              end;
          end;
        end;
      end;
    function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            V<Op>X  %mreg1,%mreg2,%mreg3
            VMovX   %mreg3,%mreg4
            dealloc %mreg3
          by
            V<Op>X  %mreg1,%mreg2,%mreg4
        }
        if GetNextInstruction(p,hp1) and
           { we mix single and double operations here because we assume that the compiler
             generates vmovapd only after double operations and vmovaps only after single operations }
           MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
           MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
           (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;
    { Replaces all references to AOldReg in a memory reference to ANewReg }
    class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
      begin
        Result := False;
        { For safety reasons, only check for exact register matches }

        { Check base register }
        if (ref.base = AOldReg) then
          begin
            ref.base := ANewReg;
            Result := True;
          end;

        { Check index register }
        if (ref.index = AOldReg) and (getsupreg(ANewReg)<>RS_ESP) then
          begin
            ref.index := ANewReg;
            Result := True;
          end;
      end;
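
    { Illustration (not part of the transformation itself): replacing %eax
      with %ecx in the reference 8(%eax,%ebx,4) yields 8(%ecx,%ebx,4).  The
      RS_ESP guard above is needed because the x86 SIB encoding cannot use
      the stack pointer as an index register. }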
    { Replaces all references to AOldReg in an operand to ANewReg }
    class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
      var
        OldSupReg, NewSupReg: TSuperRegister;
        OldSubReg, NewSubReg: TSubRegister;
        OldRegType: TRegisterType;
        ThisOper: POper;
      begin
        ThisOper := p.oper[OperIdx]; { Faster to access overall }
        Result := False;

        if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
          InternalError(2020011801);

        OldSupReg := getsupreg(AOldReg);
        OldSubReg := getsubreg(AOldReg);
        OldRegType := getregtype(AOldReg);
        NewSupReg := getsupreg(ANewReg);
        NewSubReg := getsubreg(ANewReg);

        if OldRegType <> getregtype(ANewReg) then
          InternalError(2020011802);

        if OldSubReg <> NewSubReg then
          InternalError(2020011803);

        case ThisOper^.typ of
          top_reg:
            if (
                 (ThisOper^.reg = AOldReg) or
                 (
                   (OldRegType = R_INTREGISTER) and
                   (getsupreg(ThisOper^.reg) = OldSupReg) and
                   (getregtype(ThisOper^.reg) = R_INTREGISTER) and
                   (
                     (getsubreg(ThisOper^.reg) <= OldSubReg)
                     {$ifndef x86_64}
                     and (
                       { Under i386 and i8086, ESI, EDI, EBP and ESP
                         don't have an 8-bit representation }
                       (getsubreg(ThisOper^.reg) >= R_SUBW) or
                       not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
                     )
                     {$endif x86_64}
                   )
                 )
               ) then
              begin
                ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
                Result := True;
              end;
          top_ref:
            if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
              Result := True;
          else
            ;
        end;
      end;
    { Replaces all references to AOldReg in an instruction to ANewReg }
    class function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
      const
        ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
      var
        OperIdx: Integer;
      begin
        Result := False;
        for OperIdx := 0 to p.ops - 1 do
          if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) then
            begin
              { The shift and rotate instructions can only use CL }
              if not (
                       (OperIdx = 0) and
                       { This second condition just helps to avoid unnecessarily
                         calling MatchInstruction for 10 different opcodes }
                       (p.oper[0]^.reg = NR_CL) and
                       MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
                     ) then
                Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
            end
          else if p.oper[OperIdx]^.typ = top_ref then
            { It's okay to replace registers in references that get written to }
            Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
      end;
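
    { Worked example (hypothetical): for "shrl %cl,%eax", operand 0 is a
      readable operand, but since it is CL and the opcode belongs to the
      shift/rotate family, the guard above refuses to rename it - the shift
      count is architecturally tied to CL. }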
    class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean;
      begin
        Result :=
          (ref^.index = NR_NO) and
          (
            {$ifdef x86_64}
            (
              (ref^.base = NR_RIP) and
              (ref^.refaddr in [addr_pic, addr_pic_no_got])
            ) or
            {$endif x86_64}
            (ref^.refaddr = addr_full) or
            (ref^.base = NR_STACK_POINTER_REG) or
            (ref^.base = current_procinfo.framepointer)
          );
      end;
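
    { Examples of references this function treats as safe to dereference
      speculatively: 8(%esp), -4(%ebp), absolute (addr_full) addresses and,
      on x86-64, RIP-relative PIC references.  Indexed references are
      rejected, presumably because a scaled index could move the address
      onto an unmapped page. }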
    function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
      var
        l: asizeint;
      begin
        Result := False;

        { Should have been checked previously }
        if p.opcode <> A_LEA then
          InternalError(2020072501);

        { Do not mess with the stack pointer, as adjusting it via LEA is the
          recommended approach, except when optimizing for size }
        if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
           not(cs_opt_size in current_settings.optimizerswitches) then
          exit;

        with p.oper[0]^.ref^ do
          begin
            if (base <> p.oper[1]^.reg) or
               (index <> NR_NO) or
               assigned(symbol) then
              exit;

            l:=offset;
            if (l=1) and UseIncDec then
              begin
                p.opcode:=A_INC;
                p.loadreg(0,p.oper[1]^.reg);
                p.ops:=1;
                DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
              end
            else if (l=-1) and UseIncDec then
              begin
                p.opcode:=A_DEC;
                p.loadreg(0,p.oper[1]^.reg);
                p.ops:=1;
                DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
              end
            else
              begin
                if (l<0) and (l<>-2147483648) then
                  begin
                    p.opcode:=A_SUB;
                    p.loadConst(0,-l);
                    DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
                  end
                else
                  begin
                    p.opcode:=A_ADD;
                    p.loadConst(0,l);
                    DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
                  end;
              end;
          end;

        Result := True;
      end;
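
    { A sketch of the conversions performed above (AT&T syntax):
        lea 1(%reg),%reg     ->  inc %reg       (when UseIncDec)
        lea -1(%reg),%reg    ->  dec %reg       (when UseIncDec)
        lea -16(%reg),%reg   ->  sub $16,%reg
        lea 32(%reg),%reg    ->  add $32,%reg
      The reference must use the destination register as its base and have
      no index and no symbol.  Note that, unlike LEA, the replacement
      instructions write the flags register. }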
    function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
      var
        CurrentReg, ReplaceReg: TRegister;
      begin
        Result := False;

        ReplaceReg := taicpu(p_mov).oper[0]^.reg;
        CurrentReg := taicpu(p_mov).oper[1]^.reg;

        case hp.opcode of
          A_FSTSW, A_FNSTSW,
          A_IN,   A_INS,  A_OUT,  A_OUTS,
          A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
            { These routines have explicit operands, but they are restricted in
              what they can be (e.g. IN and OUT can only read from AL, AX or
              EAX). }
            Exit;
          A_IMUL:
            begin
              { The 1-operand version writes to implicit registers.
                The 2-operand version reads from the first operand, and reads
                from and writes to the second (equivalent to Ch_Rop1, Ch_RWop2).
                The 3-operand version reads from a register that it doesn't
                write to. }
              case hp.ops of
                1:
                  if (
                       (
                         (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                       ) or
                       not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
                     ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                    begin
                      Result := True;
                      DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                      AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                    end;
                2:
                  { Only modify the first parameter }
                  if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                    begin
                      Result := True;
                      DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                      AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                    end;
                3:
                  { Only modify the second parameter }
                  if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                    begin
                      Result := True;
                      DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                      AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                    end;
                else
                  InternalError(2020012901);
              end;
            end;
          else
            if (hp.ops > 0) and
               ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
              begin
                Result := True;
                DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
                AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
              end;
        end;
      end;
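
    { Hypothetical before/after for the 3-operand IMUL case above:
        movl  %esi,%ebx                movl  %esi,%ebx
        imull $3,%ebx,%ecx     -->     imull $3,%esi,%ecx
      Reading from the MOV's source breaks the dependency on %ebx, and the
      MOV itself may then become removable by a later pass. }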
    function TX86AsmOptimizer.FuncMov2Func(var p: tai; const hp1: tai): Boolean;
      var
        hp2, hp_regalloc: tai;
        p_SourceReg, p_TargetReg: TRegister;
      begin
        Result := False;

        { Backward optimisation.  If we have:
            func.  %reg1,%reg2
            mov    %reg2,%reg3
            (dealloc %reg2)
          Change to:
            func.  %reg1,%reg3   (see comment below for what a valid func. is)

          Perform similar optimisations with 1, 3 and 4-operand instructions
          that only have one output; a concrete instance is sketched below. }
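
        { A concrete (hypothetical) instance with a pure 2-operand function:
            popcntl %ecx,%eax                popcntl %ecx,%ebx
            movl    %eax,%ebx        -->
            (dealloc %eax)
          The POPCNT result is written directly into the MOV's destination. }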
        if MatchOpType(taicpu(p), top_reg, top_reg) then
          begin
            p_SourceReg := taicpu(p).oper[0]^.reg;
            p_TargetReg := taicpu(p).oper[1]^.reg;
            TransferUsedRegs(TmpUsedRegs);
            if not RegUsedAfterInstruction(p_SourceReg, p, TmpUsedRegs) and
               GetLastInstruction(p, hp2) and
               (hp2.typ = ait_instruction) and
               { Have to make sure it's an instruction that only reads from
                 the first operands and only writes (not reads or modifies)
                 the last one; in essence, a pure function such as BSR, POPCNT
                 or ANDN }
               (
                 (
                   (taicpu(hp2).ops = 1) and
                   (insprop[taicpu(hp2).opcode].Ch * [Ch_Wop1] = [Ch_Wop1])
                 ) or
                 (
                   (taicpu(hp2).ops = 2) and
                   (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Wop2] = [Ch_Rop1, Ch_Wop2])
                 ) or
                 (
                   (taicpu(hp2).ops = 3) and
                   (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Rop2, Ch_Wop3] = [Ch_Rop1, Ch_Rop2, Ch_Wop3])
                 ) or
                 (
                   (taicpu(hp2).ops = 4) and
                   (insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Wop4] = [Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Wop4])
                 )
               ) and
               (taicpu(hp2).oper[taicpu(hp2).ops-1]^.typ = top_reg) and
               (taicpu(hp2).oper[taicpu(hp2).ops-1]^.reg = p_SourceReg) then
              begin
                case taicpu(hp2).opcode of
                  A_FSTSW, A_FNSTSW,
                  A_IN,   A_INS,  A_OUT,  A_OUTS,
                  A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
                    { These routines have explicit operands, but they are restricted in
                      what they can be (e.g. IN and OUT can only read from AL, AX or
                      EAX). }
                    ;
                  else
                    begin
                      DebugMsg(SPeepholeOptimization + 'Removed MOV and changed destination on previous instruction to optimise register usage (FuncMov2Func)', p);

                      { If %reg2 (p_SourceReg) is allocated before func., remove it completely }
                      hp_regalloc := FindRegAllocBackward(p_SourceReg, hp2);
                      if Assigned(hp_regalloc) then
                        begin
                          Asml.Remove(hp_regalloc);
                          if Assigned(FindRegDealloc(p_SourceReg, p)) then
                            begin
                              ExcludeRegFromUsedRegs(p_SourceReg, UsedRegs);
                              hp_regalloc.Free;
                            end
                          else
                            { If the register is not explicitly deallocated, it's
                              being reused, so move the allocation to after func. }
                            AsmL.InsertAfter(hp_regalloc, hp2);
                        end;

                      if not RegInInstruction(p_TargetReg, hp2) then
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          AllocRegBetween(p_TargetReg, hp2, p, TmpUsedRegs);
                        end;

                      { Actually make the changes }
                      taicpu(hp2).oper[taicpu(hp2).ops-1]^.reg := p_TargetReg;
                      RemoveCurrentp(p, hp1);

                      { If the func. was another MOV instruction, we might get
                        "mov %reg,%reg" that doesn't get removed in Pass 2
                        otherwise, so deal with it here (also do something
                        similar with "lea (%reg),%reg") }
                      if (taicpu(hp2).opcode = A_MOV) and MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp2).oper[1]^.reg) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 1a done', hp2);
                          if p = hp2 then
                            RemoveCurrentp(p)
                          else
                            RemoveInstruction(hp2);
                        end;

                      Result := True;
                      Exit;
                    end;
                end;
              end;
          end;
      end;
    function TX86AsmOptimizer.CheckMovMov2MovMov2(const p, hp1: tai) : boolean;
      begin
        Result := False;
        if MatchOpType(taicpu(p),top_ref,top_reg) and
           MatchOpType(taicpu(hp1),top_ref,top_reg) and
           (taicpu(p).opsize = taicpu(hp1).opsize) and
           RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
           (taicpu(p).oper[0]^.ref^.volatility=[]) and
           (taicpu(hp1).oper[0]^.ref^.volatility=[]) and
           not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
           not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
            taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
            Result := True;
            Include(OptsToCheck, aoc_ForceNewIteration);
          end;
      end;
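
    { Illustration of the rewrite above (assuming a non-volatile reference):
        movl x(%ebp),%eax                movl x(%ebp),%eax
        movl x(%ebp),%edx       -->      movl %eax,%edx
      The second load is replaced by a cheaper register-to-register move;
      %eax must not appear in the second instruction's reference. }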
    function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
      var
        hp1, hp2, hp3, hp4, last_hp1: tai;
        GetNextInstruction_p, DoOptimisation, TempBool: Boolean;
        p_SourceReg, p_TargetReg, NewMMReg: TRegister;
        {$ifdef x86_64}
        NewConst: TCGInt;
        {$endif x86_64}

      procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
        begin
          if taicpu(hp1).opcode = signed_movop then
            begin
              if taicpu(p).oper[0]^.val > max_value shr 1 then
                taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
            end
          else
            taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
        end;
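
      { Roughly what convert_mov_value achieves once the callers below merge
        a constant MOV with a following extension (byte examples):
          movb $255,%al; movsbl %al,%eax  ->  movl $-1,%eax   (signed)
          movb $255,%al; movzbl %al,%eax  ->  movl $255,%eax  (unsigned) }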
      function GetNextHp1(const in_p: tai): Boolean;
        begin
          if NotFirstIteration and (cs_opt_level3 in current_settings.optimizerswitches) then
            GetNextInstruction_p := GetNextInstructionUsingReg(in_p, hp1, p_TargetReg)
          else
            GetNextInstruction_p := GetNextInstruction(in_p, hp1);
          Result := GetNextInstruction_p and (hp1.typ = ait_instruction);
        end;

      function TryConstMerge(var p1, p2: tai): Boolean;
        var
          ThisRef: TReference;
        begin
          Result := False;
          ThisRef := taicpu(p2).oper[1]^.ref^;

          { Only permit writes to the stack, since we can guarantee alignment with that }
          if (ThisRef.index = NR_NO) and
             (
               (ThisRef.base = NR_STACK_POINTER_REG) or
               (ThisRef.base = current_procinfo.framepointer)
             ) then
            begin
              case taicpu(p).opsize of
                S_B:
                  begin
                    { Word writes must be on a 2-byte boundary }
                    if (taicpu(p1).oper[1]^.ref^.offset mod 2) = 0 then
                      begin
                        { Reduce offset of second reference to see if it is sequential with the first }
                        Dec(ThisRef.offset, 1);
                        if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                          begin
                            { Make sure the constants aren't represented as a
                              negative number, as these won't merge properly }
                            taicpu(p1).opsize := S_W;
                            taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FF) or ((taicpu(p2).oper[0]^.val and $FF) shl 8);
                            DebugMsg(SPeepholeOptimization + 'Merged two byte-sized constant writes to stack (MovMov2Mov 2a)', p1);
                            RemoveInstruction(p2);
                            Result := True;
                          end;
                      end;
                  end;
                S_W:
                  begin
                    { Longword writes must be on a 4-byte boundary }
                    if (taicpu(p1).oper[1]^.ref^.offset mod 4) = 0 then
                      begin
                        { Reduce offset of second reference to see if it is sequential with the first }
                        Dec(ThisRef.offset, 2);
                        if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                          begin
                            { Make sure the constants aren't represented as a
                              negative number, as these won't merge properly }
                            taicpu(p1).opsize := S_L;
                            taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FFFF) or ((taicpu(p2).oper[0]^.val and $FFFF) shl 16);
                            DebugMsg(SPeepholeOptimization + 'Merged two word-sized constant writes to stack (MovMov2Mov 2b)', p1);
                            RemoveInstruction(p2);
                            Result := True;
                          end;
                      end;
                  end;
                {$ifdef x86_64}
                S_L:
                  begin
                    { Only sign-extended 32-bit constants can be written to 64-bit memory directly, so check to
                      see if the constants can be encoded this way. }
                    NewConst := (taicpu(p1).oper[0]^.val and $FFFFFFFF) or (taicpu(p2).oper[0]^.val shl 32);
                    if (NewConst >= -2147483648) and (NewConst <= 2147483647) and
                       { Quadword writes must be on an 8-byte boundary }
                       ((taicpu(p1).oper[1]^.ref^.offset mod 8) = 0) then
                      begin
                        { Reduce offset of second reference to see if it is sequential with the first }
                        Dec(ThisRef.offset, 4);
                        if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                          begin
                            { Make sure the constants aren't represented as a
                              negative number, as these won't merge properly }
                            taicpu(p1).opsize := S_Q;
                            { Force a typecast into a 32-bit signed integer (that will then be sign-extended to 64-bit) }
                            taicpu(p1).oper[0]^.val := NewConst;
                            DebugMsg(SPeepholeOptimization + 'Merged two longword-sized constant writes to stack (MovMov2Mov 2c)', p1);
                            RemoveInstruction(p2);
                            Result := True;
                          end;
                      end;
                  end;
                {$endif x86_64}
                else
                  ;
              end;
            end;
        end;
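
      { Sketch of a merge performed by TryConstMerge (stack writes only, and
        only when the first write is suitably aligned):
          movb $0x12,-8(%rbp)
          movb $0x34,-7(%rbp)     -->     movw $0x3412,-8(%rbp)
        The two byte constants combine little-endian into one word write. }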
      var
        TempRegUsed, CrossJump: Boolean;
        PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
        NewSize: topsize; NewOffset: asizeint;
        SourceRef, TargetRef: TReference;
        MovAligned, MovUnaligned: TAsmOp;
        JumpTracking: TLinkedList;
      begin
        Result:=false;

        { remove "mov reg1,reg1"? }
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
            { take care of the register (de)allocs following p }
            RemoveCurrentP(p);
            Result := True;
            exit;
          end;

        { Prevent compiler warnings }
        p_SourceReg := NR_NO;
        p_TargetReg := NR_NO;
        hp1 := nil;

        if taicpu(p).oper[1]^.typ = top_reg then
          begin
            { Saves on a large number of dereferences }
            p_TargetReg := taicpu(p).oper[1]^.reg;

            TransferUsedRegs(TmpUsedRegs);
            last_hp1 := p;
            if GetNextHp1(p) then
              while True do
                begin
                  if (taicpu(hp1).opcode = A_AND) and
                     (taicpu(hp1).oper[1]^.typ = top_reg) and
                     SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[1]^.reg) then
                    begin
                      UpdateUsedRegsBetween(TmpUsedRegs, last_hp1, hp1);
                      if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                         (taicpu(hp1).oper[0]^.typ = top_const) and
                         (taicpu(p).opsize = taicpu(hp1).opsize) then
                        begin
                          case taicpu(p).opsize of
                            S_L:
                              if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                                begin
                                  { Optimize out:
                                      mov x, %reg
                                      and ffffffffh, %reg
                                  }
                                  DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
                                  hp2 := tai(hp1.Previous);
                                  RemoveInstruction(hp1);
                                  //Include(OptsToCheck, aoc_ForceNewIteration);
                                  if GetNextHp1(hp2) then
                                    Continue
                                  else
                                    Exit;
                                end;
                            S_Q: { TODO: Confirm if this is even possible }
                              if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
                                begin
                                  { Optimize out:
                                      mov x, %reg
                                      and ffffffffffffffffh, %reg
                                  }
                                  DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
                                  hp2 := tai(hp1.Previous);
                                  RemoveInstruction(hp1);
                                  //Include(OptsToCheck, aoc_ForceNewIteration);
                                  if GetNextHp1(hp2) then
                                    Continue
                                  else
                                    Exit;
                                end;
                            else
                              ;
                          end;
                          if (
                               { Make sure that if a reference is used, its registers
                                 are not modified in between }
                               (
                                 (taicpu(p).oper[0]^.typ = top_reg) and
                                 not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
                               ) or
                               (
                                 (taicpu(p).oper[0]^.typ = top_ref) and
                                 (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
                                 not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1)
                               )
                             ) and
                             GetNextInstruction(hp1,hp2) and
                             MatchInstruction(hp2,A_TEST,[]) and
                             (
                               MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) or
                               (
                                 { If the register being tested is smaller than the one
                                   that received a bitwise AND, permit it if the constant
                                   fits into the smaller size }
                                 (taicpu(hp1).oper[1]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
                                 SuperRegistersEqual(taicpu(hp1).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) and
                                 (taicpu(hp1).oper[0]^.typ = top_const) and (taicpu(hp1).oper[0]^.val >= 0) and
                                 (GetSubReg(taicpu(hp2).oper[1]^.reg) < GetSubReg(taicpu(hp1).oper[1]^.reg)) and
                                 (
                                   (
                                     (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBL) and
                                     (taicpu(hp1).oper[0]^.val <= $FF)
                                   ) or
                                   (
                                     (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBW) and
                                     (taicpu(hp1).oper[0]^.val <= $FFFF)
                                   {$ifdef x86_64}
                                   ) or
                                   (
                                     (GetSubReg(taicpu(hp2).oper[1]^.reg) = R_SUBD) and
                                     (taicpu(hp1).oper[0]^.val <= $FFFFFFFF)
                                   {$endif x86_64}
                                   )
                                 )
                               )
                             ) and
                             (
                               MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) or
                               MatchOperand(taicpu(hp2).oper[0]^,-1)
                             ) and
                             GetNextInstruction(hp2,hp3) and
                             MatchInstruction(hp3,A_Jcc,A_Setcc,[]) and
                             (taicpu(hp3).condition in [C_E,C_NE]) then
                            begin
                              TransferUsedRegs(TmpUsedRegs);
                              UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                              if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
                                begin
                                  DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
                                  taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
                                  taicpu(hp1).opcode:=A_TEST;

                                  { Shrink the TEST instruction down to the smallest possible size }
                                  case taicpu(hp1).oper[0]^.val of
                                    0..255:
                                      if (taicpu(hp1).opsize <> S_B)
                                         {$ifndef x86_64}
                                         and (
                                           (taicpu(hp1).oper[1]^.typ <> top_reg) or
                                           { Cannot encode byte-sized ESI, EDI, EBP or ESP under i386 }
                                           (GetSupReg(taicpu(hp1).oper[1]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])
                                         )
                                         {$endif x86_64}
                                      then
                                        begin
                                          if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                            { Only print debug message if the TEST instruction
                                              is a different size before and after }
                                            DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testb to reduce instruction size (Test2Test 1a)' , p);
                                          taicpu(hp1).opsize := S_B;
                                          if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBL);
                                        end;
                                    256..65535:
                                      if (taicpu(hp1).opsize <> S_W) then
                                        begin
                                          if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                            { Only print debug message if the TEST instruction
                                              is a different size before and after }
                                            DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testw to reduce instruction size (Test2Test 1b)' , p);
                                          taicpu(hp1).opsize := S_W;
                                          if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBW);
                                        end;
                                    {$ifdef x86_64}
                                    65536..$7FFFFFFF:
                                      if (taicpu(hp1).opsize <> S_L) then
                                        begin
                                          if taicpu(hp1).opsize <> taicpu(hp2).opsize then
                                            { Only print debug message if the TEST instruction
                                              is a different size before and after }
                                            DebugMsg(SPeepholeOptimization + 'test' + debug_opsize2str(taicpu(hp1).opsize) + ' -> testl to reduce instruction size (Test2Test 1c)' , p);
                                          taicpu(hp1).opsize := S_L;
                                          if (taicpu(hp1).oper[1]^.typ = top_reg) then
                                            setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                                        end;
                                    {$endif x86_64}
                                    else
                                      ;
                                  end;
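
                                  { e.g. a hypothetical "testl $1,%eax" shrinks
                                    to "testb $1,%al": the bits masked out lie
                                    outside the low byte, and only ZF is
                                    consumed by the following Jcc/SETcc (the
                                    condition is E or NE, as checked above). }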
                                  RemoveInstruction(hp2);
                                  RemoveCurrentP(p);
                                  Result:=true;
                                  exit;
                                end;
                            end;
                        end;
                      if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                         (taicpu(p).opsize = taicpu(hp1).opsize) and
                         (taicpu(hp1).oper[0]^.typ <> top_ref) and
                         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
                         MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) and
                         (
                           not (cs_opt_level3 in current_settings.optimizerswitches) or
                           (taicpu(hp1).oper[0]^.typ = top_const) or
                           not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)
                         ) then
                        begin
                          { With:
                              mov %reg1,%reg2
                              ...
                              and %reg1,%reg2
                            Or:
                              mov $x,%reg2
                              ...
                              and $x,%reg2
                            Remove the 'and' instruction
                          }
                          DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 4 done',hp1);
                          hp2 := tai(hp1.Previous);
                          RemoveInstruction(hp1);
                          //Include(OptsToCheck, aoc_ForceNewIteration);
                          if GetNextHp1(hp2) then
                            Continue
                          else
                            Exit;
                        end;

                      if IsMOVZXAcceptable and
                         (taicpu(p).oper[0]^.typ <> top_const) then { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
                        begin
                          InputVal := debug_operstr(taicpu(p).oper[0]^);
                          MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
                          case taicpu(p).opsize of
                            S_B:
                              if (taicpu(hp1).oper[0]^.val = $ff) then
                                begin
                                  { Convert:
                                      movb x, %regl        movb x, %regl
                                      andw ffh, %regw      andl ffh, %regd
                                    To:
                                      movzbw x, %regw      movzbl x, %regd
                                    (Identical registers, just different sizes)
                                  }
                                  RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
                                  RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
                                  case taicpu(hp1).opsize of
                                    S_W: NewSize := S_BW;
                                    S_L: NewSize := S_BL;
                                    {$ifdef x86_64}
                                    S_Q: NewSize := S_BQ;
                                    {$endif x86_64}
                                    else
                                      InternalError(2018011510);
                                  end;
                                end
                              else
                                NewSize := S_NO;
                            S_W:
                              if (taicpu(hp1).oper[0]^.val = $ffff) then
                                begin
                                  { Convert:
                                      movw x, %regw
                                      andl ffffh, %regd
                                    To:
                                      movzwl x, %regd
                                    (Identical registers, just different sizes)
                                  }
                                  RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
                                  RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
                                  case taicpu(hp1).opsize of
                                    S_L: NewSize := S_WL;
                                    {$ifdef x86_64}
                                    S_Q: NewSize := S_WQ;
                                    {$endif x86_64}
                                    else
                                      InternalError(2018011511);
                                  end;
                                end
                              else
                                NewSize := S_NO;
                            else
                              NewSize := S_NO;
                          end;

                          if NewSize <> S_NO then
                            begin
                              PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;

                              { The actual optimization }
                              taicpu(p).opcode := A_MOVZX;
                              taicpu(p).changeopsize(NewSize);
                              taicpu(p).loadoper(1, taicpu(hp1).oper[1]^);

                              { Make sure we deal with any reference counts that were increased }
                              if taicpu(hp1).oper[1]^.typ = top_ref then
                                begin
                                  if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
                                    taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
                                  if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
                                    taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
                                end;

                              { Safeguard if "and" is followed by a conditional command }
                              TransferUsedRegs(TmpUsedRegs);
                              UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
                              if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                                begin
                                  { At this point, the "and" command is effectively equivalent to
                                    "test %reg,%reg". This will be handled separately by the
                                    Peephole Optimizer. [Kit] }
                                  DebugMsg(SPeepholeOptimization + PreMessage +
                                    ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                                end
                              else
                                begin
                                  DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
                                    ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
                                  RemoveInstruction(hp1);
                                end;
                              Result := True;
                              Exit;
                              { Go through DeepMOVOpt again (jump to "while True do") }
                              Continue;
                            end;
                        end;
                    end;
                  if taicpu(p).oper[0]^.typ = top_reg then
                    begin
                      p_SourceReg := taicpu(p).oper[0]^.reg;

                      { Look for:
                          mov %reg1,%reg2
                          ??? %reg2,r/m
                        Change to:
                          mov %reg1,%reg2
                          ??? %reg1,r/m
                      }
                      if RegReadByInstruction(p_TargetReg, hp1) and
                         not RegModifiedBetween(p_SourceReg, p, hp1) and
                         DeepMOVOpt(taicpu(p), taicpu(hp1)) then
                        begin
                          { A change has occurred, just not in p }
                          Include(OptsToCheck, aoc_ForceNewIteration);

                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          if not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) and
                             { Just in case something didn't get modified (e.g. an
                               implicit register) }
                             not RegReadByInstruction(p_TargetReg, hp1) then
                            begin
                              { We can remove the original MOV }
                              DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
                              RemoveCurrentP(p);
                              { UsedRegs got updated by RemoveCurrentp }
                              Result := True;
                              Exit;
                            end;

                          { If we know a MOV instruction has become a null operation, we might as well
                            get rid of it now to save time. }
                          if (taicpu(hp1).opcode = A_MOV) and
                             (taicpu(hp1).oper[1]^.typ = top_reg) and
                             SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
                             { Just being a register is enough to confirm it's a null operation }
                             (taicpu(hp1).oper[0]^.typ = top_reg) then
                            begin
                              Result := True;

                              { Speed-up to reduce a pipeline stall... if we had something like...
                                  movl %eax,%edx
                                  movw %dx,%ax
                                ...the second instruction would change to movw %ax,%ax, but
                                given that it is now %ax that's active rather than %eax,
                                penalties might occur due to a partial register write, so instead,
                                change it to a MOVZX instruction when optimising for speed.
                              }
                              if not (cs_opt_size in current_settings.optimizerswitches) and
                                 IsMOVZXAcceptable and
                                 (taicpu(hp1).opsize < taicpu(p).opsize)
                                 {$ifdef x86_64}
                                 { operations already implicitly set the upper 64 bits to zero }
                                 and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
                                 {$endif x86_64}
                                 then
                                begin
                                  DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
                                  case taicpu(p).opsize of
                                    S_W:
                                      if taicpu(hp1).opsize = S_B then
                                        taicpu(hp1).opsize := S_BL
                                      else
                                        InternalError(2020012911);
                                    S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
                                      case taicpu(hp1).opsize of
                                        S_B:
                                          taicpu(hp1).opsize := S_BL;
                                        S_W:
                                          taicpu(hp1).opsize := S_WL;
                                        else
                                          InternalError(2020012912);
                                      end;
                                    else
                                      InternalError(2020012910);
                                  end;
                                  taicpu(hp1).opcode := A_MOVZX;
                                  setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                                end
                              else
                                begin
                                  GetNextInstruction_p := GetNextInstruction(hp1, hp2);
                                  DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
                                  RemoveInstruction(hp1);

                                  { The instruction after what was hp1 is now the immediate next instruction,
                                    so we can continue to make optimisations if it's present }
                                  if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
                                    Exit;
                                  hp1 := hp2;
                                end;
                            end;
                        end;
                      {$ifdef x86_64}
                      { Change:
                          movl %reg1l,%reg2l
                          movq %reg2q,%reg1q
                        To:
                          movl %reg1l,%reg2l
                          andl %reg1l,%reg1l
                      }
                      if (taicpu(p).opsize = S_L) and MatchInstruction(hp1,A_MOV,[S_Q]) and
                         not RegModifiedBetween(p_SourceReg, p, hp1) and
                         MatchOpType(taicpu(hp1), top_reg, top_reg) and
                         SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[0]^.reg) and
                         SuperRegistersEqual(p_SourceReg, taicpu(hp1).oper[1]^.reg) then
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          taicpu(hp1).opsize := S_L;
                          taicpu(hp1).loadreg(0, p_SourceReg);
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                          AllocRegBetween(p_SourceReg, p, hp1, UsedRegs);
                          DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlAndl 1)', hp1);
                          taicpu(hp1).opcode := A_AND;

                          { We may be able to do more and replace references
                            to %reg2q with %reg1q etc. }
                          if (cs_opt_level3 in current_settings.optimizerswitches) and
                             { p_TargetReg is not used between, otherwise the earlier
                               GetNextInstructionUsingReg would have stopped sooner }
                             DoZeroUpper32Opt(p,hp1) then
                            begin
                              Result := True;
                              Exit;
                            end;
                        end;

                      {
                        If we have the following already in the code...
                          movl %reg1l,%reg2l
                          andl %reg1l,%reg1l
                        ...we may be able to do more and replace references to
                        %reg2q with %reg1q etc. (program flow won't reach this
                        point if the second instruction was originally a MOV
                        and just got changed to AND)
                      }
                      if (cs_opt_level3 in current_settings.optimizerswitches) and
                         (taicpu(p).opsize = S_L) and MatchInstruction(hp1,A_AND,[S_L]) and
                         not RegModifiedBetween(p_SourceReg, p, hp1) and
                         { p_TargetReg is not used between, otherwise the earlier
                           GetNextInstructionUsingReg would have stopped sooner }
                         MatchOperand(taicpu(hp1).oper[1]^, p_SourceReg) and
                         (
                           MatchOperand(taicpu(hp1).oper[0]^, p_SourceReg) or
                           MatchOperand(taicpu(hp1).oper[0]^, $ffffffff)
                         ) and
                         DoZeroUpper32Opt(p,hp1) then
                        begin
                          Result := True;
                          Exit;
                        end;
                      {$endif x86_64}
                    end
                  else if taicpu(p).oper[0]^.typ = top_const then
                    begin
                      if (taicpu(hp1).opcode = A_OR) and
                         (taicpu(p).oper[1]^.typ = top_reg) and
                         MatchOperand(taicpu(p).oper[0]^, 0) and
                         MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
                        begin
                          { mov 0, %reg
                            or  ###,%reg
                            Change to (only if the flags are not used):
                            mov ###,%reg
                          }
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          DoOptimisation := True;

                          { Even if the flags are used, we might be able to do the optimisation
                            if the conditions are predictable }
                          if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                            begin
                              { Only perform if ### = %reg (the same register) or equal to 0,
                                so %reg is guaranteed to still have a value of zero }
                              if MatchOperand(taicpu(hp1).oper[0]^, 0) or
                                 MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) then
                                begin
                                  hp2 := hp1;
                                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                                  while RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                                        GetNextInstruction(hp2, hp3) do
                                    begin
                                      { Don't continue modifying if the flags state is getting changed }
                                      if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp3) then
                                        Break;
                                      UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                      if MatchInstruction(hp3, A_Jcc, A_SETcc, A_CMOVcc, []) then
                                        begin
                                          if condition_in(C_E, taicpu(hp3).condition) or (taicpu(hp3).condition in [C_NC, C_NS, C_NO]) then
                                            begin
                                              { Condition is always true }
                                              case taicpu(hp3).opcode of
                                                A_Jcc:
                                                  begin
                                                    { Check for jump shortcuts before we destroy the condition }
                                                    hp4 := hp3;
                                                    DoJumpOptimizations(hp3, TempBool);
                                                    { Make sure hp3 hasn't changed }
                                                    if (hp4 = hp3) then
                                                      begin
                                                        DebugMsg(SPeepholeOptimization + 'Condition is always true (jump made unconditional)', hp3);
                                                        MakeUnconditional(taicpu(hp3));
                                                      end;
                                                    Result := True;
                                                  end;
                                                A_CMOVcc:
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always true (CMOVcc -> MOV)', hp3);
                                                    taicpu(hp3).opcode := A_MOV;
                                                    taicpu(hp3).condition := C_None;
                                                    Result := True;
                                                  end;
                                                A_SETcc:
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always true (changed to MOV 1)', hp3);
                                                    { Convert "set(c) %reg" instruction to "movb $1,%reg" }
                                                    taicpu(hp3).opcode := A_MOV;
                                                    taicpu(hp3).ops := 2;
                                                    taicpu(hp3).condition := C_None;
                                                    taicpu(hp3).opsize := S_B;
                                                    taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
                                                    taicpu(hp3).loadconst(0, 1);
                                                    Result := True;
                                                  end;
                                                else
                                                  InternalError(2021090701);
                                              end;
                                            end
                                          else if (taicpu(hp3).condition in [C_A, C_B, C_C, C_G, C_L, C_NE, C_NZ, C_O, C_S]) then
                                            begin
                                              { Condition is always false }
                                              case taicpu(hp3).opcode of
                                                A_Jcc:
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always false (jump removed)', hp3);
                                                    TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol).decrefs;
                                                    RemoveInstruction(hp3);
                                                    Result := True;
                                                    { Since hp3 was deleted, hp2 must not be updated }
                                                    Continue;
                                                  end;
                                                A_CMOVcc:
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always false (conditional load removed)', hp3);
                                                    RemoveInstruction(hp3);
                                                    Result := True;
                                                    { Since hp3 was deleted, hp2 must not be updated }
                                                    Continue;
                                                  end;
                                                A_SETcc:
                                                  begin
                                                    DebugMsg(SPeepholeOptimization + 'Condition is always false (changed to MOV 0)', hp3);
                                                    { Convert "set(c) %reg" instruction to "movb $0,%reg" }
                                                    taicpu(hp3).opcode := A_MOV;
                                                    taicpu(hp3).ops := 2;
                                                    taicpu(hp3).condition := C_None;
                                                    taicpu(hp3).opsize := S_B;
                                                    taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
                                                    taicpu(hp3).loadconst(0, 0);
                                                    Result := True;
                                                  end;
                                                else
                                                  InternalError(2021090702);
                                              end;
                                            end
                                          else
                                            { Uncertain what to do - don't optimise (although optimise other conditional statements if present) }
                                            DoOptimisation := False;
                                        end;
                                      hp2 := hp3;
                                    end;

                                  if DoOptimisation then
                                    begin
                                      UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                                      if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                                        { Flags are still in use - don't optimise }
                                        DoOptimisation := False;
                                    end;
                                end
                              else
                                DoOptimisation := False;
                            end;

                          if DoOptimisation then
                            begin
                              {$ifdef x86_64}
                              { OR only supports 32-bit sign-extended constants for 64-bit
                                instructions, so compensate for this if the constant is
                                encoded as a value greater than or equal to 2^31 }
                              if (taicpu(hp1).opsize = S_Q) and
                                 (taicpu(hp1).oper[0]^.typ = top_const) and
                                 (taicpu(hp1).oper[0]^.val >= $80000000) then
                                taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val or $FFFFFFFF00000000;
                              {$endif x86_64}
                              DebugMsg(SPeepholeOptimization + 'MOV 0 / OR -> MOV', p);
                              taicpu(hp1).opcode := A_MOV;
                              RemoveCurrentP(p);
                              Result := True;
                              Exit;
                            end;
                        end;
                    end
                  else if
                     { oper[0] is a reference }
                     (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) then
                    begin
                      if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
                        begin
                          if ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
                               (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
                              ) or
                              (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
                               (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
                              )
                             ) and
                             not RegModifiedBetween(Taicpu(hp1).oper[1]^.reg, p, hp1) then
                            { mov ref,reg1
                              lea (reg1,reg2),reg2
                              to
                              add ref,reg2 }
                            begin
                              TransferUsedRegs(TmpUsedRegs);
                              UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);

                              { If the flags register is in use, don't change the instruction to an
                                ADD otherwise this will scramble the flags. [Kit] }
                              if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                                 { reg1 may not be used afterwards }
                                 not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
                                begin
                                  Taicpu(hp1).opcode:=A_ADD;
                                  Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
                                  DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
                                  RemoveCurrentp(p);
                                  result:=true;
                                  exit;
                                end;
                            end;

                          { If the LEA instruction can be converted into an arithmetic instruction,
                            it may be possible to then fold it in the next optimisation. }
                          if ConvertLEA(taicpu(hp1)) then
                            Include(OptsToCheck, aoc_ForceNewIteration);
                        end;

                      {
                        mov  ref,reg0
                        <op> reg0,reg1
                        dealloc reg0
                        to
                        <op> ref,reg1
                      }
                      if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                         (taicpu(hp1).oper[0]^.reg = p_TargetReg) and
                         MatchInstruction(hp1, [A_AND, A_OR, A_XOR, A_ADD, A_SUB, A_CMP, A_TEST, A_CMOVcc, A_BSR, A_BSF, A_POPCNT, A_LZCNT], [taicpu(p).opsize]) and
                         not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, p_TargetReg) and
                         not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1) then
                        begin
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
                          if not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) then
                            begin
                              taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);

                              { loadref increases the reference count, so decrement it again }
                              if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
                                taicpu(p).oper[0]^.ref^.symbol.decrefs;
                              if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
                                taicpu(p).oper[0]^.ref^.relsymbol.decrefs;

                              DebugMsg(SPeepholeOptimization + 'MovOp2Op done',hp1);

                              { See if we can remove the allocation of reg0 }
                              if not RegInRef(p_TargetReg, taicpu(p).oper[0]^.ref^) then
                                TryRemoveRegAlloc(p_TargetReg, p, hp1);

                              RemoveCurrentp(p);
                              Result:=true;
                              exit;
                            end;
                        end;
                    end;
                  { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
                    overwrites the original destination register.  e.g.
                      movl   ###,%reg2d
                      movslq ###,%reg2q   (### doesn't have to be the same as the first one)
                    In this case, we can remove the MOV (Go to "Mov2Nop 5" below) }
                  if MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
                     (taicpu(hp1).oper[1]^.typ = top_reg) and
                     Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
                    begin
                      if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
                        begin
                          if (taicpu(hp1).oper[0]^.typ = top_reg) then
                            case taicpu(p).oper[0]^.typ of
                              top_const:
                                { We have something like:
                                    movb   $x,   %regb
                                    movzbl %regb,%regd
                                  Change to:
                                    movl   $x,   %regd
                                }
                                begin
                                  case taicpu(hp1).opsize of
                                    S_BW:
                                      begin
                                        convert_mov_value(A_MOVSX, $FF);
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
                                        taicpu(p).opsize := S_W;
                                      end;
                                    S_BL:
                                      begin
                                        convert_mov_value(A_MOVSX, $FF);
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                                        taicpu(p).opsize := S_L;
                                      end;
                                    S_WL:
                                      begin
                                        convert_mov_value(A_MOVSX, $FFFF);
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                                        taicpu(p).opsize := S_L;
                                      end;
                                    {$ifdef x86_64}
                                    S_BQ:
                                      begin
                                        convert_mov_value(A_MOVSX, $FF);
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                        taicpu(p).opsize := S_Q;
                                      end;
                                    S_WQ:
                                      begin
                                        convert_mov_value(A_MOVSX, $FFFF);
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                        taicpu(p).opsize := S_Q;
                                      end;
                                    S_LQ:
                                      begin
                                        convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
                                        setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
                                        taicpu(p).opsize := S_Q;
                                      end;
                                    {$endif x86_64}
                                    else
                                      { If hp1 was a MOV instruction, it should have been
                                        optimised already }
                                      InternalError(2020021001);
                                  end;
                                  DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
                                  RemoveInstruction(hp1);
                                  Result := True;
                                  Exit;
                                end;
                              top_ref:
                                begin
                                  { We have something like:
                                      movb   mem,  %regb
                                      movzbl %regb,%regd
                                    Change to:
                                      movzbl mem,  %regd
                                  }
                                  if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
                                    begin
                                      DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
                                      taicpu(p).opcode := taicpu(hp1).opcode;
                                      taicpu(p).opsize := taicpu(hp1).opsize;
                                      taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
                                      RemoveInstruction(hp1);
                                      Result := True;
                                      Exit;
                                    end;
                                end;
                              else
                                if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
                                  { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
                                  Exit;
                            end;
                        end
                      { The RegInOp check makes sure that "movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
                        and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
                        optimised }
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
                          RemoveCurrentP(p);
                          Result := True;
                          Exit;
                        end;
                    end;
                  if (taicpu(hp1).opcode = A_MOV) and
                     (
                       MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
                       {$ifdef x86_64}
                       or (
                         { Permit zero extension from 32- to 64-bit when writing
                           a constant (it will be checked to see if it fits into
                           a signed 32-bit integer) }
                         (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
                         (
                           { Valid situations... writing an unsigned 32-bit
                             immediate, or the destination is a 64-bit register }
                           (taicpu(p).oper[0]^.typ = top_const) or
                           (taicpu(hp1).oper[1]^.typ = top_reg)
                         ) and
                         (taicpu(hp1).oper[0]^.typ = top_reg) and
                         SuperRegistersEqual(p_TargetReg, taicpu(hp1).oper[0]^.reg)
                       )
                       {$endif x86_64}
                     ) then
                    begin
                      { Remember that p_TargetReg contains taicpu(p).oper[1]^.reg }
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);

                      { we have
                          mov x, %treg
                          mov %treg, y
                      }
                      if not(RegInOp(p_TargetReg, taicpu(hp1).oper[1]^)) then
                        if not(RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs)) then
                          begin
                            { we've got
                                mov x, %treg
                                mov %treg, y
                              with %treg not used afterwards }
                            case taicpu(p).oper[0]^.typ of
                              { top_reg is covered by DeepMOVOpt }
                              top_const:
                                begin
                                  { change
                                      mov const, %treg
                                      mov %treg, y
                                    to
                                      mov const, y
                                  }
                                  {$ifdef x86_64}
                                  if (taicpu(hp1).oper[1]^.typ=top_reg) or
                                     (
                                       { For 32-to-64-bit zero-extension, the immediate
                                         must be between 0 and 2^31 - 1 }
                                       (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
                                       ((taicpu(p).oper[0]^.val>=0) and (taicpu(p).oper[0]^.val<=high(longint)))
                                     ) or
                                     (
                                       not ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q)) and
                                       (
                                         (taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))
                                       )
                                     ) then
                                  {$endif x86_64}
                                    begin
                                      taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
                                      DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done', hp1);
                                      RemoveCurrentP(p);
                                      Result := True;
                                      Exit;
                                    end;
                                end;
                              top_ref:
                                case taicpu(hp1).oper[1]^.typ of
                                  top_reg:
                                    { change
                                        mov mem, %treg
                                        mov %treg, %reg
                                      to
                                        mov mem, %reg
                                    }
                                    if not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1) then
                                      begin
                                        {$ifdef x86_64}
                                        { If zero extending from 32-bit to 64-bit,
                                          we have to make sure the replaced
                                          register is the right size }
                                        taicpu(p).loadreg(1, newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),getsubreg(p_TargetReg)));
                                        {$else}
                                        taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
                                        {$endif x86_64}
                                        DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3a done', p);
                                        AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
                                        RemoveInstruction(hp1);
                                        Result := True;
                                        Exit;
                                      end
                                    else if
                                      { Make sure that if a reference is used, its
                                        registers are not modified in between }
                                      not RefModifiedBetween(taicpu(p).oper[0]^.ref^, topsize2memsize[taicpu(p).opsize] shr 3, p, hp1) then
                                      begin
                                        if (taicpu(p).oper[0]^.ref^.base <> NR_NO){$ifdef x86_64} and (taicpu(p).oper[0]^.ref^.base <> NR_RIP){$endif x86_64} then
                                          AllocRegBetween(taicpu(p).oper[0]^.ref^.base, p, hp1, UsedRegs);
                                        if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and (taicpu(p).oper[0]^.ref^.index <> taicpu(p).oper[0]^.ref^.base) then
                                          AllocRegBetween(taicpu(p).oper[0]^.ref^.index, p, hp1, UsedRegs);
                                        taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
                                        if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
                                          taicpu(p).oper[0]^.ref^.symbol.decrefs;
                                        if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
                                          taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
                                        DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done', hp1);
                                        RemoveCurrentP(p);
                                        Result := True;
                                        Exit;
                                      end;
  3857. top_ref:
  3858. if not RegInRef(p_TargetReg, taicpu(p).oper[0]^.ref^) then
  3859. begin
  3860. {$ifdef x86_64}
  3861. { Look for the following to simplify:
  3862. mov x(mem1), %reg
  3863. mov %reg, y(mem2)
  3864. mov x+8(mem1), %reg
  3865. mov %reg, y+8(mem2)
  3866. Change to:
  3867. movdqu x(mem1), %xmmreg
  3868. movdqu %xmmreg, y(mem2)
  3869. ...but only as long as the memory blocks don't overlap
  3870. }
SourceRef := taicpu(p).oper[0]^.ref^;
TargetRef := taicpu(hp1).oper[1]^.ref^;
if (taicpu(p).opsize = S_Q) and
not RegUsedAfterInstruction(p_TargetReg, hp1, TmpUsedRegs) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp2), top_ref, top_reg) then
begin
{ Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
Inc(SourceRef.offset, 8);
if UseAVX then
begin
MovAligned := A_VMOVDQA;
MovUnaligned := A_VMOVDQU;
end
else
begin
MovAligned := A_MOVDQA;
MovUnaligned := A_MOVDQU;
end;
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) and
not RefsMightOverlap(taicpu(p).oper[0]^.ref^, TargetRef, 16) then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
Inc(TargetRef.offset, 8);
if GetNextInstruction(hp2, hp3) and
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp3), top_reg, top_ref) and
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
begin
NewMMReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
if NewMMReg <> NR_NO then
begin
{ Remember that the offsets are 8 ahead }
if ((SourceRef.offset mod 16) = 8) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(SourceRef.base = current_procinfo.framepointer) or
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
) then
taicpu(p).opcode := MovAligned
else
taicpu(p).opcode := MovUnaligned;
taicpu(p).opsize := S_XMM;
taicpu(p).oper[1]^.reg := NewMMReg;
if ((TargetRef.offset mod 16) = 8) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(TargetRef.base = current_procinfo.framepointer) or
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
) then
taicpu(hp1).opcode := MovAligned
else
taicpu(hp1).opcode := MovUnaligned;
taicpu(hp1).opsize := S_XMM;
taicpu(hp1).oper[0]^.reg := NewMMReg;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(NewMMReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)', p);
RemoveInstruction(hp2);
RemoveInstruction(hp3);
Result := True;
Exit;
end;
end;
end
else
begin
{ See if the next references are 8 less rather than 8 greater }
Dec(SourceRef.offset, 16); { -8 the other way }
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
Dec(TargetRef.offset, 8); { Only 8, not 16, as it wasn't incremented earlier, unlike SourceRef }
if not RefsMightOverlap(SourceRef, TargetRef, 16) and
GetNextInstruction(hp2, hp3) and
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp3), top_reg, top_ref) and
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
begin
NewMMReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
if NewMMReg <> NR_NO then
begin
{ hp2 and hp3 hold the starting offsets, so mod = 0 this time }
if ((SourceRef.offset mod 16) = 0) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(SourceRef.base = current_procinfo.framepointer) or
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
) then
taicpu(hp2).opcode := MovAligned
else
taicpu(hp2).opcode := MovUnaligned;
taicpu(hp2).opsize := S_XMM;
taicpu(hp2).oper[1]^.reg := NewMMReg;
if ((TargetRef.offset mod 16) = 0) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(TargetRef.base = current_procinfo.framepointer) or
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
) then
taicpu(hp3).opcode := MovAligned
else
taicpu(hp3).opcode := MovUnaligned;
taicpu(hp3).opsize := S_XMM;
taicpu(hp3).oper[0]^.reg := NewMMReg;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(NewMMReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 2)', p);
RemoveInstruction(hp1);
RemoveCurrentP(p);
Result := True;
Exit;
end;
end;
end;
end;
end;
{$endif x86_64}
end;
else
{ The write target should be a reg or a ref }
InternalError(2021091601);
end;
else
;
end;
end
else if (taicpu(p).oper[0]^.typ = top_const) and
{ %treg is used afterwards, but all eventualities other
than the first MOV instruction being a constant are
covered by DeepMOVOpt, so only check for that }
(
{ For MOV operations, a size saving is only made if the register/const is byte-sized }
not (cs_opt_size in current_settings.optimizerswitches) or
(taicpu(hp1).opsize = S_B)
) and
(
(taicpu(hp1).oper[1]^.typ=top_reg) or
(
{ For 32-to-64-bit zero-extension, the immediate
must be between 0 and 2^31 - 1 }
(taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and
((taicpu(p).oper[0]^.val>=0) and (taicpu(p).oper[0]^.val<=high(longint)))
) or
(
not ((taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q)) and
(
(taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))
)
)
) then
begin
DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
Include(OptsToCheck, aoc_ForceNewIteration);
end;
end;
Break;
end;
end;
if taicpu(p).oper[0]^.typ = top_reg then
begin
{ oper[1] is a reference }
{ Saves on a large number of dereferences }
p_SourceReg := taicpu(p).oper[0]^.reg;
if NotFirstIteration and (cs_opt_level3 in current_settings.optimizerswitches) then
GetNextInstruction_p := GetNextInstructionUsingReg(p, hp1, p_SourceReg)
else
GetNextInstruction_p := GetNextInstruction(p, hp1);
if GetNextInstruction_p and (hp1.typ = ait_instruction) then
begin
if taicpu(p).oper[1]^.typ = top_reg then
begin
p_TargetReg := taicpu(p).oper[1]^.reg;
{ Change:
movl %reg1,%reg2
...
movl x(%reg1),%reg1 (If something other than %reg1 is written to, DeepMOVOpt would have caught it)
...
movl x(%reg2),%regX (%regX can be %reg2 or something else)
To:
movl %reg1,%reg2 (if %regX = %reg2, then remove this instruction)
...
movl x(%reg1),%reg1
...
movl %reg1,%regX
}
if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
(taicpu(hp1).oper[0]^.typ = top_ref) { The other operand will be a register } and
(taicpu(hp1).oper[1]^.reg = p_SourceReg) and
RegInRef(p_SourceReg, taicpu(hp1).oper[0]^.ref^) and
not RegModifiedBetween(p_TargetReg, p, hp1) and
GetNextInstructionUsingReg(hp1, hp2, p_TargetReg) and
MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
(taicpu(hp2).oper[0]^.typ = top_ref) { The other operand will be a register } and
not RegModifiedBetween(p_SourceReg, hp1, hp2) then
begin
SourceRef := taicpu(hp2).oper[0]^.ref^;
if RegInRef(p_TargetReg, SourceRef) and
{ If %reg1 also appears in the second reference, then it will
not refer to the same memory block as the first reference }
not RegInRef(p_SourceReg, SourceRef) then
begin
{ Check to see if the references match if %reg2 is changed to %reg1 }
if SourceRef.base = p_TargetReg then
SourceRef.base := p_SourceReg;
if SourceRef.index = p_TargetReg then
SourceRef.index := p_SourceReg;
{ RefsEqual also checks to ensure both references are non-volatile }
if RefsEqual(taicpu(hp1).oper[0]^.ref^, SourceRef) then
begin
taicpu(hp2).loadreg(0, p_SourceReg);
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegsBetween(TmpUsedRegs, tai(p.Next), hp1);
{ Make sure the register is allocated between these instructions
even though it doesn't change value, since it may cause
optimisations on a later pass to behave incorrectly. (Fixes #41155) }
AllocRegBetween(p_SourceReg, hp1, hp2, TmpUsedRegs);
DebugMsg(SPeepholeOptimization + 'Optimised register duplication and memory read (MovMovMov2MovMovMov)', p);
Result := True;
if taicpu(hp2).oper[1]^.reg = p_TargetReg then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5a done', p);
RemoveCurrentP(p);
Exit;
end
else
begin
if not RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs) then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5b done', p);
RemoveCurrentP(p);
Exit;
end;
end;
{ If we reach this point, p and hp1 weren't actually modified,
so we can do a bit more work on this pass }
end;
end;
end;
end;
end;
end;
GetNextInstruction_p:=GetNextInstruction(p, hp1);
{ All the next optimisations require a next instruction }
if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
Exit;
{ Change:
movl/q (ref), %reg
movd/q %reg, %xmm0
(dealloc %reg)
To:
movd/q (ref), %xmm0
}
if MatchOpType(taicpu(p),top_ref,top_reg) and
MatchInstruction(hp1,[A_MOVD,A_VMOVD{$ifdef x86_64},A_MOVQ,A_VMOVQ{$endif x86_64}],[]) and
MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[1]^.typ=top_reg) and
(GetRegType(taicpu(hp1).oper[1]^.reg)=R_MMREGISTER) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
begin
taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
{ loadref increases the reference count, so decrement it again }
if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
taicpu(p).oper[0]^.ref^.symbol.decrefs;
if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
DebugMsg(SPeepholeOptimization+'Merged MOV and (V)MOVD/(V)MOVQ to eliminate intermediate register (MovMovD/Q2MovD/Q)',p);
RemoveCurrentP(p,hp1);
Result:=True;
Exit;
end;
end;
{ Next instruction is also a MOV ? }
if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
begin
if MatchOpType(taicpu(p), top_const, top_ref) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
TryConstMerge(p, hp1) then
begin
Result := True;
{ In case we have four byte writes in a row, check for 2 more
right now so we don't have to wait for another iteration of
pass 1
}
{ If two byte-writes were merged, the opsize is now S_W, not S_B }
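{ Illustrative sketch (hypothetical constants and offsets): four
adjacent byte writes such as
movb $1,(%rdi); movb $2,1(%rdi); movb $3,2(%rdi); movb $4,3(%rdi)
merge pairwise into two word writes and finally into a single
movl $0x04030201,(%rdi); x86 is little-endian, so the higher-offset
byte lands in the more significant bits of the merged immediate. }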
case taicpu(p).opsize of
S_W:
begin
if GetNextInstruction(p, hp1) and
MatchInstruction(hp1, A_MOV, [S_B]) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [S_B]) and
MatchOpType(taicpu(hp2), top_const, top_ref) and
{ Try to merge the two bytes }
TryConstMerge(hp1, hp2) then
{ Now try to merge the two words (hp2 will get deleted) }
TryConstMerge(p, hp1);
end;
S_L:
begin
{ Though this only really benefits x86_64 and not i386, it
gets a potential optimisation done faster and hence
reduces the number of times OptPass1MOV is entered }
if GetNextInstruction(p, hp1) and
MatchInstruction(hp1, A_MOV, [S_W]) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [S_W]) and
MatchOpType(taicpu(hp2), top_const, top_ref) and
{ Try to merge the two words }
TryConstMerge(hp1, hp2) then
{ This will always fail on i386, so don't bother
calling it unless we're doing x86_64 }
{$ifdef x86_64}
{ Now try to merge the two longwords (hp2 will get deleted) }
TryConstMerge(p, hp1)
{$endif x86_64}
;
end;
else
;
end;
Exit;
end;
if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
(taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
{ mov reg1, mem1 or mov mem1, reg1
mov mem2, reg2 mov reg2, mem2}
begin
if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
{ mov reg1, mem1 or mov mem1, reg1
mov mem2, reg1 mov reg2, mem1}
begin
if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
{ Removes the second statement from
mov reg1, mem1/reg2
mov mem1/reg2, reg1 }
begin
if taicpu(p).oper[0]^.typ=top_reg then
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
RemoveInstruction(hp1);
Result:=true;
if (taicpu(p).oper[1]^.typ = top_reg) then
begin
TransferUsedRegs(TmpUsedRegs);
if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, p, TmpUsedRegs) then
begin
{ reg2 is no longer in use }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 6 done',p);
RemoveCurrentP(p);
end;
end;
exit;
end
else
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
if (taicpu(p).oper[1]^.typ = top_ref) and
{ mov reg1, mem1
mov mem2, reg1 }
(taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
{ change to
mov reg1, mem1 mov reg1, mem1
mov mem2, reg1 cmp reg1, mem2
cmp mem1, reg1
}
begin
RemoveInstruction(hp2);
taicpu(hp1).opcode := A_CMP;
taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
end;
end;
end
else if (taicpu(p).oper[1]^.typ=top_ref) and
OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
begin
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
end
else
begin
TransferUsedRegs(TmpUsedRegs);
if GetNextInstruction(hp1, hp2) and
MatchOpType(taicpu(p),top_ref,top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
(taicpu(hp1).oper[1]^.typ = top_ref) and
MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
MatchOpType(taicpu(hp2),top_ref,top_reg) and
RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
{ mov mem1, %reg1
mov %reg1, mem2
mov mem2, reg2
to:
mov mem1, reg2
mov reg2, mem2}
begin
AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
RemoveInstruction(hp2);
Result := True;
end
{$ifdef i386}
{ this is enabled for i386 only, as the rules to create the reg sets below
are too complicated for x86-64, which would make this code too error-prone
on x86-64
}
else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
{ mov mem1, reg1 mov mem1, reg1
mov reg1, mem2 mov reg1, mem2
mov mem2, reg2 mov mem2, reg1
to: to:
mov mem1, reg1 mov mem1, reg1
mov mem1, reg2 mov reg1, mem2
mov reg1, mem2
or (if mem1 depends on reg1
and/or if mem2 depends on reg2)
to:
mov mem1, reg1
mov reg1, mem2
mov reg1, reg2
}
begin
taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
(getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
(getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
end
else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
begin
taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
end
else
begin
RemoveInstruction(hp2);
end
{$endif i386}
;
end;
end
{ movl [mem1],reg1
movl [mem1],reg2
to
movl [mem1],reg1
movl reg1,reg2
}
else if not CheckMovMov2MovMov2(p, hp1) and
{ movl const1,[mem1]
movl [mem1],reg1
to
movl const1,reg1
movl reg1,[mem1]
}
MatchOpType(Taicpu(p),top_const,top_ref) and
MatchOpType(Taicpu(hp1),top_ref,top_reg) and
(taicpu(p).opsize = taicpu(hp1).opsize) and
RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
begin
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
taicpu(hp1).fileinfo := taicpu(p).fileinfo;
DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
Result:=true;
exit;
end;
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
end;
{ search further than the next instruction for a mov (as long as it's not a jump) }
if not is_calljmpuncondret(taicpu(hp1).opcode) and
{ check as much as possible before the expensive GetNextInstructionUsingRegCond call }
(taicpu(p).oper[1]^.typ = top_reg) and
(taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
begin
{ we work with hp2 here, so hp1 can still be used later on when
checking for GetNextInstruction_p }
hp3 := hp1;
{ Initialise CrossJump (if it becomes True at any point, it will remain True) }
CrossJump := (taicpu(hp1).opcode = A_Jcc);
{ Remember that p_TargetReg contains taicpu(p).oper[1]^.reg }
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if NotFirstIteration then
JumpTracking := TLinkedList.Create
else
JumpTracking := nil;
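{ CrossJump = True means the search has crossed a conditional jump, in
which case %treg must be assumed live on the untaken path (see the
TempRegUsed calculation below); JumpTracking lets
GetNextInstructionUsingRegCond keep searching across jumps it has
already accounted for. }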
while GetNextInstructionUsingRegCond(hp3,hp2,p_TargetReg,JumpTracking,CrossJump) and
{ GetNextInstructionUsingRegCond only searches one instruction ahead unless -O3 is specified }
(hp2.typ=ait_instruction) do
begin
case taicpu(hp2).opcode of
A_POP:
if MatchOperand(taicpu(hp2).oper[0]^,p_TargetReg) then
begin
if not CrossJump and
not RegUsedBetween(p_TargetReg, p, hp2) then
begin
{ We can remove the original MOV since the register
wasn't used between it and being popped from the stack }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3c done',p);
RemoveCurrentp(p, hp1);
Result := True;
JumpTracking.Free;
Exit;
end;
{ Can't go any further }
Break;
end;
A_MOV:
if MatchOperand(taicpu(hp2).oper[0]^,p_TargetReg) and
((taicpu(p).oper[0]^.typ=top_const) or
((taicpu(p).oper[0]^.typ=top_reg) and
not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2))
)
) then
begin
{ we have
mov x, %treg
mov %treg, y
}
{ We don't need to call UpdateUsedRegs for every instruction between
p and hp2 because the register we're concerned about will not
become deallocated (otherwise GetNextInstructionUsingReg would
have stopped at an earlier instruction). [Kit] }
TempRegUsed :=
CrossJump { Assume the register is in use if it crossed a conditional jump } or
RegReadByInstruction(p_TargetReg, hp3) or
RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs);
case taicpu(p).oper[0]^.typ Of
top_reg:
begin
{ change
mov %reg, %treg
mov %treg, y
to
mov %reg, y
}
p_SourceReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
if MatchOperand(taicpu(hp2).oper[1]^, p_SourceReg) then
begin
{ %reg = y - remove hp2 completely (doing it here instead of relying on
the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
if TempRegUsed then
begin
DebugMsg(SPeepholeOptimization + debug_regname(p_SourceReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b)',hp2);
AllocRegBetween(p_SourceReg, p, hp2, UsedRegs);
{ Set the start of the next GetNextInstructionUsingRegCond search
to start at the entry right before hp2 (which is about to be removed) }
hp3 := tai(hp2.Previous);
RemoveInstruction(hp2);
Include(OptsToCheck, aoc_ForceNewIteration);
{ See if there's more we can optimise }
Continue;
end
else
begin
RemoveInstruction(hp2);
{ We can remove the original MOV too }
DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
RemoveCurrentP(p, hp1);
Result:=true;
JumpTracking.Free;
Exit;
end;
end
else
begin
AllocRegBetween(p_SourceReg, p, hp2, UsedRegs);
taicpu(hp2).loadReg(0, p_SourceReg);
DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(p_SourceReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a)',hp2);
{ Check to see if the register also appears in the reference }
if (taicpu(hp2).oper[1]^.typ = top_ref) then
ReplaceRegisterInRef(taicpu(hp2).oper[1]^.ref^, p_TargetReg, p_SourceReg);
{ ReplaceRegisterInRef won't actually replace the register if it's a different size }
if not RegInOp(p_TargetReg, taicpu(hp2).oper[1]^) then
begin
{ Don't remove the first instruction if the temporary register is in use }
if not TempRegUsed then
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
RemoveCurrentP(p, hp1);
Result:=true;
JumpTracking.Free;
Exit;
end;
{ No need to set Result to True here. If there's another instruction later
on that can be optimised, it will be detected when the main Pass 1 loop
reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
hp3 := hp2;
Continue;
end;
end;
end;
top_const:
if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
begin
{ change
mov const, %treg
mov %treg, y
to
mov const, y
}
if (taicpu(hp2).oper[1]^.typ=top_reg) or
((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
begin
RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
if TempRegUsed then
begin
{ Don't remove the first instruction if the temporary register is in use }
DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
{ No need to set Result to True. If there's another instruction later on
that can be optimised, it will be detected when the main Pass 1 loop
reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
end
else
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
RemoveCurrentP(p, hp1);
Result:=true;
Exit;
end;
end;
end;
else
Internalerror(2019103001);
end;
end
else if MatchOperand(taicpu(hp2).oper[1]^, p_TargetReg) then
begin
if not CrossJump and
not RegUsedBetween(p_TargetReg, p, hp2) and
not RegReadByInstruction(p_TargetReg, hp2) then
begin
{ Register is not used before it is overwritten }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3a done',p);
RemoveCurrentp(p, hp1);
Result := True;
Exit;
end;
if (taicpu(p).oper[0]^.typ = top_const) and
(taicpu(hp2).oper[0]^.typ = top_const) then
begin
if taicpu(p).oper[0]^.val = taicpu(hp2).oper[0]^.val then
begin
{ Same value - register hasn't changed }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 2 done', hp2);
RemoveInstruction(hp2);
Include(OptsToCheck, aoc_ForceNewIteration);
{ See if there's more we can optimise }
Continue;
end;
end;
{$ifdef x86_64}
end
{ Change:
movl %reg1l,%reg2l
...
movq %reg2q,%reg3q (%reg1 <> %reg3)
To:
movl %reg1l,%reg2l
...
movl %reg1l,%reg3l (Upper 32 bits of %reg3q will be zero)
If %reg1 = %reg3, convert to:
movl %reg1l,%reg2l
...
andl %reg1l,%reg1l
}
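{ Note: "andl %reg1l,%reg1l" zero-extends the full 64-bit register just
as a 32-bit MOV would; an AND is presumably used because a
same-register movl would be stripped as a no-op move by later passes. }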
else if (taicpu(p).opsize = S_L) and MatchInstruction(hp2,A_MOV,[S_Q]) and
(taicpu(p).oper[0]^.typ = top_reg) and
MatchOpType(taicpu(hp2), top_reg, top_reg) and
SuperRegistersEqual(p_TargetReg, taicpu(hp2).oper[0]^.reg) and
not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2) then
begin
TempRegUsed :=
CrossJump { Assume the register is in use if it crossed a conditional jump } or
RegReadByInstruction(p_TargetReg, hp3) or
RegUsedAfterInstruction(p_TargetReg, hp2, TmpUsedRegs);
taicpu(hp2).opsize := S_L;
taicpu(hp2).loadreg(0, taicpu(p).oper[0]^.reg);
setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp2, UsedRegs);
if (taicpu(p).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) then
begin
{ %reg1 = %reg3 }
DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlAndl 2)', hp2);
taicpu(hp2).opcode := A_AND;
end
else
begin
{ %reg1 <> %reg3 }
DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlMovl 2)', hp2);
end;
if not TempRegUsed then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 8a done', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end
else
begin
{ Initial instruction wasn't actually changed }
Include(OptsToCheck, aoc_ForceNewIteration);
{ if %reg1 = %reg3, don't do the long-distance lookahead that
appears below since %reg1 has technically changed }
if taicpu(hp2).opcode = A_AND then
Break;
end;
{$endif x86_64}
end
else if (taicpu(hp2).oper[0]^.typ = top_ref) and
GetNextInstruction(hp2, hp4) and
(hp4.typ = ait_instruction) and (taicpu(hp4).opcode = A_MOV) then
{ Optimise the following first:
movl [mem1],reg1
movl [mem1],reg2
to
movl [mem1],reg1
movl reg1,reg2
If [mem1] contains the target register and reg1 is the
source register, this optimisation will get missed
and produce less efficient code later on.
}
if CheckMovMov2MovMov2(hp2, hp4) then
{ Initial instruction wasn't actually changed }
Include(OptsToCheck, aoc_ForceNewIteration);
A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
if MatchOpType(taicpu(hp2), top_reg, top_reg) and
MatchOperand(taicpu(hp2).oper[0]^, p_TargetReg) and
SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, p_TargetReg) then
begin
{
Change from:
mov ###, %reg
...
movs/z %reg,%reg (Same register, just different sizes)
To:
movs/z ###, %reg (Longer version)
...
(remove)
}
DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
{ Keep the first instruction as mov if ### is a constant }
if taicpu(p).oper[0]^.typ = top_const then
taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
else
begin
taicpu(p).opcode := taicpu(hp2).opcode;
taicpu(p).opsize := taicpu(hp2).opsize;
end;
DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
RemoveInstruction(hp2);
Result := True;
JumpTracking.Free;
Exit;
end;
else
{ Move down to the if-block below };
end;
{ Also catches MOV/S/Z instructions that aren't modified }
if taicpu(p).oper[0]^.typ = top_reg then
begin
p_SourceReg := taicpu(p).oper[0]^.reg;
if
not RegModifiedByInstruction(p_SourceReg, hp3) and
not RegModifiedBetween(p_SourceReg, hp3, hp2) and
DeepMOVOpt(taicpu(p), taicpu(hp2)) then
begin
Result := True;
{ Just in case something didn't get modified (e.g. an
implicit register). Also, if it does read from this
register, then there's no longer an advantage to
changing the register on subsequent instructions. }
if not RegReadByInstruction(p_TargetReg, hp2) then
begin
{ If a conditional jump was crossed, do not delete
the original MOV no matter what }
if not CrossJump and
{ RegEndOfLife returns True if the register is
deallocated before the next instruction or has
been loaded with a new value }
RegEndOfLife(p_TargetReg, taicpu(hp2)) then
begin
{ We can remove the original MOV }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3b done',p);
RemoveCurrentp(p, hp1);
JumpTracking.Free;
Result := True;
Exit;
end;
if not RegModifiedByInstruction(p_TargetReg, hp2) then
begin
{ See if there's more we can optimise }
hp3 := hp2;
Continue;
end;
end;
end;
end;
{ Break out of the while loop under normal circumstances }
Break;
end;
JumpTracking.Free;
end;
if (aoc_MovAnd2Mov_3 in OptsToCheck) and
(taicpu(p).oper[1]^.typ = top_reg) and
(taicpu(p).opsize = S_L) and
GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
(hp2.typ = ait_instruction) and
(taicpu(hp2).opcode = A_AND) and
(MatchOpType(taicpu(hp2),top_const,top_reg) or
(MatchOpType(taicpu(hp2),top_reg,top_reg) and
MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
) then
begin
if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
begin
if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
begin
{ Optimize out:
mov x, %reg
and ffffffffh, %reg
}
DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
RemoveInstruction(hp2);
Result:=true;
exit;
end;
end;
end;
{ leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
x >= RetOffset) as it doesn't do anything (it writes either to a
parameter or to the temporary storage room for the function
result)
}
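{ Illustrative sketch (hypothetical operands): with a frame pointer set
up, a trailing store such as
movl %eax,8(%ebp)
leave
ret
only writes a parameter slot or the function-result temp, both of
which die with the stack frame, so the store can be discarded. }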
if IsExitCode(hp1) and
(taicpu(p).oper[1]^.typ = top_ref) and
(taicpu(p).oper[1]^.ref^.index = NR_NO) and
(
(
(taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
not (
assigned(current_procinfo.procdef.funcretsym) and
(taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
)
) or
{ Also discard writes to the stack that are below the base pointer,
as this is temporary storage rather than, say, a function result
on the stack. }
(
(taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
(taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
)
) then
begin
RemoveCurrentp(p, hp1);
DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
RemoveLastDeallocForFuncRes(p);
Result:=true;
exit;
end;
if MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) then
begin
if MatchOpType(taicpu(p),top_reg,top_ref) and
(taicpu(hp1).oper[1]^.typ = top_ref) and
RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
begin
{ change
mov reg1, mem1
test/cmp x, mem1
to
mov reg1, mem1
test/cmp x, reg1
}
taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
Result := True;
Exit;
end;
if DoMovCmpMemOpt(p, hp1) then
begin
Result := True;
Exit;
end;
end;
if (taicpu(p).oper[1]^.typ = top_reg) and
(hp1.typ = ait_instruction) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_MOV,[]) and
(SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
(topsize2memsize[taicpu(hp1).opsize]>=topsize2memsize[taicpu(hp2).opsize]) and
(
IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
{$ifdef x86_64}
or
(
(taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
)
{$endif x86_64}
) then
begin
if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
(taicpu(hp2).oper[0]^.typ=top_reg) then
{ change movsX/movzX reg/ref, reg2
add/sub/or/... reg3/$const, reg2
mov reg2 reg/ref
dealloc reg2
to
add/sub/or/... reg3/$const, reg/ref }
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
begin
{ by example:
movswl %si,%eax movswl %si,%eax p
decl %eax addl %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
->
movswl %si,%eax movswl %si,%eax p
decw %eax addw %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
}
DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
taicpu(hp1).changeopsize(taicpu(hp2).opsize);
{
->
movswl %si,%eax movswl %si,%eax p
decw %si addw %dx,%si hp1
movw %ax,%si movw %ax,%si hp2
}
case taicpu(hp1).ops of
1:
begin
taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
if taicpu(hp1).oper[0]^.typ=top_reg then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
2:
begin
taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
if (taicpu(hp1).oper[0]^.typ=top_reg) and
(taicpu(hp1).opcode<>A_SHL) and
(taicpu(hp1).opcode<>A_SHR) and
(taicpu(hp1).opcode<>A_SAR) then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
else
internalerror(2008042701);
end;
{
->
decw %si addw %dx,%si p
}
RemoveInstruction(hp2);
RemoveCurrentP(p, hp1);
Result:=True;
Exit;
end;
end;
if MatchOpType(taicpu(hp2),top_reg,top_reg) and
not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
{ opsize matters for these opcodes; we could probably work around this, but it is not worth the effort }
((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
) and
{ if ref contains a symbol, we cannot change its size to a smaller size }
((taicpu(p).oper[0]^.typ<>top_ref) or (taicpu(p).oper[0]^.ref^.symbol=nil) or
(topsize2memsize[taicpu(p).opsize]<=topsize2memsize[taicpu(hp2).opsize])
)
{$ifdef i386}
{ byte registers of esi, edi, ebp, esp are not available on i386 }
and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
{$endif i386}
then
{ change movsX/movzX reg/ref, reg2
add/sub/or/... regX/$const, reg2
mov reg2, reg3
dealloc reg2
to
movsX/movzX reg/ref, reg3
add/sub/or/... reg3/$const, reg3
}
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
begin
{ by example:
movswl %si,%eax movswl %si,%eax p
decl %eax addl %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
->
movswl %si,%eax movswl %si,%eax p
decw %eax addw %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
}
DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
{ limit size of constants as well to avoid assembler errors, but
check opsize to avoid overflow when left shifting the 1 }
if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
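{ e.g. for a byte-sized destination this masks the constant with
(1 shl 8) - 1 = $FF }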
{$ifdef x86_64}
{ Be careful of, for example:
movl %reg1,%reg2
addl %reg3,%reg2
movq %reg2,%reg4
This will cause problems if the upper 32-bits of %reg3 or %reg4 are non-zero
}
if (taicpu(hp1).opsize = S_L) and (taicpu(hp2).opsize = S_Q) then
begin
taicpu(hp2).changeopsize(S_L);
setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBD);
setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
end;
{$endif x86_64}
taicpu(hp1).changeopsize(taicpu(hp2).opsize);
taicpu(p).changeopsize(taicpu(hp2).opsize);
if taicpu(p).oper[0]^.typ=top_reg then
setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
{
->
movswl %si,%eax movswl %si,%eax p
decw %si addw %dx,%si hp1
movw %ax,%si movw %ax,%si hp2
}
case taicpu(hp1).ops of
1:
begin
taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
if taicpu(hp1).oper[0]^.typ=top_reg then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
2:
begin
taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
if (taicpu(hp1).oper[0]^.typ=top_reg) and
(taicpu(hp1).opcode<>A_SHL) and
(taicpu(hp1).opcode<>A_SHR) and
(taicpu(hp1).opcode<>A_SAR) then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
else
internalerror(2018111801);
end;
{
->
decw %si addw %dx,%si p
}
RemoveInstruction(hp2);
end;
end;
end;
if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
MatchOperand(Taicpu(p).oper[0]^,0) and
(Taicpu(p).oper[1]^.typ = top_reg) and
MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
{ mov reg1,0
bts reg1,operand1 --> mov reg1,operand2
or reg1,operand2 bts reg1,operand1}
begin
Taicpu(hp2).opcode:=A_MOV;
DebugMsg(SPeepholeOptimization + 'MovBtsOr2MovBts done',hp1);
asml.remove(hp1);
insertllitem(hp2,hp2.next,hp1);
RemoveCurrentp(p, hp1);
Result:=true;
exit;
end;
if MatchInstruction(hp1,A_SUB,[Taicpu(p).opsize]) and
MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_CMP,[Taicpu(p).opsize]) and
MatchOperand(Taicpu(p).oper[0]^,Taicpu(hp2).oper[1]^) and
MatchOperand(Taicpu(hp1).oper[0]^,Taicpu(hp2).oper[0]^) then
{ change
mov reg1,reg2
sub reg3,reg2
cmp reg3,reg1
into
mov reg1,reg2
sub reg3,reg2
}
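{ The CMP is redundant: after "sub reg3,reg2" the flags already reflect
reg1 - reg3 (reg2 still held reg1's value at that point), which is
exactly what "cmp reg3,reg1" would compute. }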
begin
DebugMsg(SPeepholeOptimization + 'MovSubCmp2MovSub done',p);
RemoveInstruction(hp2);
Result:=true;
exit;
end;
if (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
MatchInstruction(hp1, A_SHR, A_SAR, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
begin
RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
{$ifdef x86_64}
{ Convert:
movq x(ref),%reg64
shrq y,%reg64
To:
movl x+4(ref),%reg32
shrl y-32,%reg32 (Remove if y = 32)
}
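{ (Little-endian: the upper 32 bits of the qword at x(ref) are the
dword at x+4(ref), hence the offset adjustment below.) }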
  5029. if (taicpu(p).opsize = S_Q) and
  5030. (taicpu(hp1).opcode = A_SHR) and
  5031. (taicpu(hp1).oper[0]^.val >= 32) then
  5032. begin
  5033. PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
  5034. 'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
  5035. { Convert to 32-bit }
  5036. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5037. taicpu(p).opsize := S_L;
  5038. Inc(taicpu(p).oper[0]^.ref^.offset, 4);
  5039. PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
  5040. if (taicpu(hp1).oper[0]^.val = 32) then
  5041. begin
  5042. DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
  5043. RemoveInstruction(hp1);
  5044. end
  5045. else
  5046. begin
  5047. { This will potentially open up more arithmetic operations since
  5048. the peephole optimizer now has a big hint that only the lower
  5049. 32 bits are currently in use (and opcodes are smaller in size) }
  5050. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
  5051. taicpu(hp1).opsize := S_L;
  5052. Dec(taicpu(hp1).oper[0]^.val, 32);
  5053. DebugMsg(SPeepholeOptimization + PreMessage +
  5054. '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
  5055. end;
  5056. Result := True;
  5057. Exit;
  5058. end;
  5059. {$endif x86_64}
  5060. { Convert:
  5061. movl x(ref),%reg
  5062. shrl $24,%reg
  5063. To:
  5064. movzbl x+3(ref),%reg
  5065. Do similar things for movl; shrl $16 -> movzwl and movw; shrw $8 -> movzbw
  5066. Also accept sar instead of shr, but convert to movsx instead of movzx
  5067. }
  5068. if taicpu(hp1).opcode = A_SHR then
  5069. MovUnaligned := A_MOVZX
  5070. else
  5071. MovUnaligned := A_MOVSX;
  5072. NewSize := S_NO;
  5073. NewOffset := 0;
  5074. case taicpu(p).opsize of
  5075. S_B:
  5076. { No valid combinations };
  5077. S_W:
  5078. if (taicpu(hp1).oper[0]^.val = 8) then
  5079. begin
  5080. NewSize := S_BW;
  5081. NewOffset := 1;
  5082. end;
  5083. S_L:
  5084. case taicpu(hp1).oper[0]^.val of
  5085. 16:
  5086. begin
  5087. NewSize := S_WL;
  5088. NewOffset := 2;
  5089. end;
  5090. 24:
  5091. begin
  5092. NewSize := S_BL;
  5093. NewOffset := 3;
  5094. end;
  5095. else
  5096. ;
  5097. end;
  5098. {$ifdef x86_64}
  5099. S_Q:
  5100. case taicpu(hp1).oper[0]^.val of
  5101. 32:
  5102. begin
  5103. if taicpu(hp1).opcode = A_SAR then
  5104. begin
  5105. { 32-bit to 64-bit is a distinct instruction }
  5106. MovUnaligned := A_MOVSXD;
  5107. NewSize := S_LQ;
  5108. NewOffset := 4;
  5109. end
  5110. else
  5111. { Should have been handled by MovShr2Mov above }
  5112. InternalError(2022081811);
  5113. end;
  5114. 48:
  5115. begin
  5116. NewSize := S_WQ;
  5117. NewOffset := 6;
  5118. end;
  5119. 56:
  5120. begin
  5121. NewSize := S_BQ;
  5122. NewOffset := 7;
  5123. end;
  5124. else
  5125. ;
  5126. end;
  5127. {$endif x86_64}
  5128. else
  5129. InternalError(2022081810);
  5130. end;
  5131. if (NewSize <> S_NO) and
  5132. (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFF - NewOffset) then
  5133. begin
  5134. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
  5135. 'shr' + debug_opsize2str(taicpu(p).opsize) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> ' +
  5136. debug_op2str(MovUnaligned);
  5137. {$ifdef x86_64}
  5138. if MovUnaligned <> A_MOVSXD then
  5139. { Don't add size suffix for MOVSXD }
  5140. {$endif x86_64}
  5141. PreMessage := PreMessage + debug_opsize2str(NewSize);
  5142. Inc(taicpu(p).oper[0]^.ref^.offset, NewOffset);
  5143. taicpu(p).opcode := MovUnaligned;
  5144. taicpu(p).opsize := NewSize;
  5145. DebugMsg(SPeepholeOptimization + PreMessage + ' ' +
  5146. debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr/Sar2Movx)', p);
  5147. RemoveInstruction(hp1);
  5148. Result := True;
  5149. Exit;
  5150. end;
  5151. end;
  5152. { Backward optimisation shared with OptPass2MOV }
  5153. if FuncMov2Func(p, hp1) then
  5154. begin
  5155. Result := True;
  5156. Exit;
  5157. end;
  5158. end;
  5159. function TX86AsmOptimizer.OptPass1MOVD(var p : tai) : boolean;
  5160. { This function also handles the 64-bit version, MOVQ }
  5161. var
  5162. hp1: tai;
  5163. begin
  5164. Result:=false;
  5165. { Change:
  5166. movd/q %xmm0, %reg
  5167. ...
  5168. movl/q %reg, (ref)
  5169. (dealloc %reg)
  5170. To:
  5171. movd/q %xmm0, (ref)
  5172. }
  5173. if MatchOpType(taicpu(p),top_reg,top_reg) and
  5174. (GetRegType(taicpu(p).oper[0]^.reg)=R_MMREGISTER) and
  5175. (GetRegType(taicpu(p).oper[1]^.reg)=R_INTREGISTER) and
  5176. GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
  5177. MatchInstruction(hp1, A_MOV, []) and
  5178. MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
  5179. (taicpu(hp1).oper[1]^.typ=top_ref) and
  5180. not RegInRef(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.ref^) then
  5181. begin
  5182. TransferUsedRegs(TmpUsedRegs);
  5183. UpdateUsedRegsBetween(TmpUsedRegs,p,hp1);
  5184. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
  5185. begin
  5186. if (
  5187. { Instructions are always adjacent under -O2 and under }
  5188. not(cs_opt_level3 in current_settings.optimizerswitches) or
  5189. (
  5190. (
  5191. (taicpu(hp1).oper[1]^.ref^.base=NR_NO) or
  5192. not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base,p,hp1)
  5193. ) and
  5194. (
  5195. (taicpu(hp1).oper[1]^.ref^.index=NR_NO) or
  5196. not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index,p,hp1)
  5197. )
  5198. )
  5199. ) then
  5200. begin
  5201. DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1a)',p);
  5202. taicpu(p).loadref(1,taicpu(hp1).oper[1]^.ref^);
  5203. { loadref increases the reference count, so decrement it again }
  5204. if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
  5205. taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
  5206. if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
  5207. taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
  5208. RemoveInstruction(hp1);
  5209. Include(OptsToCheck, aoc_ForceNewIteration);
  5210. end
  5211. else if not RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) then
  5212. begin
  5213. { Still possible to optimise if hp1 is converted instead }
  5214. DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1b)',hp1);
  5215. { Decrement the reference prior to replacing it }
  5216. if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
  5217. taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
  5218. if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
  5219. taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
  5220. taicpu(hp1).opcode:=taicpu(p).opcode;
  5221. taicpu(hp1).opsize:=taicpu(p).opsize;
  5222. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  5223. TransferUsedRegs(TmpUsedRegs);
  5224. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,TmpUsedRegs);
  5225. RemoveCurrentP(p);
  5226. Result:=True;
  5227. Exit;
  5228. end;
  5229. end;
  5230. end;
  5231. end;
  5232. function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
  5233. var
  5234. hp1 : tai;
  5235. begin
  5236. Result:=false;
  5237. if taicpu(p).ops <> 2 then
  5238. exit;
  5239. if (MatchOpType(taicpu(p),top_reg,top_reg) and GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg)) or
  5240. GetNextInstruction(p,hp1) then
  5241. begin
  5242. if MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
  5243. (taicpu(hp1).ops = 2) then
  5244. begin
  5245. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  5246. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  5247. { movXX reg1, mem1 or movXX mem1, reg1
  5248. movXX mem2, reg2 movXX reg2, mem2}
  5249. begin
  5250. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  5251. { movXX reg1, mem1 or movXX mem1, reg1
  5252. movXX mem2, reg1 movXX reg2, mem1}
  5253. begin
  5254. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  5255. begin
  5256. { Removes the second statement from
  5257. movXX reg1, mem1/reg2
  5258. movXX mem1/reg2, reg1
  5259. }
  5260. if taicpu(p).oper[0]^.typ=top_reg then
  5261. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  5262. { Removes the second statement from
  5263. movXX mem1/reg1, reg2
  5264. movXX reg2, mem1/reg1
  5265. }
  5266. if (taicpu(p).oper[1]^.typ=top_reg) and
  5267. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
  5268. begin
  5269. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
  5270. RemoveInstruction(hp1);
  5271. RemoveCurrentp(p); { p will now be equal to the instruction that follows what was hp1 }
  5272. Result:=true;
  5273. exit;
  5274. end
  5275. else if (taicpu(hp1).oper[1]^.typ<>top_ref) or (not(vol_write in taicpu(hp1).oper[1]^.ref^.volatility)) and
  5276. (taicpu(hp1).oper[0]^.typ<>top_ref) or (not(vol_read in taicpu(hp1).oper[0]^.ref^.volatility)) then
  5277. begin
  5278. DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
  5279. RemoveInstruction(hp1);
  5280. Result:=true;
  5281. exit;
  5282. end;
  5283. end
  5284. end;
  5285. end;
  5286. end;
  5287. end;
  5288. end;
  5289. {$ifndef i8086}
  5290. function TX86AsmOptimizer.OptPass1NOT(var p: tai): Boolean;
  5291. var
  5292. hp1, p_next: tai;
  5293. flags_used: Boolean;
  5294. procedure Do_NotAnd2Andn1;
  5295. var
  5296. tempoper: poper;
  5297. begin
  5298. { Change "and %reg1,%reg2" to "andn %reg2,%reg1,%reg2" }
  5299. taicpu(hp1).allocate_oper(3);
  5300. taicpu(hp1).ops:=3;
  5301. { Swap the 1st and 2nd operands by swapping their pointers }
  5302. tempoper:=taicpu(hp1).oper[1];
  5303. taicpu(hp1).oper[1]:=taicpu(hp1).oper[0];
  5304. taicpu(hp1).oper[0]:=tempoper;
  5305. taicpu(hp1).loadreg(2, tempoper^.reg);
  5306. taicpu(hp1).opcode:=A_ANDN;
  5307. end;
  5308. begin
  5309. Result:=False;
  5310. { Don't optimise this for size as ANDN is bigger than NOT and AND combined }
  5311. if not (cs_opt_size in current_settings.optimizerswitches) and
  5312. (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.optimizecputype]) then
  5313. begin
  5314. { Convert: To:
  5315. not %reg1 andn %reg2,%reg1,%reg2
  5316. and %reg1,%reg2 not %reg1
  5317. Or remove "not %reg1" completely if %reg1 is deallocated.
  5318. This breaks the dependency chain.
  5319. }
          if (taicpu(p).oper[0]^.typ=top_reg) and
            { ANDN only supports 32-bit and 64-bit }
            (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
            GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
            MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
            MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^.reg) and
            (taicpu(hp1).oper[1]^.typ=top_reg) and
            (taicpu(hp1).oper[1]^.reg<>taicpu(p).oper[0]^.reg) and
            (
              { p and hp1 are adjacent on -O2 and below }
              not(cs_opt_level3 in current_settings.optimizerswitches) or
              not RegModifiedBetween(taicpu(hp1).oper[1]^.reg,p,hp1)
            ) then
            begin
              p_next:=tai(p.Next);
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegsBetween(TmpUsedRegs, p_next, hp1);
              { Make a note as to whether the flags are in use, because
                RegUsedAfterInstruction might change the state }
              flags_used:=RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs);
              if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
                begin
                  DebugMsg(SPeepholeOptimization + 'NotAnd2Andn 1 done', p);
                  Do_NotAnd2Andn1;
                  RemoveCurrentP(p, p_next);
                  Result:=True;
                  Exit;
                end
              else if not flags_used then
                begin
                  DebugMsg(SPeepholeOptimization + 'NotAnd2AndnNot 1 done', p);
                  Do_NotAnd2Andn1;
                  asml.Remove(p);
                  asml.InsertAfter(p, hp1);
                  AllocRegBetween(taicpu(p).oper[0]^.reg, hp1, p, TmpUsedRegs);
                  { Make sure the pass 2 iteration continues from the
                    correct place, right after p }
                  p:=p_next;
                  Result:=True;
                  Exit;
                end;
            end;
        end;
    end;
{$endif not i8086}
  function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
    var
      hp1 : tai;
    begin
      result:=false;
      { replace
          <Op>X   %mreg1,%mreg2   // Op in [ADD,MUL]
          MovX    %mreg2,%mreg1
          dealloc %mreg2
        by
          <Op>X   %mreg2,%mreg1
        ?
      }
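      { e.g. (illustration): "addsd %xmm0,%xmm1; movapd %xmm1,%xmm0" with
        %xmm1 subsequently unused becomes "addsd %xmm1,%xmm0" }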
      if GetNextInstruction(p,hp1) and
        { we mix single and double operations here because we assume that the compiler
          generates vmovapd only after double operations and vmovaps only after single operations }
        MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
        MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
        MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
        (taicpu(p).oper[0]^.typ=top_reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
          if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
            begin
              taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
              taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
              DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
              RemoveInstruction(hp1);
              result:=true;
            end;
        end;
    end;
  function TX86AsmOptimizer.OptPass1Test(var p: tai) : boolean;
    var
      hp1, p_label, p_dist, hp1_dist, hp1_last: tai;
      JumpLabel, JumpLabel_dist: TAsmLabel;
      FirstValue, SecondValue: TCGInt;

    function OptimizeJump(var InputP: tai): Boolean;
      var
        TempBool: Boolean;
      begin
        Result := False;
        TempBool := True;
        if DoJumpOptimizations(InputP, TempBool) or
          not TempBool then
          begin
            Result := True;
            if Assigned(InputP) then
              begin
                { If CollapseZeroDistJump optimised the jump, InputP will be
                  left pointing at the label (or an align before it) that
                  followed it, whether that label is live or dead, so skip
                  past any dead label or align }
                if (InputP.typ = ait_align) or
                  (
                    (InputP.typ = ait_label) and
                    not (tai_label(InputP).labsym.is_used)
                  ) then
                  GetNextInstruction(InputP, InputP);
              end;
            Exit;
          end;
      end;
    begin
      Result := False;
      if (taicpu(p).oper[0]^.typ = top_const) and
        (taicpu(p).oper[0]^.val <> -1) then
        begin
          { Convert unsigned maximum constants to -1 to aid optimisation }
          case taicpu(p).opsize of
            S_B:
              if (taicpu(p).oper[0]^.val and $FF) = $FF then
                begin
                  taicpu(p).oper[0]^.val := -1;
                  Result := True;
                  Exit;
                end;
            S_W:
              if (taicpu(p).oper[0]^.val and $FFFF) = $FFFF then
                begin
                  taicpu(p).oper[0]^.val := -1;
                  Result := True;
                  Exit;
                end;
            S_L:
              if (taicpu(p).oper[0]^.val and $FFFFFFFF) = $FFFFFFFF then
                begin
                  taicpu(p).oper[0]^.val := -1;
                  Result := True;
                  Exit;
                end;
{$ifdef x86_64}
            S_Q:
              { Storing anything greater than $7FFFFFFF is not possible so do
                nothing };
{$endif x86_64}
            else
              InternalError(2021121001);
          end;
        end;
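      { e.g. "testw $65535,%ax" becomes "testw $-1,%ax" above, which the
        code below can then treat like "testw %ax,%ax" (illustration) }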
      if GetNextInstruction(p, hp1) and
        TrySwapMovCmp(p, hp1) then
        begin
          Result := True;
          Exit;
        end;

      p_label := nil;
      JumpLabel := nil;

      if MatchInstruction(hp1, A_Jcc, []) then
        begin
          if OptimizeJump(hp1) then
            begin
              Result := True;
              if Assigned(hp1) then
                begin
                  { As in OptimizeJump: skip the label (or align) that
                    CollapseZeroDistJump may have left hp1 pointing at }
                  if (hp1.typ = ait_align) or
                    (
                      (hp1.typ = ait_label) and
                      not (tai_label(hp1).labsym.is_used)
                    ) then
                    GetNextInstruction(hp1, hp1);
                end;

              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.Next));

              if not Assigned(hp1) or
                (
                  not MatchInstruction(hp1, A_Jcc, A_SETcc, A_CMOVcc, []) and
                  not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
                ) then
                begin
                  { No more conditional jumps; conditional statement is no longer required }
                  DebugMsg(SPeepholeOptimization + 'Removed unnecessary condition (Test2Nop)', p);
                  RemoveCurrentP(p);
                end;
              Exit;
            end;

          if IsJumpToLabel(taicpu(hp1)) then
            begin
              JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
              if Assigned(JumpLabel) then
                p_label := getlabelwithsym(JumpLabel);
            end;
        end;

      { Search for:
          test $x,(reg/ref)
          jne  @lbl1
          test $y,(reg/ref)   (same register or reference)
          jne  @lbl1

        Change to:
          test $(x or y),(reg/ref)
          jne  @lbl1

        (Note, this doesn't work with je instead of jne)

        Also catch cases where "cmp $0,(reg/ref)" and "test %reg,%reg" are used.

        Also search for:
          test $x,(reg/ref)
          je   @lbl1
          ...
          test $y,(reg/ref)
          je/jne @lbl2

        If (x or y) = x, then the second jump is deterministic
      }
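      { e.g. (illustration):
          "testb $1,(%eax); jne @lbl; testb $2,(%eax); jne @lbl"
        becomes
          "testb $3,(%eax); jne @lbl" }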
      if (
           (
             (taicpu(p).oper[0]^.typ = top_const) or
             (
               { test %reg,%reg can be considered equivalent to test $-1,%reg }
               (taicpu(p).oper[0]^.typ = top_reg) and
               MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg)
             )
           ) and
           MatchInstruction(hp1, A_JCC, [])
         ) then
        begin
          if (taicpu(p).oper[0]^.typ = top_reg) and
            MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg) then
            FirstValue := -1
          else
            FirstValue := taicpu(p).oper[0]^.val;

          { If we have several test/jne's in a row, it might be the case that
            the second label doesn't go to the same location, but the one
            after it might (e.g. test; jne @lbl1; test; jne @lbl2; test;
            jne @lbl1), so accommodate for this with a while loop.
          }
          hp1_last := hp1;

          while (
              (
                (taicpu(p).oper[1]^.typ = top_reg) and
                GetNextInstructionUsingReg(hp1_last, p_dist, taicpu(p).oper[1]^.reg)
              ) or GetNextInstruction(hp1_last, p_dist)
            ) and (p_dist.typ = ait_instruction) do
            begin
              if (
                   (
                     (taicpu(p_dist).opcode = A_TEST) and
                     (
                       (taicpu(p_dist).oper[0]^.typ = top_const) or
                       { test %reg,%reg can be considered equivalent to test $-1,%reg }
                       MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^)
                     )
                   ) or
                   (
                     { cmp $0,%reg = test %reg,%reg }
                     (taicpu(p_dist).opcode = A_CMP) and
                     MatchOperand(taicpu(p_dist).oper[0]^, 0)
                   )
                 ) and
                { Make sure the destination operands are actually the same }
                MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
                GetNextInstruction(p_dist, hp1_dist) and
                MatchInstruction(hp1_dist, A_JCC, []) then
                begin
                  if OptimizeJump(hp1_dist) then
                    begin
                      Result := True;
                      Exit;
                    end;

                  if
                    (taicpu(p_dist).opcode = A_CMP) { constant will be zero } or
                    (
                      (taicpu(p_dist).oper[0]^.typ = top_reg) and
                      MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^.reg)
                    ) then
                    SecondValue := -1
                  else
                    SecondValue := taicpu(p_dist).oper[0]^.val;

                  { If both of the TEST constants are identical, delete the
                    second TEST that is unnecessary (be careful though, just
                    in case the flags are modified in between) }
                  if (FirstValue = SecondValue) then
                    begin
                      if condition_in(taicpu(hp1_dist).condition, taicpu(hp1).condition) then
                        begin
                          { Since the second jump's condition is a subset of the first, we
                            know it will never branch because the first jump dominates it.
                            Get it out of the way now rather than wait for the jump
                            optimisations for a speed boost. }
                          if IsJumpToLabel(taicpu(hp1_dist)) then
                            TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                          DebugMsg(SPeepholeOptimization + 'Removed dominated jump (via TEST/Jcc/TEST)', hp1_dist);
                          RemoveInstruction(hp1_dist);
                          Result := True;
                        end
                      else if condition_in(inverse_cond(taicpu(hp1).condition), taicpu(hp1_dist).condition) then
                        begin
                          { If the inverse of the first condition is a subset of the second,
                            the second one will definitely branch if the first one doesn't }
                          DebugMsg(SPeepholeOptimization + 'Conditional jump will always branch (via TEST/Jcc/TEST)', hp1_dist);

                          { We can remove the TEST instruction too }
                          DebugMsg(SPeepholeOptimization + 'TEST/Jcc/TEST; removed superfluous TEST', p_dist);
                          RemoveInstruction(p_dist);

                          MakeUnconditional(taicpu(hp1_dist));
                          RemoveDeadCodeAfterJump(hp1_dist);

                          { Since the jump is now unconditional, we can't
                            continue any further with this particular
                            optimisation. The original TEST is still intact
                            though, so there might be something else we can
                            do }
                          Include(OptsToCheck, aoc_ForceNewIteration);
                          Break;
                        end;

                      if Result or
                        { If a jump wasn't removed or made unconditional, only
                          remove the identical TEST instruction if the flags
                          weren't modified }
                        not RegModifiedBetween(NR_DEFAULTFLAGS, hp1, p_dist) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'TEST/Jcc/TEST; removed superfluous TEST', p_dist);
                          RemoveInstruction(p_dist);

                          { If the jump was removed or made unconditional, we
                            don't need to allocate NR_DEFAULTFLAGS over the
                            entire range }
                          if not Result then
                            begin
                              { Mark the flags as 'in use' over the entire range }
                              AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp1_dist, UsedRegs);

                              { Speed gain - continue search from the Jcc instruction }
                              hp1_last := hp1_dist;

                              { Only the TEST instruction was removed, and the
                                original was unchanged, so we can safely do
                                another iteration of the while loop }
                              Include(OptsToCheck, aoc_ForceNewIteration);
                              Continue;
                            end;
                          Exit;
                        end;
                    end;

                  hp1_last := nil;

                  if (taicpu(hp1).condition in [C_NE, C_NZ]) and
                    (
                      { In this situation, the TEST/JNE pairs must be adjacent (fixes #40366) }
                      { Always adjacent under -O2 and under }
                      not(cs_opt_level3 in current_settings.optimizerswitches) or
                      (
                        GetNextInstruction(hp1, hp1_last) and
                        (hp1_last = p_dist)
                      )
                    ) and
                    (
                      (
                        { Test the following variant:
                            test $x,(reg/ref)
                            jne  @lbl1
                            test $y,(reg/ref)
                            je   @lbl2
                          @lbl1:
                          Becomes:
                            test $(x or y),(reg/ref)
                            je   @lbl2
                          @lbl1: (may become a dead label)
                        }
                        (taicpu(hp1_dist).condition in [C_E, C_Z]) and
                        GetNextInstruction(hp1_dist, hp1_last) and
                        (hp1_last = p_label)
                      ) or
                      (
                        (taicpu(hp1_dist).condition in [C_NE, C_NZ]) and
                        { If the first instruction is test %reg,%reg or test $-1,%reg,
                          then the second jump will never branch, so it can also be
                          removed regardless of where it goes }
                        (
                          (FirstValue = -1) or
                          (SecondValue = -1) or
                          MatchOperand(taicpu(hp1_dist).oper[0]^, taicpu(hp1).oper[0]^)
                        )
                      )
                    ) then
                    begin
                      { Same jump location... can be a register since nothing's changed }

                      { If any of the entries are equivalent to test %reg,%reg, then the
                        merged $(x or y) is also test %reg,%reg / test $-1,%reg }
                      taicpu(p).loadconst(0, FirstValue or SecondValue);

                      if (hp1_last = p_label) then
                        begin
                          { Variant }
                          DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JE/@Lbl merged', p);
                          RemoveInstruction(p_dist);
                          if Assigned(JumpLabel) then
                            JumpLabel.decrefs;
                          RemoveInstruction(hp1);
                        end
                      else
                        begin
                          { Only remove the second test if no jumps or other conditional instructions follow }
                          TransferUsedRegs(TmpUsedRegs);
                          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                          UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
                          if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1_dist, TmpUsedRegs) then
                            begin
                              DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JNE merged', p);
                              RemoveInstruction(p_dist);

                              { Remove the first jump, not the second, to keep
                                any register deallocations between the second
                                TEST/JNE pair in the same place. Aids future
                                optimisation. }
                              if Assigned(JumpLabel) then
                                JumpLabel.decrefs;
                              RemoveInstruction(hp1);
                            end
                          else
                            begin
                              DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JNE merged (second TEST preserved)', p);
                              if IsJumpToLabel(taicpu(hp1_dist)) then
                                TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                              { Remove second jump in this instance }
                              RemoveInstruction(hp1_dist);
                            end;
                        end;

                      Result := True;
                      Exit;
                    end;
                end;

              if { If -O2 and under, it may stop on any old instruction }
                (cs_opt_level3 in current_settings.optimizerswitches) and
                (taicpu(p).oper[1]^.typ = top_reg) and
                not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, p_dist) then
                begin
                  hp1_last := p_dist;
                  Continue;
                end;

              Break;
            end;
        end;

      { Search for:
          test %reg,%reg
          j(c1) @lbl1
          ...
        @lbl1:
          test %reg,%reg   (same register)
          j(c2) @lbl2

        If c1 is a subset of c2, change to:
          test %reg,%reg
          j(c1) @lbl2

        (@lbl1 may become a dead label as a result)
      }
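      { e.g. (illustration): "testl %eax,%eax; je @A; ... @A: testl %eax,%eax; je @B"
        - the first je can then target @B directly }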
      if (taicpu(p).oper[1]^.typ = top_reg) and
        (taicpu(p).oper[0]^.typ = top_reg) and
        (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
        { p_label <> nil is a marker that hp1 is a Jcc to a label }
        Assigned(p_label) and
        GetNextInstruction(p_label, p_dist) and
        MatchInstruction(p_dist, A_TEST, []) and
        { It's fine if the second test uses smaller sub-registers }
        (taicpu(p_dist).opsize <= taicpu(p).opsize) and
        MatchOpType(taicpu(p_dist), top_reg, top_reg) and
        SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
        SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and
        GetNextInstruction(p_dist, hp1_dist) and
        MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
        begin
          JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);

          if JumpLabel = JumpLabel_dist then
            { This is an infinite loop }
            Exit;

          { Best optimisation when the first condition is a subset (or equal) of the second }
          if condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) then
            begin
              { Any registers used here will already be allocated }
              if Assigned(JumpLabel) then
                JumpLabel.DecRefs;
              DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1);
              taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); { This also increases the reference count }
              Result := True;
              Exit;
            end;
        end;
    end;
  function TX86AsmOptimizer.OptPass1Add(var p : tai) : boolean;
    var
      hp1, hp2: tai;
      ActiveReg: TRegister;
      OldOffset: asizeint;
      ThisConst: TCGInt;

    function RegDeallocated: Boolean;
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
      end;

    begin
      result:=false;
      hp1 := nil;

      { replace
          addX    const,%reg1
          leaX    (%reg1,%reg1,Y),%reg2   // Base or index might not be equal to reg1
          dealloc %reg1
        by
          leaX    const+const*Y(%reg1,%reg1,Y),%reg2
      }
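      { e.g. (hypothetical values for illustration):
          "addl $4,%eax; leal 8(%eax,%eax,2),%edx" with %eax deallocated
        becomes
          "leal 20(%eax,%eax,2),%edx"   (8 + 4 + 4*2) }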
      if MatchOpType(taicpu(p),top_const,top_reg) then
        begin
          ActiveReg := taicpu(p).oper[1]^.reg;
          { Ensures the entire register was updated }
          if (taicpu(p).opsize >= S_L) and
            GetNextInstructionUsingReg(p,hp1, ActiveReg) and
            MatchInstruction(hp1,A_LEA,[]) and
            (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
             SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
            (
              { Cover the case where the register in the reference is also the destination register }
              Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
              (
                { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
                not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
                RegDeallocated
              )
            ) then
            begin
              OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
{$push}
{$R-}{$Q-}
              { Explicitly disable overflow checking for these offset calculations,
                as they do not matter for the final result }
              if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
              if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{$pop}
{$ifdef x86_64}
              if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
                begin
                  { Overflow; abort }
                  taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
                end
              else
{$endif x86_64}
                begin
                  DebugMsg(SPeepholeOptimization + 'AddLea2Lea done',p);
                  if not (cs_opt_level3 in current_settings.optimizerswitches) then
                    { hp1 is the immediate next instruction for sure - good for a quick speed boost }
                    RemoveCurrentP(p, hp1)
                  else
                    RemoveCurrentP(p);
                  result:=true;
                  Exit;
                end;
            end;

          if (
               { Save calling GetNextInstructionUsingReg again }
               Assigned(hp1) or
               GetNextInstructionUsingReg(p,hp1, ActiveReg)
             ) and
            MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
            (taicpu(hp1).oper[1]^.reg = ActiveReg) then
            begin
              { Make sure the flags aren't in use by the second operation }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
              if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                begin
                  if taicpu(hp1).oper[0]^.typ = top_const then
                    begin
                      { Merge add const1,%reg; add/sub const2,%reg to add const1+/-const2,%reg }
                      if taicpu(hp1).opcode = A_ADD then
                        ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
                      else
                        ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;

                      Result := True;

                      { Handle any overflows }
                      case taicpu(p).opsize of
                        S_B:
                          taicpu(p).oper[0]^.val := ThisConst and $FF;
                        S_W:
                          taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                        S_L:
                          taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                        S_Q:
                          if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                            { Overflow; abort }
                            Result := False
                          else
                            taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                        else
                          InternalError(2021102610);
                      end;

                      { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                      if Result then
                        begin
                          if (taicpu(p).oper[0]^.val < 0) and
                            (
                              ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                              ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                              ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                            ) then
                            begin
                              DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> SUB',p);
                              taicpu(p).opcode := A_SUB;
                              taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                            end
                          else
                            DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> ADD',p);
                          RemoveInstruction(hp1);
                        end;
                    end
                  else
                    begin
                      { Move the constant addition to after the reg/ref addition to improve optimisation }
                      DebugMsg(SPeepholeOptimization + 'Add/sub swap 1a done',p);
                      Asml.Remove(p);
                      Asml.InsertAfter(p, hp1);
                      p := hp1;
                      Result := True;
                      Exit;
                    end;
                end;
            end;

          if DoArithCombineOpt(p) then
            Result:=true;
        end;
    end;
  function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
    var
      hp1, hp2: tai;
      ref: Integer;
      saveref: treference;
      offsetcalc: Int64;
      TempReg: TRegister;
      Multiple: TCGInt;
      Adjacent, IntermediateRegDiscarded: Boolean;
    begin
      Result:=false;

      { play safe and throw an error if LEA uses a seg register prefix,
        as this is most likely an error somewhere else }
      if taicpu(p).oper[0]^.ref^.Segment<>NR_NO then
        internalerror(2022022001);

      { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
      if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
        (taicpu(p).oper[0]^.ref^.index = NR_NO) and
        (
          { do not mess with leas accessing the stack pointer
            unless it's a null operation }
          (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) or
          (
            (taicpu(p).oper[0]^.ref^.base = NR_STACK_POINTER_REG) and
            (taicpu(p).oper[0]^.ref^.offset = 0)
          )
        ) and
        (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
        begin
          if (taicpu(p).oper[0]^.ref^.offset = 0) then
            begin
              if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
                begin
                  taicpu(p).opcode := A_MOV;
                  taicpu(p).loadreg(0, taicpu(p).oper[0]^.ref^.base);
                  DebugMsg(SPeepholeOptimization + 'Lea2Mov done',p);
                end
              else
                begin
                  DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
                  RemoveCurrentP(p);
                end;
              Result:=true;
              exit;
            end
          else if (
              { continue to use lea to adjust the stack pointer, as it is
                the recommended way, but only if not optimising for size }
              (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
              (cs_opt_size in current_settings.optimizerswitches)
            ) and
            { If the flags register is in use, don't change the instruction
              to an ADD otherwise this will scramble the flags. [Kit] }
            not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
            ConvertLEA(taicpu(p)) then
            begin
              Result:=true;
              exit;
            end;
        end;

      { Don't optimise if the stack or frame pointer is the destination register }
      if (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) or (taicpu(p).oper[1]^.reg=current_procinfo.framepointer) then
        Exit;

      if GetNextInstruction(p,hp1) and
        (hp1.typ=ait_instruction) then
        begin
          if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
            MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
            MatchOpType(Taicpu(hp1),top_reg,top_reg) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
              if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                begin
                  taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                  DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
                  RemoveInstruction(hp1);
                  result:=true;
                  exit;
                end;
            end;

          { changes
              lea <ref1>, reg1
              <op> ...,<ref. with reg1>,...
            to
              <op> ...,<ref1>,... }

          { find a reference which uses reg1 }
          if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
            ref:=0
          else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
            ref:=1
          else
            ref:=-1;

          if (ref<>-1) and
            { reg1 must be either the base or the index }
            ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
            begin
              { reg1 can be removed from the reference }
              saveref:=taicpu(hp1).oper[ref]^.ref^;
              if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
                taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
              else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
                taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
              else
                Internalerror(2019111201);

              { check if we can insert all the data of the lea into the second instruction }
              if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
                ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
                ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
                ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
                ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
                ((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
                (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
                and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
                and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                     ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                    )
{$endif x86_64}
                then
                begin
                  { reg1 might not be used by the second instruction after it is removed from the reference }
                  if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                    begin
                      TransferUsedRegs(TmpUsedRegs);
                      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                      { reg1 is not updated so it might not be used afterwards }
                      if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                          if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                            taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                          if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                            taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                          if taicpu(p).oper[0]^.ref^.symbol<>nil then
                            taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                          if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                            taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                          if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
                            taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                          inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                          RemoveCurrentP(p, hp1);
                          result:=true;
                          exit;
                        end
                    end;
                end;
              { recover }
              taicpu(hp1).oper[ref]^.ref^:=saveref;
            end;

          Adjacent := RegInInstruction(taicpu(p).oper[1]^.reg, hp1);
          if Adjacent or
            { Check further ahead (up to 2 instructions ahead for -O2) }
            GetNextInstructionUsingReg(hp1,hp1,taicpu(p).oper[1]^.reg) then
            begin
              { Check common LEA/LEA conditions }
              if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
                (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
                (taicpu(p).oper[0]^.ref^.segment = NR_NO) and
                (taicpu(p).oper[0]^.ref^.symbol = nil) and
                (taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
                (taicpu(hp1).oper[0]^.ref^.segment = NR_NO) and
                (taicpu(hp1).oper[0]^.ref^.symbol = nil) and
                (
                  { If p and hp1 are adjacent, RegModifiedBetween always returns False, so avoid
                    calling it (since it calls GetNextInstruction) }
                  Adjacent or
                  (
                    (
                      (taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
                      not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
                    ) and (
                      (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
                      (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                      not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
                    )
                  )
                ) then
                begin
                  TransferUsedRegs(TmpUsedRegs);
                  hp2 := p;
                  repeat
                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
                  IntermediateRegDiscarded :=
                    (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) or
                    not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);

                  { changes
                      lea offset1(regX,scale), reg1
                      lea offset2(reg1,reg1), reg2
                    to
                      lea (offset1*2)+offset2(regX,scale*2), reg2

                    and

                      lea offset1(regX,scale1), reg1
                      lea offset2(reg1,scale2), reg2
                    to
                      lea (offset1*scale2)+offset2(regX,scale1*scale2), reg2

                    and

                      lea offset1(regX,scale1), reg1
                      lea offset2(reg3,reg1,scale2), reg2
                    to
                      lea (offset1*scale2)+offset2(reg3,regX,scale1*scale2), reg2

                    ... so long as the final scale does not exceed 8

                    (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
                  }
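                  { e.g. (hypothetical values for illustration):
                      "leal 4(,%ecx,2),%eax; leal 12(%eax,%eax),%edx"
                    becomes
                      "leal 20(,%ecx,4),%edx"   ((4*2)+12, scale 2*2) }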
                  if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
                    (
                      { Don't optimise if size is a concern and the intermediate register remains in use }
                      IntermediateRegDiscarded or
                      (
                        not (cs_opt_size in current_settings.optimizerswitches) and
                        { If the intermediate register is not discarded, it must not
                          appear in the first LEA's reference. (Fixes #41166) }
                        not RegInRef(taicpu(p).oper[1]^.reg, taicpu(p).oper[0]^.ref^)
                      )
                    ) and
                    (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                    (
                      (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[0]^.ref^.index) or
                      (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
                    ) and (
                      (
                        { lea (reg1,scale2), reg2 variant }
                        (taicpu(hp1).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
                        (
                          Adjacent or
                          not RegModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, p, hp1)
                        ) and
                        (
                          (
                            (taicpu(p).oper[0]^.ref^.base = NR_NO) and
                            (taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
                          ) or (
                            { lea (regX,regX), reg1 variant }
                            (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
                            (taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
                          )
                        )
                      ) or (
                        { lea (reg1,reg1), reg1 variant }
                        (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                        (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
                      )
                    ) then
                    begin
                      { Make everything homogeneous to make calculations easier }
                      if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
                        begin
                          if taicpu(p).oper[0]^.ref^.index <> NR_NO then
                            { Convert lea (regX,regX),reg1 to lea (,regX,2),reg1 }
                            taicpu(p).oper[0]^.ref^.scalefactor := 2
                          else
                            taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
                          taicpu(p).oper[0]^.ref^.base := NR_NO;
                        end;

                      { Make sure the offset doesn't go out of range (use 64-bit arithmetic) }
                      offsetcalc := taicpu(hp1).oper[0]^.ref^.offset;
                      Inc(offsetcalc, Int64(taicpu(p).oper[0]^.ref^.offset) * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                      if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                        begin
                          if (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                            (taicpu(hp1).oper[0]^.ref^.index <> taicpu(p).oper[1]^.reg) then
                            begin
                              { Put the register to change in the index register }
                              TempReg := taicpu(hp1).oper[0]^.ref^.index;
                              taicpu(hp1).oper[0]^.ref^.index := taicpu(hp1).oper[0]^.ref^.base;
                              taicpu(hp1).oper[0]^.ref^.base := TempReg;
                            end;

                          { Change lea (reg,reg) to lea (,reg,2) }
                          if (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) then
                            begin
                              taicpu(hp1).oper[0]^.ref^.base := NR_NO;
                              taicpu(hp1).oper[0]^.ref^.scalefactor := 2;
                            end;

                          if (taicpu(p).oper[0]^.ref^.offset <> 0) then
                            Inc(taicpu(hp1).oper[0]^.ref^.offset, taicpu(p).oper[0]^.ref^.offset * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                          taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;

                          { Just to prevent miscalculations }
                          if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
                            taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
                          else
                            taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * max(taicpu(p).oper[0]^.ref^.scalefactor, 1);

                          { Only remove the first LEA if we don't need the intermediate register's value as is }
                          if IntermediateRegDiscarded then
                            begin
                              DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
                              RemoveCurrentP(p);
                            end
                          else
                            DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 2 done (intermediate register still in use)',p);

                          result:=true;
                          exit;
                        end;
                    end;

                  { changes
                      lea offset1(regX), reg1
                      lea offset2(reg1), reg2
                    to
                      lea offset1+offset2(regX), reg2 }
                  if (
                       { Don't optimise if size is a concern and the intermediate register remains in use }
                       IntermediateRegDiscarded or
                       (
                         not (cs_opt_size in current_settings.optimizerswitches) and
                         { If the intermediate register is not discarded, it must not
                           appear in the first LEA's reference. (Fixes #41166) }
                         not RegInRef(taicpu(p).oper[1]^.reg, taicpu(p).oper[0]^.ref^)
                       )
                     ) and
                    (
                      (
                        (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                        (getsupreg(taicpu(p).oper[0]^.ref^.base)<>RS_ESP) and
                        (taicpu(p).oper[0]^.ref^.index = NR_NO)
                      ) or (
                        (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                        (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                        (
                          (
                            (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                            (taicpu(p).oper[0]^.ref^.base = NR_NO)
                          ) or (
                            (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                            (
                              (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                              (
                                (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                                (
                                  (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
                                  (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
                                )
                              )
                            )
                          )
                        )
                      )
                    ) then
                    begin
                      { Make sure the offset doesn't go out of range (use 64-bit arithmetic) }
                      offsetcalc := taicpu(hp1).oper[0]^.ref^.offset;
                      Inc(offsetcalc, Int64(taicpu(p).oper[0]^.ref^.offset) * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
                      if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                        begin
                          if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
                            begin
                              taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                              inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                              { if the register is used as index and base, we have to increase the offset
                                for the base as well and adapt the base }
                              if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                                begin
                                  taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                                  inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                                end;
                            end
                          else
                            begin
                              inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                              taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                            end;
                          if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                            begin
                              taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                              taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                              if (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) then
                                { Catch the situation where the base = index
                                  and treat this as *2. The scalefactor of
                                  p will be 0 or 1 due to the conditional
                                  checks above. Fixes i40647 }
                                taicpu(hp1).oper[0]^.ref^.scalefactor := 2
                              else
                                taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor;
                            end;

                          { Only remove the first LEA if we don't need the intermediate register's value as is }
                          if IntermediateRegDiscarded then
                            begin
                              DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
                              RemoveCurrentP(p);
                            end
                          else
                            DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 1 done (intermediate register still in use)',p);

                          result:=true;
                          exit;
                        end;
                    end;
                end;

              { Change:
                  leal/q $x(%reg1),%reg2
                  ...
                  shll/q $y,%reg2
                To:
                  leal/q $(x*2^y)(,%reg1,2^y),%reg2 (if y <= 3)
              }
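              { e.g. (hypothetical values for illustration):
                  "leal 4(%ebx),%eax; shll $2,%eax"
                becomes
                  "leal 16(,%ebx,4),%eax" }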
              if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
                MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
                MatchOpType(taicpu(hp1), top_const, top_reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.val <= 3) then
                begin
                  Multiple := 1 shl taicpu(hp1).oper[0]^.val;
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                  if
                    { This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
                      (this works even if scalefactor is zero) }
                    ((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
                    { Ensure offset doesn't go out of bounds }
                    (abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
                    not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
                    (
                      (
                        not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[1]^.reg) and
                        (
                          (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                          (taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
                          (
                            { Check for lea $x(%reg1,%reg1),%reg2 and treat it as if it were lea $x(,%reg1,2),%reg2 }
                            (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                            (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
                          )
                        )
                      ) or (
                        (
                          (taicpu(p).oper[0]^.ref^.base = NR_NO) or
                          (taicpu(p).oper[0]^.ref^.base = NR_INVALID)
                        ) and
                        not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, taicpu(p).oper[1]^.reg)
                      )
                    ) then
                    begin
                      repeat
                        with taicpu(p).oper[0]^.ref^ do
                          begin
                            { Convert lea $x(%reg1,%reg1),%reg2 to lea $x(,%reg1,2),%reg2 }
                            if index = base then
                              begin
                                if Multiple > 4 then
                                  { Optimisation will no longer work because the
                                    resultant scale factor will exceed 8 }
                                  Break;
                                base := NR_NO;
                                scalefactor := 2;
                                DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
                              end
                            else if (base <> NR_NO) and (base <> NR_INVALID) then
                              begin
                                { The scale factor only works on the index register }
                                index := base;
                                base := NR_NO;
                              end;

                            { For safety }
                            if scalefactor <= 1 then
                              begin
                                DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
                                scalefactor := Multiple;
                              end
                            else
                              begin
                                DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
                                scalefactor := scalefactor * Multiple;
                              end;

                            offset := offset * Multiple;
                          end;
                        RemoveInstruction(hp1);
                        Result := True;
                        Exit;
                      { This repeat..until loop exists for the benefit of Break }
                      until True;
                    end;
                end;
            end;
        end;
    end;
  function TX86AsmOptimizer.DoArithCombineOpt(var p: tai): Boolean;
    var
      hp1 : tai;
      SubInstr: Boolean;
      ThisConst: TCGInt;
    const
      OverflowMin: array[S_B..S_Q] of TCGInt = (-128, -32768, -2147483648, -2147483648);
      { Note: 64-bit-sized arithmetic instructions can only take signed 32-bit immediates }
      OverflowMax: array[S_B..S_Q] of TCGInt = ( 255,  65535, $FFFFFFFF,   2147483647);
    begin
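      { Folds p's constant into an immediately preceding INC/DEC/ADD/SUB on
        the same operand, e.g. (hypothetical values for illustration):
          "subl $3,%eax" followed by "addl $5,%eax" collapses to "addl $2,%eax" }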
      Result := False;

      if taicpu(p).oper[0]^.typ <> top_const then
        { Should have been confirmed before calling }
        InternalError(2021102601);

      SubInstr := (taicpu(p).opcode = A_SUB);

      if not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
        GetLastInstruction(p, hp1) and
        (hp1.typ = ait_instruction) and
        (taicpu(hp1).opsize = taicpu(p).opsize) then
        begin
          if not (taicpu(p).opsize in [S_B, S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
            { Bad size }
            InternalError(2022042001);

          case taicpu(hp1).opcode of
            A_INC:
              if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  if SubInstr then
                    ThisConst := taicpu(p).oper[0]^.val - 1
                  else
                    ThisConst := taicpu(p).oper[0]^.val + 1;
                end
              else
                Exit;
            A_DEC:
              if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  if SubInstr then
                    ThisConst := taicpu(p).oper[0]^.val + 1
                  else
                    ThisConst := taicpu(p).oper[0]^.val - 1;
                end
              else
                Exit;
            A_SUB:
              if (taicpu(hp1).oper[0]^.typ = top_const) and
                MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                begin
                  if SubInstr then
                    ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
                  else
                    ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;
                end
              else
                Exit;
            A_ADD:
              if (taicpu(hp1).oper[0]^.typ = top_const) and
                MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                begin
                  if SubInstr then
                    ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val
                  else
                    ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
                end
              else
                Exit;
            else
              Exit;
          end;

          { Check that the values are in range }
          if (ThisConst < OverflowMin[taicpu(p).opsize]) or (ThisConst > OverflowMax[taicpu(p).opsize]) then
            { Overflow; abort }
            Exit;

          if (ThisConst = 0) then
            begin
              DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
                debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_operstr(taicpu(hp1).oper[1]^) + '; ' +
                debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' cancel out (NOP)', p);

              RemoveInstruction(hp1);
              hp1 := tai(p.next);
              RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
              if not GetLastInstruction(hp1, p) then
                p := hp1;
            end
          else
            begin
              if taicpu(hp1).opercnt=1 then
                DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
                  debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + '; ' +
                  debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' -> ' +
                  debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(ThisConst) + ' ' + debug_operstr(taicpu(p).oper[1]^), p)
              else
                DebugMsg(SPeepholeOptimization + 'Arithmetic combine: ' +
                  debug_op2str(taicpu(hp1).opcode) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_operstr(taicpu(hp1).oper[1]^) + '; ' +
                  debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(taicpu(p).oper[0]^.val) + ',' + debug_operstr(taicpu(p).oper[1]^) + ' -> ' +
                  debug_op2str(taicpu(p).opcode) + ' $' + debug_tostr(ThisConst) + ' ' + debug_operstr(taicpu(p).oper[1]^), p);

              RemoveInstruction(hp1);
              taicpu(p).loadconst(0, ThisConst);
            end;

          Result := True;
        end;
    end;
  function TX86AsmOptimizer.DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
    begin
      Result := False;
      if MatchOpType(taicpu(p),top_ref,top_reg) and
        { The x86 assemblers have difficulty comparing values against absolute addresses }
        (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
        (taicpu(hp1).oper[0]^.typ <> top_ref) and
        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
        (
          (
            (taicpu(hp1).opcode = A_TEST)
          ) or (
            (taicpu(hp1).opcode = A_CMP) and
            { A sanity check more than anything }
            not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
          )
        ) then
        begin
          { change
              mov      mem, %reg
              ...
              cmp/test x, %reg / test %reg,%reg
              (reg deallocated)
            to
              cmp/test x, mem / cmp $0, mem
          }
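          { e.g. (illustration): "movl 8(%ebx),%eax; testl %eax,%eax" with
            %eax then unused becomes "cmpl $0,8(%ebx)" }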
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
            begin
              { Convert test %reg,%reg or test $-1,%reg to cmp $0,mem }
              if (taicpu(hp1).opcode = A_TEST) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
                  MatchOperand(taicpu(hp1).oper[0]^, -1)
                ) then
                begin
                  taicpu(hp1).opcode := A_CMP;
                  taicpu(hp1).loadconst(0, 0);
                end;
              taicpu(hp1).loadref(1, taicpu(p).oper[0]^.ref^);
              DebugMsg(SPeepholeOptimization + 'MOV/CMP -> CMP (memory check)', p);
              RemoveCurrentP(p);

              if (p <> hp1) then
                { Correctly update TmpUsedRegs if p and hp1 aren't adjacent }
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);

              { Make sure the flags are allocated across the CMP instruction }
              if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp1, TmpUsedRegs);

              Result := True;
              Exit;
            end;
        end;
    end;
  function TX86AsmOptimizer.DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
    var
      hp_allocstart, hp_pos, hp2, hp3, hp4, hp5, hp6: tai;
      ThisReg, SecondReg: TRegister;
      JumpLoc: TAsmLabel;
      NewSize: TOpSize;
    begin
      Result := False;
      {
        Convert:
          j<c> .L1
        .L2:
          mov  1,reg
          jmp  .L3   (or ret, although it might not be a RET yet)
        .L1:
          mov  0,reg
          jmp  .L3   (or ret)

        (As long as .L3 <> .L1 or .L2)

        To:
          mov  0,reg
          set<not(c)> reg
          jmp  .L3   (or ret)
        .L2:
          mov  1,reg
          jmp  .L3   (or ret)
        .L1:
          mov  0,reg
          jmp  .L3   (or ret)
      }
      if JumpTargetOp(taicpu(p))^.ref^.refaddr<>addr_full then
        Exit;

      JumpLoc := TAsmLabel(JumpTargetOp(taicpu(p))^.ref^.symbol);

      if GetNextInstruction(hp_label, hp2) and
        MatchInstruction(hp2,A_MOV,[]) and
        (taicpu(hp2).oper[0]^.typ = top_const) and
        (
          (
            (taicpu(hp2).oper[1]^.typ = top_reg)
{$ifdef i386}
            { Under i386, ESI, EDI, EBP and ESP
              don't have an 8-bit representation }
            and not (getsupreg(taicpu(hp2).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
{$endif i386}
          ) or (
{$ifdef i386}
            (taicpu(hp2).oper[1]^.typ <> top_reg) and
{$endif i386}
            (taicpu(hp2).opsize = S_B)
          )
        ) and
        GetNextInstruction(hp2, hp3) and
        MatchInstruction(hp3, A_JMP, A_RET, []) and
        (
          (taicpu(hp3).opcode=A_RET) or
          (
            (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and
            (tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol)<>tai_label(hp_label).labsym)
          )
        ) and
        GetNextInstruction(hp3, hp4) and
        FindLabel(JumpLoc, hp4) and
        (
          not (cs_opt_size in current_settings.optimizerswitches) or
          { If the initial jump is the label's only reference, then it will
            become a dead label if the other conditions are met and hence
            remove at least 2 instructions, including a jump }
          (JumpLoc.getrefs = 1)
        ) and
        { Don't check if hp3 jumps to hp4 because this is a zero-distance jump
          that will be optimised out }
        GetNextInstruction(hp4, hp5) and
        MatchInstruction(hp5,A_MOV,[taicpu(hp2).opsize]) and
        (taicpu(hp5).oper[0]^.typ = top_const) and
        (
          ((taicpu(hp2).oper[0]^.val = 0) and (taicpu(hp5).oper[0]^.val = 1)) or
          ((taicpu(hp2).oper[0]^.val = 1) and (taicpu(hp5).oper[0]^.val = 0))
        ) and
        MatchOperand(taicpu(hp2).oper[1]^,taicpu(hp5).oper[1]^) and
        GetNextInstruction(hp5,hp6) and
        (
          not (hp6.typ in [ait_align, ait_label]) or
          SkipLabels(hp6, hp6)
        ) and
        (hp6.typ=ait_instruction) then
        begin
          { First, let's look at the two jumps that are hp3 and hp6 }
          if not
            (
              (taicpu(hp6).opcode=taicpu(hp3).opcode) and { Both RET or both JMP to the same label }
              (
                (taicpu(hp6).opcode=A_RET) or
                MatchOperand(taicpu(hp6).oper[0]^, taicpu(hp3).oper[0]^)
              )
            ) then
            { If this condition is False, then the JMP/RET instructions matched conventionally }
            begin
              { See if one of the jumps can be instantly converted into a RET }
              if (taicpu(hp3).opcode=A_JMP) then
                begin
                  { Reuse hp5 }
                  hp5 := getlabelwithsym(TAsmLabel(JumpTargetOp(taicpu(hp3))^.ref^.symbol));

                  { Make sure hp5 doesn't jump back to .L1 (zero distance jump) or .L2 (infinite loop) }
                  if not Assigned(hp5) or (hp5 = hp_label) or (hp5 = hp4) or not GetNextInstruction(hp5, hp5) then
                    Exit;

                  if MatchInstruction(hp5, A_RET, []) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'Converted JMP to RET as part of SETcc optimisation (1st jump)', hp3);
                      ConvertJumpToRET(hp3, hp5);
                      Result := True;
                    end
                  else
                    Exit;
                end;

              if (taicpu(hp6).opcode=A_JMP) then
                begin
                  { Reuse hp5 }
                  hp5 := getlabelwithsym(TAsmLabel(JumpTargetOp(taicpu(hp6))^.ref^.symbol));

                  if not Assigned(hp5) or not GetNextInstruction(hp5, hp5) then
                    Exit;

                  if MatchInstruction(hp5, A_RET, []) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'Converted JMP to RET as part of SETcc optimisation (2nd jump)', hp6);
                      ConvertJumpToRET(hp6, hp5);
                      Result := True;
                    end
                  else
                    Exit;
                end;

              if not
                (
                  (taicpu(hp6).opcode=taicpu(hp3).opcode) and { Both RET or both JMP to the same label }
                  (
                    (taicpu(hp6).opcode=A_RET) or
                    MatchOperand(taicpu(hp6).oper[0]^, taicpu(hp3).oper[0]^)
                  )
                ) then
                { Still doesn't match }
                Exit;
            end;

          if (taicpu(hp2).oper[0]^.val = 1) then
            begin
              taicpu(p).condition := inverse_cond(taicpu(p).condition);
              DebugMsg(SPeepholeOptimization + 'J(c)Mov1Jmp/RetMov0Jmp/Ret -> Set(~c)Jmp/Ret',p)
            end
          else
            DebugMsg(SPeepholeOptimization + 'J(c)Mov0Jmp/RetMov1Jmp/Ret -> Set(c)Jmp/Ret',p);

          if taicpu(hp2).opsize=S_B then
            begin
              if taicpu(hp2).oper[1]^.typ = top_reg then
                begin
                  SecondReg := taicpu(hp2).oper[1]^.reg;
                  hp4:=taicpu.op_reg(A_SETcc, S_B, SecondReg);
                end
              else
                begin
                  hp4:=taicpu.op_ref(A_SETcc, S_B, taicpu(hp2).oper[1]^.ref^);
                  SecondReg := NR_NO;
                end;

              hp_pos := p;
              hp_allocstart := hp4;
            end
          else
            begin
              { Will be a register because the size can't be S_B otherwise }
              SecondReg:=taicpu(hp2).oper[1]^.reg;
              ThisReg:=newreg(R_INTREGISTER,getsupreg(SecondReg), R_SUBL);
              hp4:=taicpu.op_reg(A_SETcc, S_B, ThisReg);

              if (cs_opt_size in current_settings.optimizerswitches) then
                begin
                  { Favour using MOVZX when optimising for size }
                  case taicpu(hp2).opsize of
                    S_W:
                      NewSize := S_BW;
                    S_L:
                      NewSize := S_BL;
{$ifdef x86_64}
                    S_Q:
                      begin
                        NewSize := S_BL;
                        { Will implicitly zero-extend to 64-bit }
                        setsubreg(SecondReg, R_SUBD);
                      end;
{$endif x86_64}
                    else
                      InternalError(2022101301);
                  end;

                  hp5:=taicpu.op_reg_reg(A_MOVZX, NewSize, ThisReg, SecondReg);
                  { Inserting it right before p will guarantee that the flags are also tracked }
                  Asml.InsertBefore(hp5, p);

                  { Make sure the SET instruction gets inserted before the MOVZX instruction }
                  hp_pos := hp5;
                  hp_allocstart := hp4;
                end
              else
                begin
                  hp5:=taicpu.op_const_reg(A_MOV, taicpu(hp2).opsize, 0, SecondReg);
                  { Inserting it right before p will guarantee that the flags are also tracked }
                  Asml.InsertBefore(hp5, p);

                  hp_pos := p;
                  hp_allocstart := hp5;
                end;
              taicpu(hp5).fileinfo:=taicpu(p).fileinfo;
            end;

          taicpu(hp4).fileinfo := taicpu(p).fileinfo;
          taicpu(hp4).condition := taicpu(p).condition;
          asml.InsertBefore(hp4, hp_pos);

          if taicpu(hp3).is_jmp then
            begin
              JumpLoc.decrefs;
              MakeUnconditional(taicpu(p));
              { This also increases the reference count }
              taicpu(p).loadref(0, JumpTargetOp(taicpu(hp3))^.ref^);
            end
          else
            ConvertJumpToRET(p, hp3);

          if SecondReg <> NR_NO then
            { Ensure the destination register is allocated over this region }
            AllocRegBetween(SecondReg, hp_allocstart, p, UsedRegs);

          if (JumpLoc.getrefs = 0) then
            RemoveDeadCodeAfterJump(hp3);

          Result:=true;
          exit;
        end;
    end;
  6793. function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  6794. var
  6795. hp1, hp2: tai;
  6796. ActiveReg: TRegister;
  6797. OldOffset: asizeint;
  6798. ThisConst: TCGInt;
  6799. function RegDeallocated: Boolean;
  6800. begin
  6801. TransferUsedRegs(TmpUsedRegs);
  6802. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  6803. Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
  6804. end;
  6805. begin
  6806. Result:=false;
  6807. hp1 := nil;
  6808. { replace
  6809. subX const,%reg1
  6810. leaX (%reg1,%reg1,Y),%reg2 // Base or index might not be equal to reg1
  6811. dealloc %reg1
  6812. by
  6813. leaX -const-const*Y(%reg1,%reg1,Y),%reg2
  6814. }
  6815. if MatchOpType(taicpu(p),top_const,top_reg) then
  6816. begin
  6817. ActiveReg := taicpu(p).oper[1]^.reg;
  6818. { Ensures the entire register was updated }
  6819. if (taicpu(p).opsize >= S_L) and
  6820. GetNextInstructionUsingReg(p,hp1, ActiveReg) and
  6821. MatchInstruction(hp1,A_LEA,[]) and
  6822. (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
  6823. SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
  6824. (
  6825. { Cover the case where the register in the reference is also the destination register }
  6826. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
  6827. (
  6828. { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
  6829. not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
  6830. RegDeallocated
  6831. )
  6832. ) then
  6833. begin
  6834. OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
  6835. if SuperRegistersEqual(ActiveReg,taicpu(hp1).oper[0]^.ref^.base) then
  6836. Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
  6837. if SuperRegistersEqual(ActiveReg,taicpu(hp1).oper[0]^.ref^.index) then
  6838. Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
  6839. {$ifdef x86_64}
  6840. if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
  6841. begin
  6842. { Overflow; abort }
  6843. taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
  6844. end
  6845. else
  6846. {$endif x86_64}
  6847. begin
  6848. DebugMsg(SPeepholeOptimization + 'SubLea2Lea done',p);
  6849. if not (cs_opt_level3 in current_settings.optimizerswitches) then
  6850. { hp1 is the immediate next instruction for sure - good for a quick speed boost }
  6851. RemoveCurrentP(p, hp1)
  6852. else
  6853. RemoveCurrentP(p);
  6854. result:=true;
  6855. Exit;
  6856. end;
  6857. end;
  6858. if (
  6859. { Save calling GetNextInstructionUsingReg again }
  6860. Assigned(hp1) or
  6861. GetNextInstructionUsingReg(p,hp1, ActiveReg)
  6862. ) and
  6863. MatchInstruction(hp1,A_SUB,[taicpu(p).opsize]) and
  6864. (taicpu(hp1).oper[1]^.reg = ActiveReg) then
  6865. begin
  6866. { Make sure the flags aren't in use by the second operation }
  6867. TransferUsedRegs(TmpUsedRegs);
  6868. UpdateUsedRegsBetween(TmpUsedRegs, tai(p.next), hp1);
  6869. if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
  6870. begin
  6871. if (taicpu(hp1).oper[0]^.typ = top_const) then
  6872. begin
  6873. { Merge add const1,%reg; add const2,%reg to add const1+const2,%reg }
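                  { Illustrative example:
                        subl $4,%eax
                        subl $8,%eax
                    becomes
                        subl $12,%eax }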
                  ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
                  Result := True;
                  { Handle any overflows }
                  case taicpu(p).opsize of
                    S_B:
                      taicpu(p).oper[0]^.val := ThisConst and $FF;
                    S_W:
                      taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                    S_L:
                      taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                    S_Q:
                      if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                        { Overflow; abort }
                        Result := False
                      else
                        taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                    else
                      InternalError(2021102611);
                  end;
                  { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                  if Result then
                    begin
                      if (taicpu(p).oper[0]^.val < 0) and
                        (
                          ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                          ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                          ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                        ) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> ADD',p);
                          taicpu(p).opcode := A_ADD;
                          taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                        end
                      else
                        DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> SUB',p);
                      RemoveInstruction(hp1);
                    end;
                end
              else
                begin
                  { Move the constant subtraction to after the reg/ref addition to improve optimisation }
                  DebugMsg(SPeepholeOptimization + 'Add/sub swap 1b done',p);
                  Asml.Remove(p);
                  Asml.InsertAfter(p, hp1);
                  p := hp1;
                  Result := True;
                  Exit;
                end;
            end;
        end;

      { * change "subl $2, %esp; pushw x" to "pushl x" }
      { * change "sub/add const1, reg" or "dec reg" followed by
          "sub const2, reg" to one "sub ..., reg" }
{$ifdef i386}
      if (taicpu(p).oper[0]^.val = 2) and
        (ActiveReg = NR_ESP) and
        { Don't do the sub/push optimization if the sub }
        { comes from setting up the stack frame (JM)    }
        (not(GetLastInstruction(p,hp1)) or
         not(MatchInstruction(hp1,A_MOV,[S_L]) and
             MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
             MatchOperand(taicpu(hp1).oper[1]^,NR_EBP))) then
        begin
          hp1 := tai(p.next);
          while Assigned(hp1) and
            (tai(hp1).typ in [ait_instruction]+SkipInstr) and
            not RegReadByInstruction(NR_ESP,hp1) and
            not RegModifiedByInstruction(NR_ESP,hp1) do
            hp1 := tai(hp1.next);
          if Assigned(hp1) and
            MatchInstruction(hp1,A_PUSH,[S_W]) then
            begin
              taicpu(hp1).changeopsize(S_L);
              if taicpu(hp1).oper[0]^.typ=top_reg then
                setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
              hp1 := tai(p.next);
              RemoveCurrentp(p, hp1);
              Result:=true;
              exit;
            end;
        end;
{$endif i386}

      if DoArithCombineOpt(p) then
        Result:=true;
    end;
end;


function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
var
  TmpBool1,TmpBool2 : Boolean;
  tmpref : treference;
  hp1,hp2: tai;
  mask, shiftval: tcgint;
begin
  Result:=false;
  { All these optimisations work on "shl/sal const,%reg" }
  if not MatchOpType(taicpu(p),top_const,top_reg) then
    Exit;

  if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
    (taicpu(p).oper[0]^.val <= 3) then
    { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
    begin
      { should we check the next instruction? }
      TmpBool1 := True;
      { have we found an add/sub which could be
        integrated in the lea? }
      TmpBool2 := False;
      reference_reset(tmpref,2,[]);
      TmpRef.index := taicpu(p).oper[1]^.reg;
      TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
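      { For instance (illustrative):
            shll $2,%eax
            addl $12,%eax
        becomes
            leal 12(,%eax,4),%eax }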
      while TmpBool1 and
        GetNextInstruction(p, hp1) and
        (tai(hp1).typ = ait_instruction) and
        ((((taicpu(hp1).opcode = A_ADD) or
           (taicpu(hp1).opcode = A_SUB)) and
          (taicpu(hp1).oper[1]^.typ = Top_Reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
         (((taicpu(hp1).opcode = A_INC) or
           (taicpu(hp1).opcode = A_DEC)) and
          (taicpu(hp1).oper[0]^.typ = Top_Reg) and
          (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
         ((taicpu(hp1).opcode = A_LEA) and
          (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
        (not GetNextInstruction(hp1,hp2) or
         not instrReadsFlags(hp2)) Do
        begin
          TmpBool1 := False;
          if taicpu(hp1).opcode=A_LEA then
            begin
              if (TmpRef.base = NR_NO) and
                (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                { Segment register isn't a concern here }
                ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                 (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                begin
                  TmpBool1 := True;
                  TmpBool2 := True;
                  inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                  if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                    tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                  TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                  RemoveInstruction(hp1);
                end
            end
          else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
            begin
              TmpBool1 := True;
              TmpBool2 := True;
              case taicpu(hp1).opcode of
                A_ADD:
                  inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                A_SUB:
                  dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                else
                  internalerror(2019050536);
              end;
              RemoveInstruction(hp1);
            end
          else
            if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
              (((taicpu(hp1).opcode = A_ADD) and
                (TmpRef.base = NR_NO)) or
               (taicpu(hp1).opcode = A_INC) or
               (taicpu(hp1).opcode = A_DEC)) then
              begin
                TmpBool1 := True;
                TmpBool2 := True;
                case taicpu(hp1).opcode of
                  A_ADD:
                    TmpRef.base := taicpu(hp1).oper[0]^.reg;
                  A_INC:
                    inc(TmpRef.offset);
                  A_DEC:
                    dec(TmpRef.offset);
                  else
                    internalerror(2019050535);
                end;
                RemoveInstruction(hp1);
              end;
        end;
      if TmpBool2
{$ifndef x86_64}
        or
        ((current_settings.optimizecputype < cpu_Pentium2) and
         (taicpu(p).oper[0]^.val <= 3) and
         not(cs_opt_size in current_settings.optimizerswitches))
{$endif x86_64}
        then
        begin
          if not(TmpBool2) and
            (taicpu(p).oper[0]^.val=1) then
            begin
              taicpu(p).opcode := A_ADD;
              taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
            end
          else
            begin
              taicpu(p).opcode := A_LEA;
              taicpu(p).loadref(0, TmpRef);
            end;
          DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
          Result := True;
        end;
    end
{$ifndef x86_64}
  else if (current_settings.optimizecputype < cpu_Pentium2) then
    begin
      { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
        but faster on a 486, and pairable in both U and V pipes on the Pentium
        (unlike shl, which is only pairable in the U pipe) }
      if taicpu(p).oper[0]^.val=1 then
        begin
          taicpu(p).opcode := A_ADD;
          taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
          Result := True;
        end
      { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
                "shl $3, %reg" to "lea (,%reg,8), %reg" }
      else if (taicpu(p).opsize = S_L) and
        (taicpu(p).oper[0]^.val<= 3) then
        begin
          reference_reset(tmpref,2,[]);
          TmpRef.index := taicpu(p).oper[1]^.reg;
          TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
          taicpu(p).opcode := A_LEA;
          taicpu(p).loadref(0, TmpRef);
          Result := True;
        end;
    end
{$endif x86_64}
  else if
    GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
    (
      (
        MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
        SetAndTest(hp1, hp2)
{$ifdef x86_64}
      ) or
      (
        MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
        MatchOpType(taicpu(hp2), top_reg, top_reg) and
        (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
{$endif x86_64}
      )
    ) and
    (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
    begin
      { Change:
          shl x, %reg1
          mov -(1<<x), %reg2
          and %reg2, %reg1
        Or:
          shl x, %reg1
          and -(1<<x), %reg1
        To just:
          shl x, %reg1
        Since the and operation only zeroes bits that are already zero from the shl operation
      }
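      { Concrete instance (illustrative values):
            shll $8,%eax
            andl $0xffffff00,%eax
        The AND clears only bits 0..7, which the shift has already zeroed,
        so it can be removed. }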
      case taicpu(p).oper[0]^.val of
        8:
          mask:=$FFFFFFFFFFFFFF00;
        16:
          mask:=$FFFFFFFFFFFF0000;
        32:
          mask:=$FFFFFFFF00000000;
        63:
          { Constant pre-calculated to prevent overflow errors with Int64 }
          mask:=$8000000000000000;
        else
          begin
            if taicpu(p).oper[0]^.val >= 64 then
              { Shouldn't happen realistically, since the register
                is guaranteed to be set to zero at this point }
              mask := 0
            else
              mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
          end;
      end;

      if taicpu(hp1).oper[0]^.val = mask then
        begin
          { Everything checks out, perform the optimisation, as long as
            the FLAGS register isn't being used }
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.next));
{$ifdef x86_64}
          if (hp1 <> hp2) then
            begin
              { "shl/mov/and" version }
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
              { Don't do the optimisation if the FLAGS register is in use }
              if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
                  { Don't remove the 'mov' instruction if its register is used
                    elsewhere; only set Result to True if it was removed }
                  if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
                    begin
                      RemoveInstruction(hp1);
                      Result := True;
                    end;
                  RemoveInstruction(hp2);
                end;
            end
          else
{$endif x86_64}
            begin
              { "shl/and" version }
              { Don't do the optimisation if the FLAGS register is in use }
              if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
                  RemoveInstruction(hp1);
                  Result := True;
                end;
            end;
          Exit;
        end
      else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
        begin
          { Even if the mask doesn't allow for its removal, we might be
            able to optimise the mask for the "shl/and" version, which
            may permit other peephole optimisations }
{$ifdef DEBUG_AOPTCPU}
          mask := taicpu(hp1).oper[0]^.val and mask;
          if taicpu(hp1).oper[0]^.val <> mask then
            begin
              DebugMsg(
                SPeepholeOptimization +
                'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
                ' to $' + debug_tostr(mask) +
                ' based on previous instruction (ShlAnd2ShlAnd)', hp1);
              taicpu(hp1).oper[0]^.val := mask;
            end;
{$else DEBUG_AOPTCPU}
          { If debugging is off, just set the operand even if it's the same }
          taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
{$endif DEBUG_AOPTCPU}
        end;
    end;

  {
    change
      shl/sal const,reg
      <op>    ...(...,reg,1),...
    into
      <op>    ...(...,reg,1 shl const),...
    if const in 1..3
  }
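  { e.g. (illustrative):
        shll $2,%ecx
        movl (%edx,%ecx,1),%eax
    becomes
        movl (%edx,%ecx,4),%eax
    provided %ecx is not used afterwards }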
  if MatchOpType(taicpu(p), top_const, top_reg) and
    (taicpu(p).oper[0]^.val in [1..3]) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
    ((MatchInstruction(hp1,A_MOV,A_LEA,[]) and
      MatchOpType(taicpu(hp1),top_ref,top_reg)) or
     (MatchInstruction(hp1,A_FST,A_FSTP,A_FLD,[]) and
      MatchOpType(taicpu(hp1),top_ref))
    ) and
    (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[0]^.ref^.index) and
    (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^.ref^.base) and
    (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
        begin
          taicpu(hp1).oper[0]^.ref^.scalefactor:=1 shl taicpu(p).oper[0]^.val;
          DebugMsg(SPeepholeOptimization + 'ShlOp2Op', p);
          RemoveCurrentP(p);
          Result:=true;
          exit;
        end;
    end;

  if MatchOpType(taicpu(p), top_const, top_reg) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
    MatchInstruction(hp1,A_SHL,[taicpu(p).opsize]) and
    MatchOpType(taicpu(hp1),top_const,top_reg) and
    (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[1]^.reg) then
    begin
      shiftval:=taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val;
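      { If the combined shift count meets or exceeds the operand width, the
        result is always zero, e.g. (illustrative):
            shll $20,%eax
            shll $15,%eax
        becomes
            movl $0,%eax }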
      if ((taicpu(p).opsize=S_B) and (shiftval>7)) or
        ((taicpu(p).opsize=S_W) and (shiftval>15)) or
{$ifdef x86_64}
        ((taicpu(p).opsize=S_Q) and (shiftval>63)) or
{$endif x86_64}
        ((taicpu(p).opsize=S_L) and (shiftval>31)) then
        begin
          DebugMsg(SPeepholeOptimization + 'ShlShl2Mov', p);
          taicpu(hp1).opcode:=A_MOV;
          taicpu(hp1).oper[0]^.val:=0;
        end
      else
        begin
          DebugMsg(SPeepholeOptimization + 'ShlShl2Shl', p);
          taicpu(hp1).oper[0]^.val:=shiftval;
        end;
      RemoveCurrentP(p);
      Result:=true;
      exit;
    end;
end;


class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
begin
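  { A combination is foldable when the SHR has already cleared every bit that
    the MOVZX would zero out, e.g. (illustrative):
        shrl   $24,%eax
        movzbl %al,%eax    <- redundant, bits 8..31 are already zero }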
  case shr_size of
    S_B:
      { No valid combinations }
      Result := False;
    S_W:
      Result := (Shift >= 8) and (movz_size = S_BW);
    S_L:
      Result :=
        (Shift >= 24) { Any opsize is valid for this shift } or
        ((Shift >= 16) and (movz_size = S_WL));
{$ifdef x86_64}
    S_Q:
      Result :=
        (Shift >= 56) { Any opsize is valid for this shift } or
        ((Shift >= 48) and (movz_size = S_WL));
{$endif x86_64}
    else
      InternalError(2022081510);
  end;
end;


function TX86AsmOptimizer.HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
var
  hp1, hp2: tai;
  IdentityMask, Shift: TCGInt;
  LimitSize: Topsize;
  DoNotMerge: Boolean;
begin
  if not MatchInstruction(p, A_SHR, []) then
    InternalError(2025040301);
  Result := False;
  DoNotMerge := False;
  Shift := taicpu(p).oper[0]^.val;
  LimitSize := taicpu(p).opsize;
  hp1 := p;
  repeat
    if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
      Exit;
    case taicpu(hp1).opcode of
      A_AND:
        { Detect:
            shr x, %reg
            and y, %reg
          If "and y, %reg" doesn't actually change the value of %reg
          (e.g. with "shrl $24,%reg; andl $255,%reg"), remove the AND
          instruction. (Post-peephole only)
        }
        if PostPeephole and
          (taicpu(hp1).opsize = taicpu(p).opsize) and
          MatchOpType(taicpu(hp1), top_const, top_reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
          begin
            { Make sure the FLAGS register isn't in use }
            TransferUsedRegs(TmpUsedRegs);
            hp2 := p;
            repeat
              UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
            until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
            if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
              begin
                { Generate the identity mask }
                case taicpu(p).opsize of
                  S_B:
                    IdentityMask := $FF shr Shift;
                  S_W:
                    IdentityMask := $FFFF shr Shift;
                  S_L:
                    IdentityMask := $FFFFFFFF shr Shift;
{$ifdef x86_64}
                  S_Q:
                    { We need to force the operands to be unsigned 64-bit
                      integers otherwise the wrong value is generated }
                    IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
{$endif x86_64}
                  else
                    InternalError(2022081501);
                end;
                if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
                  begin
                    DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
                    { All the possible 1 bits are covered, so we can remove the AND }
                    hp2 := tai(hp1.Previous);
                    RemoveInstruction(hp1);
                    { p wasn't actually changed, so don't set Result to True,
                      but a change was nonetheless made elsewhere }
                    Include(OptsToCheck, aoc_ForceNewIteration);
                    { Do another pass in case other AND or MOVZX instructions
                      follow }
                    hp1 := hp2;
                    Continue;
                  end;
              end;
          end;
      A_TEST, A_CMP:
        { Skip over relevant comparisons, but shift instructions must not be
          merged from this point on, since the original value is being read }
        begin
          DoNotMerge := True;
          Continue;
        end;
      A_Jcc:
        { Skip over conditional jumps and relevant comparisons }
        Continue;
      A_MOVZX:
        if MatchOpType(taicpu(hp1), top_reg, top_reg) and
          SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
          begin
            { Since the original register is being read as is, subsequent
              SHRs must not be merged at this point }
            DoNotMerge := True;
            if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
              begin
                if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
                  begin
                    { If the MOVZX instruction reads and writes the same register,
                      defer this to the post-peephole optimisation stage }
                    if PostPeephole then
                      begin
                        DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
                        { All the possible 1 bits are covered, so we can remove the MOVZX }
                        hp2 := tai(hp1.Previous);
                        RemoveInstruction(hp1);
                        hp1 := hp2;
                      end;
                  end
                else { Different register target }
                  begin
                    DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
                    taicpu(hp1).opcode := A_MOV;
                    setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
                    case taicpu(hp1).opsize of
                      S_BW:
                        taicpu(hp1).opsize := S_W;
                      S_BL, S_WL:
                        taicpu(hp1).opsize := S_L;
                      else
                        InternalError(2022081503);
                    end;
                    { p itself hasn't changed, so no need to set Result to True }
                    Include(OptsToCheck, aoc_ForceNewIteration);
                    { See if there's anything afterwards that can be
                      optimised, since the input register hasn't changed }
                    Continue;
                  end;
                Exit;
              end
            else if PostPeephole and
              (Shift > 0) and
              (taicpu(p).opsize = S_W) and
              (taicpu(hp1).opsize = S_WL) and
              (taicpu(hp1).oper[0]^.reg = NR_AX) and
              (taicpu(hp1).oper[1]^.reg = NR_EAX) then
              begin
                { Detect:
                    shr    x, %ax (x > 0)
                    ...
                    movzwl %ax,%eax

                  Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
                  But first, check to see if movzwl %ax,%eax can be removed...
                }
                hp2 := tai(hp1.Previous);
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegsBetween(UsedRegs, p, hp1);
                if PostPeepholeOptMovZX(hp1) then
                  hp1 := hp2
                else
                  begin
                    DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
                    taicpu(hp1).opcode := A_CWDE;
                    taicpu(hp1).clearop(0);
                    taicpu(hp1).clearop(1);
                    taicpu(hp1).ops := 0;
                  end;
                RestoreUsedRegs(TmpUsedRegs);
                { Don't need to set aoc_ForceNewIteration if
                  PostPeepholeOptMovZX returned True because it's the
                  post-peephole stage }
              end;
            { Move onto the next instruction }
            Continue;
          end;
      A_SHL, A_SAL, A_SHR:
        if (taicpu(hp1).opsize <= LimitSize) and
          MatchOpType(taicpu(hp1), top_const, top_reg) and
          SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
          begin
            { Make sure the sizes don't exceed the register size limit
              (measured by the shift value falling below the limit) }
            if taicpu(hp1).opsize < LimitSize then
              LimitSize := taicpu(hp1).opsize;
            if taicpu(hp1).opcode = A_SHR then
              Inc(Shift, taicpu(hp1).oper[0]^.val)
            else
              begin
                Dec(Shift, taicpu(hp1).oper[0]^.val);
                DoNotMerge := True;
              end;
            if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
              Exit;
            { Since we've established that the combined shift is within
              limits, we can actually combine the adjacent SHR
              instructions even if they're different sizes }
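            { e.g. (illustrative): shrl $3,%eax; ...; shrl $2,%eax
              can be merged into shrl $5,%eax }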
            if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
              begin
                hp2 := tai(hp1.Previous);
                DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
                Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
                RemoveInstruction(hp1);
                hp1 := hp2;
                { Though p has changed, only the constant has, and its
                  effects can still be detected on the next iteration of
                  the repeat..until loop }
                Include(OptsToCheck, aoc_ForceNewIteration);
              end;
            { Move onto the next instruction }
            Continue;
          end;
      else
        ;
    end;
    { If the register isn't actually modified, move onto the next instruction,
      but set DoNotMerge to True since the register is being read }
    if (
      { Under -O2 and below, GetNextInstructionUsingReg only returns
        the next instruction, whether or not it contains the register }
      (cs_opt_level3 in current_settings.optimizerswitches) or
      RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
    ) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
      begin
        DoNotMerge := True;
        Continue;
      end;
    Break;
  until False;
end;


function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
begin
  Result := False;
  { All these optimisations work on "shr const,%reg" }
  if not MatchOpType(taicpu(p), top_const, top_reg) then
    Exit;
  Result := HandleSHRMerge(p, False);
end;


function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
var
  CurrentRef: TReference;
  FullReg: TRegister;
  hp1, hp2: tai;
begin
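  { Pattern handled here (illustrative):
        movb %cl,(%eax)      (first_mov)
        movb $0,1(%eax)      (second_mov)
        movw $0,2(%eax)      (or two further byte-sized zero writes)
    becomes
        movzbl %cl,%ecx
        movl   %ecx,(%eax) }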
  Result := False;
  if (first_mov.opsize <> S_B) or (second_mov.opsize <> S_B) then
    Exit;
  { We assume you've checked if the operand is actually a reference by
    this point. If it isn't, you'll most likely get an access violation }
  CurrentRef := first_mov.oper[1]^.ref^;
  { Memory must be aligned }
  if (CurrentRef.offset mod 4) <> 0 then
    Exit;
  Inc(CurrentRef.offset);
  CurrentRef.alignment := 1; { Otherwise references_equal will return False }
  if MatchOperand(second_mov.oper[0]^, 0) and
    references_equal(second_mov.oper[1]^.ref^, CurrentRef) and
    GetNextInstruction(second_mov, hp1) and
    (hp1.typ = ait_instruction) and
    (taicpu(hp1).opcode = A_MOV) and
    MatchOpType(taicpu(hp1), top_const, top_ref) and
    (taicpu(hp1).oper[0]^.val = 0) then
    begin
      Inc(CurrentRef.offset);
      CurrentRef.alignment := taicpu(hp1).oper[1]^.ref^.alignment; { Otherwise references_equal might return False }
      FullReg := newreg(R_INTREGISTER,getsupreg(first_mov.oper[0]^.reg), R_SUBD);
      if references_equal(taicpu(hp1).oper[1]^.ref^, CurrentRef) then
        begin
          case taicpu(hp1).opsize of
            S_B:
              if GetNextInstruction(hp1, hp2) and
                MatchInstruction(taicpu(hp2), A_MOV, [S_B]) and
                MatchOpType(taicpu(hp2), top_const, top_ref) and
                (taicpu(hp2).oper[0]^.val = 0) then
                begin
                  Inc(CurrentRef.offset);
                  CurrentRef.alignment := 1; { Otherwise references_equal will return False }
                  if references_equal(taicpu(hp2).oper[1]^.ref^, CurrentRef) and
                    (taicpu(hp2).opsize = S_B) then
                    begin
                      RemoveInstruction(hp1);
                      RemoveInstruction(hp2);
                      first_mov.opsize := S_L;
                      if first_mov.oper[0]^.typ = top_reg then
                        begin
                          DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVZX/MOVl', first_mov);
                          { Reuse second_mov as a MOVZX instruction }
                          second_mov.opcode := A_MOVZX;
                          second_mov.opsize := S_BL;
                          second_mov.loadreg(0, first_mov.oper[0]^.reg);
                          second_mov.loadreg(1, FullReg);
                          first_mov.oper[0]^.reg := FullReg;
                          asml.Remove(second_mov);
                          asml.InsertBefore(second_mov, first_mov);
                        end
                      else
                        { It's a value }
                        begin
                          DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVl', first_mov);
                          RemoveInstruction(second_mov);
                        end;
                      Result := True;
                      Exit;
                    end;
                end;
            S_W:
              begin
                RemoveInstruction(hp1);
                first_mov.opsize := S_L;
                if first_mov.oper[0]^.typ = top_reg then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVZX/MOVl', first_mov);
                    { Reuse second_mov as a MOVZX instruction }
                    second_mov.opcode := A_MOVZX;
                    second_mov.opsize := S_BL;
                    second_mov.loadreg(0, first_mov.oper[0]^.reg);
                    second_mov.loadreg(1, FullReg);
                    first_mov.oper[0]^.reg := FullReg;
                    asml.Remove(second_mov);
                    asml.InsertBefore(second_mov, first_mov);
                  end
                else
                  { It's a value }
                  begin
                    DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVl', first_mov);
                    RemoveInstruction(second_mov);
                  end;
                Result := True;
                Exit;
              end;
            else
              ;
          end;
        end;
    end;
end;


function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
{ returns true if a "continue" should be done after this optimization }
var
  hp1, hp2, hp3: tai;
begin
  Result := false;
  hp3 := nil;
  if MatchOpType(taicpu(p),top_ref) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = ait_instruction) and
    (((taicpu(hp1).opcode = A_FLD) and
      (taicpu(p).opcode = A_FSTP)) or
     ((taicpu(p).opcode = A_FISTP) and
      (taicpu(hp1).opcode = A_FILD))) and
    MatchOpType(taicpu(hp1),top_ref) and
    (taicpu(hp1).opsize = taicpu(p).opsize) and
    RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
    begin
      { replacing fstp f;fld f by fst f is only valid for extended because of rounding,
        or if fastmath is on }
      if ((taicpu(p).opsize=S_FX) or (cs_opt_fastmath in current_settings.optimizerswitches)) and
        GetNextInstruction(hp1, hp2) and
        (((hp2.typ = ait_instruction) and
          IsExitCode(hp2) and
          (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
          not(assigned(current_procinfo.procdef.funcretsym) and
              (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
          (taicpu(p).oper[0]^.ref^.index = NR_NO)) or
         { fstp <temp>
           fld  <temp>
           <dealloc> <temp>
         }
         ((taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
          (taicpu(p).oper[0]^.ref^.index = NR_NO) and
          SetAndTest(FindTempDeAlloc(taicpu(p).oper[0]^.ref^.offset,tai(hp1.next)),hp2) and
          (tai_tempalloc(hp2).temppos=taicpu(p).oper[0]^.ref^.offset) and
          (((taicpu(p).opsize=S_FX) and (tai_tempalloc(hp2).tempsize=16)) or
           ((taicpu(p).opsize in [S_IQ,S_FL]) and (tai_tempalloc(hp2).tempsize=8)) or
           ((taicpu(p).opsize=S_FS) and (tai_tempalloc(hp2).tempsize=4))
          )
         )
        ) then
        begin
          DebugMsg(SPeepholeOptimization + 'FstpFld2<Nop>',p);
          RemoveInstruction(hp1);
          RemoveCurrentP(p, hp2);
          { first case: exit code }
          if hp2.typ = ait_instruction then
            RemoveLastDeallocForFuncRes(p);
          Result := true;
        end
      else
        { we can do this only in fast math mode as fstp is rounding ...
          ... still disabled as it breaks the compiler and/or rtl }
        if { (cs_opt_fastmath in current_settings.optimizerswitches) or }
          { ... or if another fstp equal to the first one follows }
          GetNextInstruction(hp1,hp2) and
          (hp2.typ = ait_instruction) and
          (taicpu(p).opcode=taicpu(hp2).opcode) and
          (taicpu(p).opsize=taicpu(hp2).opsize) then
          begin
            if (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
              (taicpu(p).oper[0]^.ref^.index = NR_NO) and
              SetAndTest(FindTempDeAlloc(taicpu(p).oper[0]^.ref^.offset,tai(hp2.next)),hp3) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
              (tai_tempalloc(hp3).temppos=taicpu(p).oper[0]^.ref^.offset) and
              (((taicpu(p).opsize=S_FX) and (tai_tempalloc(hp3).tempsize=16)) or
               ((taicpu(p).opsize in [S_IQ,S_FL]) and (tai_tempalloc(hp3).tempsize=8)) or
               ((taicpu(p).opsize=S_FS) and (tai_tempalloc(hp3).tempsize=4))
              ) then
              begin
                DebugMsg(SPeepholeOptimization + 'FstpFldFstp2Fstp',p);
                RemoveCurrentP(p,hp2);
                RemoveInstruction(hp1);
                Result := true;
              end
            else if { fst can't store an extended/comp value }
              (taicpu(p).opsize <> S_FX) and
              (taicpu(p).opsize <> S_IQ) then
              begin
                if (taicpu(p).opcode = A_FSTP) then
                  taicpu(p).opcode := A_FST
                else
                  taicpu(p).opcode := A_FIST;
                DebugMsg(SPeepholeOptimization + 'FstpFld2Fst',p);
                RemoveInstruction(hp1);
                Result := true;
              end;
          end;
    end;
end;


function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
var
  hp1, hp2, hp3: tai;
begin
  result:=false;
  if MatchOpType(taicpu(p),top_reg) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = Ait_Instruction) and
    MatchOpType(taicpu(hp1),top_reg,top_reg) and
    (taicpu(hp1).oper[0]^.reg = NR_ST) and
    (taicpu(hp1).oper[1]^.reg = NR_ST1) then
    { change
        fld   reg
        fxxxp st, st1 (hp1)
      to
        fxxx  reg,st
      Remark: non commutative operations must be reversed!
    }
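    { For instance (illustrative):
          fld   %st(2)
          fsubp %st,%st(1)
      becomes
          fsubr %st(2),%st
      (the opcode is reversed because the operand order is swapped) }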
    begin
      case taicpu(hp1).opcode Of
        A_FMULP,A_FADDP,
        A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
          begin
            case taicpu(hp1).opcode Of
              A_FADDP: taicpu(hp1).opcode := A_FADD;
              A_FMULP: taicpu(hp1).opcode := A_FMUL;
              A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
              A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
              A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
              A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
              else
                internalerror(2019050534);
            end;
            taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
            taicpu(hp1).oper[1]^.reg := NR_ST;
            DebugMsg(SPeepholeOptimization + 'FldF*p2F*',hp1);
            RemoveCurrentP(p, hp1);
            Result:=true;
            exit;
          end;
        else
          ;
      end;
    end
  else if MatchOpType(taicpu(p),top_ref) and
    GetNextInstruction(p, hp2) and
    (hp2.typ = Ait_Instruction) and
    MatchOpType(taicpu(hp2),top_reg,top_reg) and
    (taicpu(p).opsize in [S_FS, S_FL]) and
    (taicpu(hp2).oper[0]^.reg = NR_ST) and
    (taicpu(hp2).oper[1]^.reg = NR_ST1) then
    if GetLastInstruction(p, hp1) and
      MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp1),top_ref) and
      RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
      if ((taicpu(hp2).opcode = A_FMULP) or
          (taicpu(hp2).opcode = A_FADDP)) then
        { change
            fld/fst mem1 (hp1)
            fld     mem1 (p)
            faddp/fmulp st, st1 (hp2)
          to
            fld/fst   mem1
            fadd/fmul st, st }
        begin
          DebugMsg(SPeepholeOptimization + 'Fld/FstFldFaddp/Fmulp2Fld/FstFadd/Fmul',hp1);
          RemoveCurrentP(p, hp1);
          if (taicpu(hp2).opcode = A_FADDP) then
            taicpu(hp2).opcode := A_FADD
          else
            taicpu(hp2).opcode := A_FMUL;
          taicpu(hp2).oper[1]^.reg := NR_ST;
        end
      else
        { change
            fld/fst mem1 (hp1)
            fld     mem1 (p)
          to
            fld/fst mem1
            fld     st }
        begin
          DebugMsg(SPeepholeOptimization + 'Fld/Fst<mem>Fld<mem>2Fld/Fst<mem>Fld<reg>',hp1);
          taicpu(p).changeopsize(S_FL);
          taicpu(p).loadreg(0,NR_ST);
        end
    else
      begin
        case taicpu(hp2).opcode Of
          A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
            { change
                fld/fst mem1 (hp1)
                fld     mem2 (p)
                fxxxp   st, st1 (hp2)
              to
                fld/fst mem1
                fxxx    mem2 }
            begin
              case taicpu(hp2).opcode Of
                A_FADDP: taicpu(p).opcode := A_FADD;
                A_FMULP: taicpu(p).opcode := A_FMUL;
                A_FSUBP: taicpu(p).opcode := A_FSUBR;
                A_FSUBRP: taicpu(p).opcode := A_FSUB;
                A_FDIVP: taicpu(p).opcode := A_FDIVR;
                A_FDIVRP: taicpu(p).opcode := A_FDIV;
                else
                  internalerror(2019050533);
              end;
              DebugMsg(SPeepholeOptimization + 'Fld/FstFldF*2Fld/FstF*',p);
              RemoveInstruction(hp2);
            end
          else
            ;
        end
      end
end;


function IsCmpSubset(cond1, cond2: TAsmCond): Boolean; inline;
begin
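  { For example, E ("equal") is treated as a subset of AE ("above or equal"):
    whenever a je would branch, a jae testing the same operands would branch
    too (illustrative) }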
  Result := condition_in(cond1, cond2) or
    { Not strictly subsets due to the actual flags checked, but because we're
      comparing integers, E is a subset of AE and GE and their aliases }
    ((cond1 in [C_E, C_Z]) and (cond2 in [C_AE, C_NB, C_NC, C_GE, C_NL]));
end;


function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
var
  v: TCGInt;
  true_hp1, hp1, hp2, p_dist, p_jump, hp1_dist, p_label, hp1_label: tai;
  FirstMatch, TempBool: Boolean;
  NewReg: TRegister;
  JumpLabel, JumpLabel_dist, JumpLabel_far: TAsmLabel;
begin
  Result:=false;
  { All these optimisations need a next instruction }
  if not GetNextInstruction(p, hp1) then
    Exit;
  true_hp1 := hp1;

  { Search for:
      cmp   ###,###
      j(c1) @lbl1
      ...
    @lbl:
      cmp   ###,### (same comparison as above)
      j(c2) @lbl2
    If c1 is a subset of c2, change to:
      cmp   ###,###
      j(c1) @lbl2
    (@lbl1 may become a dead label as a result)
  }
  { Also handle cases where there are multiple jumps in a row }
  p_jump := hp1;
  while Assigned(p_jump) and MatchInstruction(p_jump, A_JCC, []) do
    begin
      Prefetch(p_jump.Next);
      if IsJumpToLabel(taicpu(p_jump)) then
        begin
          { Do jump optimisations first in case the condition becomes
            unnecessary }
          TempBool := True;
          if DoJumpOptimizations(p_jump, TempBool) or
            not TempBool then
            begin
              if Assigned(p_jump) then
                begin
                  { If CollapseZeroDistJump optimised the jump away, p_jump
                    will now be the label (or an align before it), whether
                    the label is live or dead }
                  if (p_jump.typ = ait_align) or
                    (
                      (p_jump.typ = ait_label) and
                      not (tai_label(p_jump).labsym.is_used)
                    ) then
                    GetNextInstruction(p_jump, p_jump);
                end;
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
              if not Assigned(p_jump) or
                (
                  not MatchInstruction(p_jump, A_Jcc, A_SETcc, A_CMOVcc, []) and
                  not RegUsedAfterInstruction(NR_DEFAULTFLAGS, p_jump, TmpUsedRegs)
                ) then
                begin
                  { No more conditional jumps; conditional statement is no longer required }
                  DebugMsg(SPeepholeOptimization + 'Removed unnecessary condition (Cmp2Nop)', p);
                  RemoveCurrentP(p);
                  Result := True;
                  Exit;
                end;
              hp1 := p_jump;
              Include(OptsToCheck, aoc_ForceNewIteration);
              Continue;
            end;
          JumpLabel := TAsmLabel(taicpu(p_jump).oper[0]^.ref^.symbol);
          if GetNextInstruction(p_jump, hp2) and
            (
              OptimizeConditionalJump(JumpLabel, p_jump, hp2, TempBool) or
              not TempBool
            ) then
            begin
              hp1 := p_jump;
              Include(OptsToCheck, aoc_ForceNewIteration);
              Continue;
            end;
          p_label := nil;
          if Assigned(JumpLabel) then
            p_label := getlabelwithsym(JumpLabel);
          if Assigned(p_label) and
            GetNextInstruction(p_label, p_dist) and
            MatchInstruction(p_dist, A_CMP, []) and
            MatchOperand(taicpu(p_dist).oper[0]^, taicpu(p).oper[0]^) and
            MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
            GetNextInstruction(p_dist, hp1_dist) and
            MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
            begin
              JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
              if JumpLabel = JumpLabel_dist then
                { This is an infinite loop }
                Exit;
              { Best optimisation when the first condition is a subset (or equal) of the second }
              if IsCmpSubset(taicpu(p_jump).condition, taicpu(hp1_dist).condition) then
                begin
                  { Any registers used here will already be allocated }
                  if Assigned(JumpLabel) then
                    JumpLabel.DecRefs;
                  DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc -> CMP/Jcc, redirecting first jump', p_jump);
                  taicpu(p_jump).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); { This also increases the reference count }
                  Include(OptsToCheck, aoc_ForceNewIteration);
                  { Don't exit yet. Since p and p_jump haven't actually been
                    removed, we can check for more on this iteration }
                end
              else if IsCmpSubset(taicpu(hp1_dist).condition, inverse_cond(taicpu(p_jump).condition)) and
                GetNextInstruction(hp1_dist, hp1_label) and
                (hp1_label.typ = ait_label) then
                begin
                  JumpLabel_far := tai_label(hp1_label).labsym;
                  if (JumpLabel_far = JumpLabel_dist) or (JumpLabel_far = JumpLabel) then
                    { This is an infinite loop }
                    Exit;
                  if Assigned(JumpLabel_far) then
                    begin
                      { In this situation, if the first jump branches, the second
                        one will never branch, so change the destination label to
                        after the second jump }
                      DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc/@Lbl -> CMP/Jcc, redirecting first jump to 2nd label', p_jump);
                      if Assigned(JumpLabel) then
                        JumpLabel.DecRefs;
                      JumpLabel_far.IncRefs;
                      taicpu(p_jump).oper[0]^.ref^.symbol := JumpLabel_far;
                      Result := True;
                      { Don't exit yet. Since p and p_jump haven't actually been
                        removed, we can check for more on this iteration }
                      Continue;
                    end;
                end;
            end;
        end;

      { Search for:
          cmp   ###,###
          j(c1) @lbl1
          cmp   ###,### (same as first)
        Remove second cmp
      }
      if GetNextInstruction(p_jump, hp2) and
        (
          (
            MatchInstruction(hp2, A_CMP, [taicpu(p).opsize]) and
            (
              (
                MatchOpType(taicpu(p), top_const, top_reg) and
                MatchOpType(taicpu(hp2), top_const, top_reg) and
                (taicpu(hp2).oper[0]^.val = taicpu(p).oper[0]^.val) and
                Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
              ) or (
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^)
              )
            )
          ) or (
            { Also match cmp $0,%reg; jcc @lbl; test %reg,%reg }
            MatchOperand(taicpu(p).oper[0]^, 0) and
            (taicpu(p).oper[1]^.typ = top_reg) and
            MatchInstruction(hp2, A_TEST, []) and
            MatchOpType(taicpu(hp2), top_reg, top_reg) and
            (taicpu(hp2).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) and
            Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
          )
        ) then
        begin
          DebugMsg(SPeepholeOptimization + 'CMP/Jcc/CMP; removed superfluous CMP', hp2);
          TransferUsedRegs(TmpUsedRegs);
          AllocRegBetween(NR_DEFAULTFLAGS, p, hp2, TmpUsedRegs);
          RemoveInstruction(hp2);
          Result := True;
          { Continue the while loop in case "Jcc/CMP" follows the second CMP that was just removed }
        end
      else
        begin
          { hp2 is the next instruction, so save time and just set p_jump
            to it instead of calling GetNextInstruction below }
          p_jump := hp2;
          Continue;
        end;
      GetNextInstruction(p_jump, p_jump);
    end;

  if (
    { Don't call GetNextInstruction again if we already have it }
    (true_hp1 = p_jump) or
    GetNextInstruction(p, hp1)
  ) and
    MatchInstruction(hp1, A_Jcc, []) and
    IsJumpToLabel(taicpu(hp1)) and
    (taicpu(hp1).condition in [C_E, C_Z, C_NE, C_NZ]) and
    GetNextInstruction(hp1, hp2) then
    begin
      {
        cmp x, y (or "cmp y, x")
        je  @lbl
        mov x, y
      @lbl:
        (x and y can be constants, registers or references)
        Change to:
        mov x, y (x and y will always be equal in the end)
      @lbl: (may become a dead label)

        Also:
        cmp x, y (or "cmp y, x")
        jne @lbl
        mov x, y
      @lbl:
        (x and y can be constants, registers or references)
        Change to:
        Absolutely nothing! (Except @lbl if it's still live)
      }
      if MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
        (
          (
            MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[0]^) and
            MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^)
          ) or (
            MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[1]^) and
            MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[0]^)
          )
        ) and
        GetNextInstruction(hp2, hp1_label) and
        (hp1_label.typ = ait_label) and
        (tai_label(hp1_label).labsym = taicpu(hp1).oper[0]^.ref^.symbol) then
        begin
          tai_label(hp1_label).labsym.DecRefs;
          if (taicpu(hp1).condition in [C_NE, C_NZ]) then
            begin
              DebugMsg(SPeepholeOptimization + 'CMP/JNE/MOV/@Lbl -> NOP, since the MOV is only executed if the operands are equal (CmpJneMov2Nop)', p);
              RemoveInstruction(hp2);
              hp2 := hp1_label; { So RemoveCurrentp below can be set to something valid }
            end
          else
            DebugMsg(SPeepholeOptimization + 'CMP/JE/MOV/@Lbl -> MOV, since the MOV is only executed if the operands aren''t equal (CmpJeMov2Mov)', p);
          RemoveInstruction(hp1);
          RemoveCurrentp(p, hp2);
          Result := True;
          Exit;
        end;

      {
        Try to optimise the following:
          cmp  $x,### ($x and $y can be registers or constants)
          je   @lbl1  (only reference)
          cmp  $y,### (### are identical)
        @Lbl:
          sete %reg1
        Change to:
          cmp  $x,###
          sete %reg2  (allocate new %reg2)
          cmp  $y,###
          sete %reg1
          orb  %reg2,%reg1
          (dealloc %reg2)
        This adds an instruction (so don't perform under -Os), but it removes
        a conditional branch.
      }
      if not (cs_opt_size in current_settings.optimizerswitches) and
        MatchInstruction(hp2, A_CMP, A_TEST, [taicpu(p).opsize]) and
        MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^) and
        { The first operand of CMP instructions can only be a register or
          immediate anyway, so no need to check }
        GetNextInstruction(hp2, p_label) and
        (p_label.typ = ait_label) and
        (tai_label(p_label).labsym.getrefs = 1) and
        (JumpTargetOp(taicpu(hp1))^.ref^.symbol = tai_label(p_label).labsym) and
        GetNextInstruction(p_label, p_dist) and
        MatchInstruction(p_dist, A_SETcc, []) and
        (taicpu(p_dist).condition in [C_E, C_Z]) and
        (taicpu(p_dist).oper[0]^.typ = top_reg) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(p_label.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
          if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
            { Get the instruction after the SETcc instruction so we can
              allocate a new register over the entire range }
            GetNextInstruction(p_dist, hp1_dist) then
            begin
              { Register can appear in p if it's not used afterwards, so only
                allocate between hp1 and hp1_dist }
              NewReg := GetIntRegisterBetween(R_SUBL, TmpUsedRegs, hp1, hp1_dist);
              if NewReg <> NR_NO then
                begin
                  DebugMsg(SPeepholeOptimization + 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR, removing conditional branch', p);
                  { Change the jump instruction into a SETcc instruction }
                  taicpu(hp1).opcode := A_SETcc;
                  taicpu(hp1).opsize := S_B;
                  taicpu(hp1).loadreg(0, NewReg);
                  { This is now a dead label }
                  tai_label(p_label).labsym.decrefs;
                  { Prefer adding before the next instruction so the FLAGS
                    register is deallocated first }
                  AsmL.InsertBefore(
                    taicpu.op_reg_reg(A_OR, S_B, NewReg, taicpu(p_dist).oper[0]^.reg),
                    hp1_dist
                  );
                  Result := True;
                  { Don't exit yet, as p wasn't changed and hp1, while
                    modified, is still intact and might be optimised by the
                    SETcc optimisation below }
                end;
            end;
        end;
    end;

  if (taicpu(p).oper[0]^.typ = top_const) and
    MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
    begin
      if (taicpu(p).oper[0]^.val = 0) and
        (taicpu(p).oper[1]^.typ = top_reg) then
        begin
          hp2 := p;
          FirstMatch := True;
          { When dealing with "cmp $0,%reg", only ZF and SF contain
            anything meaningful once it's converted to "test %reg,%reg";
            additionally, some jumps will always (or never) branch, so
            evaluate every jump immediately following the
            comparison, optimising the conditions if possible.
            Similarly with SETcc... those that are always set to 0 or 1
            are changed to MOV instructions }
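          { e.g. (illustrative), after "cmpl $0,%eax":
              jb  @lbl  ->  never branches (unsigned < 0 is impossible); removed
              jbe @lbl  ->  je  @lbl
              ja  @lbl  ->  jne @lbl
              jae @lbl  ->  always branches; becomes jmp @lbl }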
  8141. while FirstMatch or { Saves calling GetNextInstruction unnecessarily }
  8142. (
  8143. GetNextInstruction(hp2, hp1) and
  8144. MatchInstruction(hp1,A_Jcc,A_SETcc,[])
  8145. ) do
  8146. begin
  8147. Prefetch(hp1.Next);
  8148. FirstMatch := False;
  8149. case taicpu(hp1).condition of
  8150. C_B, C_C, C_NAE, C_O:
  8151. { For B/NAE:
  8152. Will never branch since an unsigned integer can never be below zero
  8153. For C/O:
  8154. Result cannot overflow because 0 is being subtracted
  8155. }
  8156. begin
  8157. if taicpu(hp1).opcode = A_Jcc then
  8158. begin
  8159. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
  8160. TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
  8161. RemoveInstruction(hp1);
  8162. { Since hp1 was deleted, hp2 must not be updated }
  8163. Continue;
  8164. end
  8165. else
  8166. begin
  8167. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
  8168. { Convert "set(c) %reg" instruction to "movb 0,%reg" }
  8169. taicpu(hp1).opcode := A_MOV;
  8170. taicpu(hp1).ops := 2;
  8171. taicpu(hp1).condition := C_None;
  8172. taicpu(hp1).opsize := S_B;
  8173. taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
  8174. taicpu(hp1).loadconst(0, 0);
  8175. end;
  8176. end;
  8177. C_BE, C_NA:
  8178. begin
  8179. { Will only branch if equal to zero }
  8180. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
  8181. taicpu(hp1).condition := C_E;
  8182. end;
  8183. C_A, C_NBE:
  8184. begin
  8185. { Will only branch if not equal to zero }
  8186. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
  8187. taicpu(hp1).condition := C_NE;
  8188. end;
  8189. C_AE, C_NB, C_NC, C_NO:
  8190. begin
  8191. { Will always branch }
  8192. DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
  8193. if taicpu(hp1).opcode = A_Jcc then
  8194. begin
  8195. MakeUnconditional(taicpu(hp1));
  8196. { Any jumps/set that follow will now be dead code }
  8197. RemoveDeadCodeAfterJump(taicpu(hp1));
  8198. Break;
  8199. end
  8200. else
  8201. begin
  8202. { Convert "set(c) %reg" instruction to "movb 1,%reg" }
  8203. taicpu(hp1).opcode := A_MOV;
  8204. taicpu(hp1).ops := 2;
  8205. taicpu(hp1).condition := C_None;
  8206. taicpu(hp1).opsize := S_B;
  8207. taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
  8208. taicpu(hp1).loadconst(0, 1);
  8209. end;
  8210. end;
  8211. C_None:
  8212. InternalError(2020012201);
  8213. C_P, C_PE, C_NP, C_PO:
  8214. { We can't handle parity checks and they should never be generated
  8215. after a general-purpose CMP (it's used in some floating-point
  8216. comparisons that don't use CMP) }
  8217. InternalError(2020012202);
  8218. else
  8219. { Zero/Equality, Sign, their complements and all of the
  8220. signed comparisons do not need to be converted };
  8221. end;
  8222. hp2 := hp1;
  8223. end;
  8224. { Convert the instruction to a TEST }
  8225. taicpu(p).opcode := A_TEST;
  8226. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  8227. Result := True;
  8228. Exit;
  8229. end
  8230. else
  8231. begin
  8232. TransferUsedRegs(TmpUsedRegs);
  8233. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  8234. if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
  8235. begin
  8236. if (taicpu(p).oper[0]^.val = 1) and
  8237. (taicpu(hp1).condition in [C_L, C_NL, C_NGE, C_GE]) then
  8238. begin
  8239. { Convert; To:
  8240. cmp $1,r/m cmp $0,r/m
  8241. jl @lbl jle @lbl
  8242. (Also do inverted conditions)
  8243. }
  8244. DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
  8245. taicpu(p).oper[0]^.val := 0;
  8246. if taicpu(hp1).condition in [C_L, C_NGE] then
  8247. taicpu(hp1).condition := C_LE
  8248. else
  8249. taicpu(hp1).condition := C_NLE;
  8250. { If the instruction is now "cmp $0,%reg", convert it to a
  8251. TEST (and effectively do the work of the "cmp $0,%reg" in
  8252. the block above)
  8253. }
  8254. if (taicpu(p).oper[1]^.typ = top_reg) then
  8255. begin
  8256. taicpu(p).opcode := A_TEST;
  8257. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  8258. end;
  8259. Result := True;
  8260. Exit;
  8261. end
  8262. else if (taicpu(p).oper[1]^.typ = top_reg)
  8263. {$ifdef x86_64}
  8264. and (taicpu(p).opsize <> S_Q) { S_Q will never happen: cmp with 64 bit constants is not possible }
  8265. {$endif x86_64}
  8266. then
  8267. begin
  8268. { cmp register,$8000 neg register
  8269. je target --> jo target
  8270. .... only if register is deallocated before jump.}
  8271. case Taicpu(p).opsize of
  8272. S_B: v:=$80;
  8273. S_W: v:=$8000;
  8274. S_L: v:=qword($80000000);
  8275. else
  8276. internalerror(2013112905);
  8277. end;
                        if (taicpu(p).oper[0]^.val=v) and
                          (Taicpu(hp1).condition in [C_E,C_NE]) then
                          begin
                            TransferUsedRegs(TmpUsedRegs);
                            UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                            if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                              begin
                                DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                                Taicpu(p).opcode:=A_NEG;
                                Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                                Taicpu(p).clearop(1);
                                Taicpu(p).ops:=1;
                                if Taicpu(hp1).condition=C_E then
                                  Taicpu(hp1).condition:=C_O
                                else
                                  Taicpu(hp1).condition:=C_NO;
                                Result:=true;
                                exit;
                              end;
                          end;
                      end;
                  end;
              end;
          end;
      if TrySwapMovCmp(p, hp1) then
        begin
          Result := True;
          Exit;
        end;
    end;
    function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
      var
        hp1: tai;
      begin
        {
          remove the second (v)pxor from

            pxor reg,reg
            ...
            pxor reg,reg
        }
        Result:=false;
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOpType(taicpu(p),top_reg,top_reg) and
          GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
          MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
          MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
            RemoveInstruction(hp1);
            Result:=true;
            Exit;
          end
        {
          replace
            pxor reg1,reg1
            movapd/s reg1,reg2
            dealloc reg1
          by
            pxor reg2,reg2
        }
        else if GetNextInstruction(p,hp1) and
          { we mix single and double operations here because we assume that the compiler
            generates vmovapd only after double operations and vmovaps only after single operations }
          MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
          MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
          (taicpu(p).oper[0]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(0,taicpu(hp1).oper[1]^);
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'PXorMovapd2PXor done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
      var
        hp1: tai;
      begin
        {
          remove the second (v)pxor from

            (v)pxor reg,reg
            ...
            (v)pxor reg,reg
        }
        Result:=false;
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
          MatchOpType(taicpu(p),top_reg,top_reg,top_reg) then
          begin
            if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
              MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
              MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
              begin
                DebugMsg(SPeepholeOptimization + 'VPXorVPXor2VPXor done',hp1);
                RemoveInstruction(hp1);
                Result:=true;
                Exit;
              end;
        {$ifdef x86_64}
            {
              replace
                vpxor reg1,reg1,reg1
                vmov  reg,mem
              by
                movq $0,mem
            }
            if GetNextInstruction(p,hp1) and
              MatchInstruction(hp1,A_VMOVSD,[]) and
              MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
              MatchOpType(taicpu(hp1),top_reg,top_ref) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadconst(0,0);
                    taicpu(hp1).opcode:=A_MOV;
                    taicpu(hp1).opsize:=S_Q;
                    DebugMsg(SPeepholeOptimization + 'VPXorVMov2Mov done',p);
                    RemoveCurrentP(p);
                    result:=true;
                    Exit;
                  end;
              end;
        {$endif x86_64}
          end
        {
          replace
            vpxor reg1,reg1,reg2
          by
            vpxor reg2,reg2,reg2
          to avoid unnecessary data dependencies
        }
        else if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
          MatchOpType(taicpu(p),top_reg,top_reg,top_reg) then
          begin
            DebugMsg(SPeepholeOptimization + 'VPXor2VPXor done',p);
            { avoid unnecessary data dependency }
            taicpu(p).loadreg(0,taicpu(p).oper[2]^.reg);
            taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
            result:=true;
            exit;
          end;
        Result:=OptPass1VOP(p);
      end;
    function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            IMul const,%mreg1,%mreg2
            Mov  %mreg2,%mreg3
            dealloc %mreg2
          by
            IMul const,%mreg1,%mreg3
        }
        if (taicpu(p).ops=3) and
          GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
          (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'ImulMov2Imul done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass1SHXX(var p: tai): boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            ShXX %reg0,%reg1,%reg2
            Mov  %reg2,%reg3
            dealloc %reg2
          by
            ShXX %reg0,%reg1,%reg3
        }
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
          (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'SHXXMov2SHXX done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass1_V_Cvtss2sd(var p: tai): boolean;
      var
        hp1: tai;
      begin
        Result:=false;
        { get rid of
            (v)cvtss2sd reg0,<reg1,>reg2
            (v)cvtsd2ss reg2,<reg2,>reg0
        }
        if GetNextInstruction(p,hp1) and
          (((taicpu(p).opcode=A_CVTSS2SD) and MatchInstruction(hp1,A_CVTSD2SS,[taicpu(p).opsize]) and
            MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)) or
           ((taicpu(p).opcode=A_VCVTSS2SD) and MatchInstruction(hp1,A_VCVTSD2SS,[taicpu(p).opsize]) and
            MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
            MatchOpType(taicpu(hp1),top_reg,top_reg,top_reg) and
            (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
            (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
            (getsupreg(taicpu(p).oper[2]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg))
           )
          ) then
          begin
            if ((taicpu(p).opcode=A_CVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
              ((taicpu(p).opcode=A_VCVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg))) then
              begin
                DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Nop done',p);
                RemoveCurrentP(p);
                RemoveInstruction(hp1);
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Vmovaps done',p);
                if taicpu(hp1).opcode=A_CVTSD2SS then
                  begin
                    taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
                    taicpu(p).opcode:=A_MOVAPS;
                  end
                else
                  begin
                    taicpu(p).loadreg(1,taicpu(hp1).oper[2]^.reg);
                    taicpu(p).opcode:=A_VMOVAPS;
                  end;
                taicpu(p).ops:=2;
                RemoveInstruction(hp1);
              end;
            Result:=true;
            Exit;
          end;
      end;
    function TX86AsmOptimizer.OptPass1Jcc(var p : tai) : boolean;
      var
        hp1, hp2, hp3, hp4, hp5: tai;
        ThisReg: TRegister;
      begin
        Result := False;
        if not GetNextInstruction(p,hp1) then
          Exit;

        {
          convert
            j<c>  .L1
            mov   1,reg
            jmp   .L2
          .L1
            mov   0,reg
          .L2

          into

            mov   0,reg
            set<not(c)> reg

          Take care of alignment, and ensure that the "mov 0,reg" is not
          converted into a XOR, as this would destroy the flag contents.

          Use MOVZX if size is preferred: while "mov 0,reg" is bigger, it can
          be executed at the same time as a previous comparison:

            set<not(c)> reg
            movzx reg, reg
        }
        if MatchInstruction(hp1,A_MOV,[]) and
          (taicpu(hp1).oper[0]^.typ = top_const) and
          (
            (
              (taicpu(hp1).oper[1]^.typ = top_reg)
              {$ifdef i386}
              { Under i386, ESI, EDI, EBP and ESP
                don't have an 8-bit representation }
              and not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
              {$endif i386}
            ) or (
              {$ifdef i386}
              (taicpu(hp1).oper[1]^.typ <> top_reg) and
              {$endif i386}
              (taicpu(hp1).opsize = S_B)
            )
          ) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
          GetNextInstruction(hp2,hp3) and
          FindLabel(tasmlabel(taicpu(p).oper[0]^.ref^.symbol), hp3) and
          GetNextInstruction(hp3,hp4) and
          MatchInstruction(hp4,A_MOV,[taicpu(hp1).opsize]) and
          (taicpu(hp4).oper[0]^.typ = top_const) and
          (
            ((taicpu(hp1).oper[0]^.val = 0) and (taicpu(hp4).oper[0]^.val = 1)) or
            ((taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0))
          ) and
          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
          GetNextInstruction(hp4,hp5) and
          FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol), hp5) then
          begin
            if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
              taicpu(p).condition := inverse_cond(taicpu(p).condition);

            tai_label(hp3).labsym.DecRefs;

            { If this isn't the only reference to the middle label, we can
              still make a saving - only the first jump and everything
              that follows will remain. }
            if (tai_label(hp3).labsym.getrefs = 0) then
              begin
                if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c)',p)
                else
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c)',p);

                { remove jump, first label and second MOV (also catching any aligns) }
                repeat
                  if not GetNextInstruction(hp2, hp3) then
                    InternalError(2021040810);
                  RemoveInstruction(hp2);
                  hp2 := hp3;
                until hp2 = hp5;

                { Don't decrement the reference count before the removal loop
                  above, otherwise GetNextInstruction won't stop on the label }
                tai_label(hp5).labsym.DecRefs;
              end
            else
              begin
                if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c) (partial)',p)
                else
                  DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c) (partial)',p);
              end;

            taicpu(p).opcode:=A_SETcc;
            taicpu(p).opsize:=S_B;
            taicpu(p).is_jmp:=False;

            if taicpu(hp1).opsize=S_B then
              begin
                taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                if taicpu(hp1).oper[1]^.typ = top_reg then
                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp2, UsedRegs);
                RemoveInstruction(hp1);
              end
            else
              begin
                { Will be a register because the size can't be S_B otherwise }
                ThisReg := newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBL);
                taicpu(p).loadreg(0, ThisReg);
                AllocRegBetween(ThisReg, p, hp2, UsedRegs);

                if (cs_opt_size in current_settings.optimizerswitches) and IsMOVZXAcceptable then
                  begin
                    case taicpu(hp1).opsize of
                      S_W:
                        taicpu(hp1).opsize := S_BW;
                      S_L:
                        taicpu(hp1).opsize := S_BL;
                      {$ifdef x86_64}
                      S_Q:
                        begin
                          taicpu(hp1).opsize := S_BL;
                          { Change the destination register to 32-bit }
                          taicpu(hp1).loadreg(1, newreg(R_INTREGISTER,getsupreg(ThisReg), R_SUBD));
                        end;
                      {$endif x86_64}
                      else
                        InternalError(2021040820);
                    end;

                    taicpu(hp1).opcode := A_MOVZX;
                    taicpu(hp1).loadreg(0, ThisReg);
                  end
                else
                  begin
                    AllocRegBetween(NR_FLAGS,p,hp1,UsedRegs);
                    { hp1 is already a MOV instruction with the correct register }
                    taicpu(hp1).loadconst(0, 0);
                    { Inserting it right before p will guarantee that the flags are also tracked }
                    asml.Remove(hp1);
                    asml.InsertBefore(hp1, p);
                  end;
              end;

            Result:=true;
            exit;
          end
        else if MatchInstruction(hp1, A_CLC, A_STC, []) then
          Result := TryJccStcClcOpt(p, hp1)
        else if (hp1.typ = ait_label) then
          Result := DoSETccLblRETOpt(p, tai_label(hp1));
      end;
    function TX86AsmOptimizer.OptPass1VMOVDQ(var p: tai): Boolean;
      var
        hp1, hp2, hp3: tai;
        SourceRef, TargetRef: TReference;
        CurrentReg: TRegister;
      begin
        { VMOVDQU/VMOVDQA shouldn't even have been generated }
        if not UseAVX then
          InternalError(2021100501);

        Result := False;

        { Look for the following to simplify:

            vmovdqa/u x(mem1),    %xmmreg
            vmovdqa/u %xmmreg,    y(mem2)
            vmovdqa/u x+16(mem1), %xmmreg
            vmovdqa/u %xmmreg,    y+16(mem2)

          Change to:

            vmovdqa/u x(mem1), %ymmreg
            vmovdqa/u %ymmreg, y(mem2)
            vpxor     %ymmreg, %ymmreg, %ymmreg

          ( The VPXOR instruction is to zero the upper half, thus removing the
            need to call the potentially expensive VZEROUPPER instruction.  Other
            peephole optimisations can remove VPXOR if it's unnecessary )
        }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));

        { NOTE: In the optimisations below, if the references dictate that an
          aligned move is possible (i.e. VMOVDQA), the existing instructions
          should already be VMOVDQA, because if (x mod 32) = 0, then (x mod 16) = 0 }
        if (taicpu(p).opsize = S_XMM) and
          MatchOpType(taicpu(p), top_ref, top_reg) and
          GetNextInstruction(p, hp1) and
          MatchInstruction(hp1, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
          MatchOpType(taicpu(hp1), top_reg, top_ref) and
          not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
          begin
            SourceRef := taicpu(p).oper[0]^.ref^;
            TargetRef := taicpu(hp1).oper[1]^.ref^;

            if GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
              MatchOpType(taicpu(hp2), top_ref, top_reg) then
              begin
                { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));

                Inc(SourceRef.offset, 16);

                { Reuse the register in the first block move }
                CurrentReg := newreg(R_MMREGISTER, getsupreg(taicpu(p).oper[1]^.reg), R_SUBMMY);

                if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) and
                  not RefsMightOverlap(taicpu(p).oper[0]^.ref^, TargetRef, 32) then
                  begin
                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                    Inc(TargetRef.offset, 16);
                    if GetNextInstruction(hp2, hp3) and
                      MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
                      MatchOpType(taicpu(hp3), top_reg, top_ref) and
                      (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                      RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                      not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                      begin
                        { Update the register tracking to the new size }
                        AllocRegBetween(CurrentReg, p, hp2, UsedRegs);

                        { Remember that the offsets are 16 ahead }

                        { Switch to unaligned if the memory isn't on a 32-byte boundary }
                        if not (
                          ((SourceRef.offset mod 32) = 16) and
                          (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                        ) then
                          taicpu(p).opcode := A_VMOVDQU;

                        taicpu(p).opsize := S_YMM;
                        taicpu(p).oper[1]^.reg := CurrentReg;

                        if not (
                          ((TargetRef.offset mod 32) = 16) and
                          (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                        ) then
                          taicpu(hp1).opcode := A_VMOVDQU;

                        taicpu(hp1).opsize := S_YMM;
                        taicpu(hp1).oper[0]^.reg := CurrentReg;

                        DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p);

                        { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                        if (pi_uses_ymm in current_procinfo.flags) then
                          RemoveInstruction(hp2)
                        else
                          begin
                            { Upper 128 bits will be set to zero; change to XMM
                              to avoid the requirement of AVX2 }
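                            { (A VEX-encoded instruction writing an XMM register
                              zeroes bits 128-255 of the containing YMM register,
                              which is why an XMM-sized VPXOR is enough here) }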
                            setsubreg(CurrentReg, R_SUBMMX);
                            taicpu(hp2).opcode := A_VPXOR;
                            taicpu(hp2).opsize := S_XMM;
                            taicpu(hp2).loadreg(0, CurrentReg);
                            taicpu(hp2).loadreg(1, CurrentReg);
                            taicpu(hp2).loadreg(2, CurrentReg);
                            taicpu(hp2).ops := 3;
                          end;

                        RemoveInstruction(hp3);
                        Result := True;
                        Exit;
                      end;
                  end
                else
                  begin
                    { See if the next references are 16 less rather than 16 greater }
                    Dec(SourceRef.offset, 32); { -16 the other way }
                    if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
                      begin
                        UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                        Dec(TargetRef.offset, 16); { Only 16, not 32, as it wasn't incremented unlike SourceRef }
                        if not RefsMightOverlap(SourceRef, TargetRef, 32) and
                          GetNextInstruction(hp2, hp3) and
                          MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [taicpu(p).opsize]) and
                          MatchOpType(taicpu(hp3), top_reg, top_ref) and
                          (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                          RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                          not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                          begin
                            { Update the register tracking to the new size }
                            AllocRegBetween(CurrentReg, hp2, hp3, UsedRegs);

                            { hp2 and hp3 are the starting offsets, so mod = 0 this time }

                            { Switch to unaligned if the memory isn't on a 32-byte boundary }
                            if not(
                              ((SourceRef.offset mod 32) = 0) and
                              (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                            ) then
                              taicpu(hp2).opcode := A_VMOVDQU;

                            taicpu(hp2).opsize := S_YMM;
                            taicpu(hp2).oper[1]^.reg := CurrentReg;

                            if not (
                              ((TargetRef.offset mod 32) = 0) and
                              (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                            ) then
                              taicpu(hp3).opcode := A_VMOVDQU;

                            taicpu(hp3).opsize := S_YMM;
                            taicpu(hp3).oper[0]^.reg := CurrentReg;

                            DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p);

                            { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                            if (pi_uses_ymm in current_procinfo.flags) then
                              RemoveInstruction(hp1)
                            else
                              begin
                                { Upper 128 bits will be set to zero; change to
                                  XMM to avoid the requirement of AVX2 }
                                setsubreg(CurrentReg, R_SUBMMX);
                                taicpu(hp1).opcode := A_VPXOR;
                                taicpu(hp1).opsize := S_XMM;
                                taicpu(hp1).loadreg(0, CurrentReg);
                                taicpu(hp1).loadreg(1, CurrentReg);
                                taicpu(hp1).loadreg(2, CurrentReg);
                                taicpu(hp1).ops := 3;

                                Asml.Remove(hp1);
                                Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
                              end;

                            RemoveCurrentP(p, hp2);
                            Result := True;
                            Exit;
                          end;
                      end;
                  end;
              end;
          end;
      end;
    function TX86AsmOptimizer.CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
      var
        hp2, hp3, first_assignment: tai;
        IncCount, OperIdx: Integer;
        OrigLabel: TAsmLabel;
      begin
        Count := 0;
        Result := False;
        first_assignment := nil;

        if (LoopCount >= 20) then
          begin
            { Guard against infinite loops }
            Exit;
          end;

        if (taicpu(p).oper[0]^.typ <> top_ref) or
          (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) or
          (taicpu(p).oper[0]^.ref^.base <> NR_NO) or
          (taicpu(p).oper[0]^.ref^.index <> NR_NO) or
          not (taicpu(p).oper[0]^.ref^.symbol is TAsmLabel) then
          Exit;

        OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);

        {
          change
                jmp .L1
                ...
            .L1:
                mov ##, ##   ( multiple movs possible )
                jmp/ret
          into
                mov ##, ##
                jmp/ret
        }
        if not Assigned(hp1) then
          begin
            hp1 := GetLabelWithSym(OrigLabel);
            if not Assigned(hp1) or not SkipLabels(hp1, hp1) then
              Exit;
          end;

        hp2 := hp1;

        while Assigned(hp2) do
          begin
            if Assigned(hp2) and (hp2.typ = ait_label) then
              SkipLabels(hp2,hp2);

            if not Assigned(hp2) or (hp2.typ <> ait_instruction) then
              Break;

            case taicpu(hp2).opcode of
              A_MOVSD:
                begin
                  if taicpu(hp2).ops = 0 then
                    { Wrong MOVSD }
                    Break;
                  Inc(Count);
                  if Count >= 5 then
                    { Too many to be worthwhile }
                    Break;
                  GetNextInstruction(hp2, hp2);
                  Continue;
                end;
              A_MOV,
              A_MOVD,
              A_MOVQ,
              A_MOVSX,
              {$ifdef x86_64}
              A_MOVSXD,
              {$endif x86_64}
              A_MOVZX,
              A_MOVAPS,
              A_MOVUPS,
              A_MOVSS,
              A_MOVAPD,
              A_MOVUPD,
              A_MOVDQA,
              A_MOVDQU,
              A_VMOVSS,
              A_VMOVAPS,
              A_VMOVUPS,
              A_VMOVSD,
              A_VMOVAPD,
              A_VMOVUPD,
              A_VMOVDQA,
              A_VMOVDQU:
                begin
                  Inc(Count);
                  if Count >= 5 then
                    { Too many to be worthwhile }
                    Break;
                  GetNextInstruction(hp2, hp2);
                  Continue;
                end;
              A_JMP:
                begin
                  { Guard against infinite loops }
                  if taicpu(hp2).oper[0]^.ref^.symbol = OrigLabel then
                    Exit;

                  { Analyse this jump first in case it also duplicates assignments }
                  if CheckJumpMovTransferOpt(hp2, nil, LoopCount + 1, IncCount) then
                    begin
                      { Something did change! }
                      Result := True;

                      Inc(Count, IncCount);
                      if Count >= 5 then
                        begin
                          { Too many to be worthwhile }
                          Exit;
                        end;

                      if MatchInstruction(hp2, [A_JMP, A_RET], []) then
                        Break;
                    end;

                  Result := True;
                  Break;
                end;
              A_RET:
                begin
                  Result := True;
                  Break;
                end;
              else
                Break;
            end;
          end;
        if Result then
          begin
            { A count of zero can happen when CheckJumpMovTransferOpt is called recursively }
            if Count = 0 then
              begin
                Result := False;
                Exit;
              end;

            TransferUsedRegs(TmpUsedRegs);

            hp3 := p;
            DebugMsg(SPeepholeOptimization + 'Duplicated ' + debug_tostr(Count) + ' assignment(s) and redirected jump', p);
            while True do
              begin
                if Assigned(hp1) and (hp1.typ = ait_label) then
                  SkipLabels(hp1,hp1);

                case hp1.typ of
                  ait_regalloc:
                    if tai_regalloc(hp1).ratype = ra_dealloc then
                      begin
                        { Duplicate the register deallocation... }
                        hp3:=tai(hp1.getcopy);
                        if first_assignment = nil then
                          first_assignment := hp3;

                        asml.InsertBefore(hp3, p);

                        { ... but also reallocate it after the jump }
                        hp3:=tai(hp1.getcopy);
                        tai_regalloc(hp3).ratype := ra_alloc;
                        asml.InsertAfter(hp3, p);
                      end;
                  ait_instruction:
                    case taicpu(hp1).opcode of
                      A_JMP:
                        begin
                          { Change the original jump to the new destination }
                          OrigLabel.decrefs;
                          taicpu(hp1).oper[0]^.ref^.symbol.increfs;
                          taicpu(p).loadref(0, taicpu(hp1).oper[0]^.ref^);

                          { Set p to the first duplicated assignment so it can get optimised if needs be }
                          if not Assigned(first_assignment) then
                            InternalError(2021040810)
                          else
                            p := first_assignment;

                          Exit;
                        end;
                      A_RET:
                        begin
                          { Now change the jump into a RET instruction }
                          ConvertJumpToRET(p, hp1);

                          { Set p to the first duplicated assignment so it can get optimised if needs be }
                          if not Assigned(first_assignment) then
                            InternalError(2021040811)
                          else
                            p := first_assignment;

                          Exit;
                        end;
                      else
                        begin
                          { Duplicate the MOV instruction }
                          hp3:=tai(hp1.getcopy);
                          if first_assignment = nil then
                            first_assignment := hp3;

                          asml.InsertBefore(hp3, p);

                          { Make sure the compiler knows about any final registers written here }
                          for OperIdx := 0 to taicpu(hp3).ops - 1 do
                            with taicpu(hp3).oper[OperIdx]^ do
                              begin
                                case typ of
                                  top_ref:
                                    begin
                                      if (ref^.base <> NR_NO) and
                                        (getsupreg(ref^.base) <> RS_STACK_POINTER_REG) and
                                        (
                                          (getsupreg(ref^.base) <> RS_FRAME_POINTER_REG) or
                                          (
                                            { Allow the frame pointer if it's not being used by the procedure as such }
                                            Assigned(current_procinfo) and
                                            (current_procinfo.framepointer <> NR_FRAME_POINTER_REG)
                                          )
                                        )
                                        {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64}
                                        then
                                        begin
                                          AllocRegBetween(ref^.base, hp3, p, TmpUsedRegs);
                                          if not Assigned(first_assignment) then
                                            IncludeRegInUsedRegs(ref^.base, UsedRegs);
                                        end;

                                      if (ref^.index <> NR_NO) and
                                        (getsupreg(ref^.index) <> RS_STACK_POINTER_REG) and
                                        (
                                          (getsupreg(ref^.index) <> RS_FRAME_POINTER_REG) or
                                          (
                                            { Allow the frame pointer if it's not being used by the procedure as such }
                                            Assigned(current_procinfo) and
                                            (current_procinfo.framepointer <> NR_FRAME_POINTER_REG)
                                          )
                                        )
                                        {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} and
                                        (ref^.index <> ref^.base) then
                                        begin
                                          AllocRegBetween(ref^.index, hp3, p, TmpUsedRegs);
                                          if not Assigned(first_assignment) then
                                            IncludeRegInUsedRegs(ref^.index, UsedRegs);
                                        end;
                                    end;
                                  top_reg:
                                    begin
                                      AllocRegBetween(reg, hp3, p, TmpUsedRegs);
                                      if not Assigned(first_assignment) then
                                        IncludeRegInUsedRegs(reg, UsedRegs);
                                    end;
                                  else
                                    ;
                                end;
                              end;
                        end;
                    end;
                  else
                    InternalError(2021040720);
                end;

                if not GetNextInstruction(hp1, hp1, [ait_regalloc]) then
                  { Should have dropped out earlier }
                  InternalError(2021040710);
              end;
          end;
      end;
    const
      WriteOp: array[0..3] of set of TInsChange = (
        [Ch_Wop1, Ch_RWop1, Ch_Mop1],
        [Ch_Wop2, Ch_RWop2, Ch_Mop2],
        [Ch_Wop3, Ch_RWop3, Ch_Mop3],
        [Ch_Wop4, Ch_RWop4, Ch_Mop4]);

      RegWriteFlags: array[0..7] of set of TInsChange = (
        { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
        [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
        [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
        [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
        [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
        [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
        [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
        [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
        [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
    function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
      var
        hp2: tai;
        X: Integer;
      begin
        { If we have something like:
            op  ###,###
            mov ###,###

          Try to move the MOV instruction to before OP as long as OP and MOV don't
          interfere in regards to what they write to.

          NOTE: p must be a 2-operand instruction
        }
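        { Hedged example with placeholder registers:

            addl %ecx,%eax            movl $2,%edx
            movl $2,%edx      -->     addl %ecx,%eax

          The swap is legal because neither instruction writes to anything the
          other one reads or writes }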
        Result := False;

        if (hp1.typ <> ait_instruction) or
          taicpu(hp1).is_jmp or
          RegInInstruction(NR_DEFAULTFLAGS, hp1) then
          Exit;

        { NOP is a pipeline fence, likely marking the beginning of the function
          epilogue, so drop out.  Similarly, drop out if POP or RET are
          encountered }
        if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
          Exit;

        if (taicpu(hp1).opcode = A_MOVSD) and
          (taicpu(hp1).ops = 0) then
          { Wrong MOVSD }
          Exit;

        { Check for writes to specific registers first }
        { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
        for X := 0 to 7 do
          if (RegWriteFlags[X] * InsProp[taicpu(hp1).opcode].Ch <> [])
            and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), p) then
            Exit;

        for X := 0 to taicpu(hp1).ops - 1 do
          begin
            { Check to see if this operand writes to something }
            if ((WriteOp[X] * InsProp[taicpu(hp1).opcode].Ch) <> []) and
              { And matches something in the CMP/TEST instruction }
              (
                MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[0]^) or
                MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[1]^) or
                (
                  { If it's a register, make sure the register written to doesn't
                    appear in the cmp instruction as part of a reference }
                  (taicpu(hp1).oper[X]^.typ = top_reg) and
                  RegInInstruction(taicpu(hp1).oper[X]^.reg, p)
                )
              ) then
              Exit;
          end;

        { Check p to make sure it doesn't write to something that affects hp1 }

        { Check for writes to specific registers first }
        { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
        for X := 0 to 7 do
          if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
            and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
            Exit;

        for X := 0 to taicpu(p).ops - 1 do
          begin
            { Check to see if this operand writes to something }
            if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
              { And matches something in hp1 }
              (taicpu(p).oper[X]^.typ = top_reg) and
              RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
              Exit;
          end;

        { The instruction can be safely moved }
        asml.Remove(hp1);

        { Try to insert after the last instructions where the FLAGS register is not
          yet in use, so "mov $0,%reg" can be optimised into "xor %reg,%reg" later }
        if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
          asml.InsertBefore(hp1, hp2)

        { Failing that, try to insert after the last instructions where the
          FLAGS register is not yet in use }
        else if GetLastInstruction(p, hp2) and
          (
            (hp2.typ <> ait_instruction) or
            { Don't insert after an instruction that uses the flags when p doesn't use them }
            RegInInstruction(NR_DEFAULTFLAGS, p) or
            not RegInInstruction(NR_DEFAULTFLAGS, hp2)
          ) then
          asml.InsertAfter(hp1, hp2)
        else
          { Note, if p.Previous is nil (even if it should logically never be the
            case), FindRegAllocBackward immediately exits with False and so we
            safely land here (we can't just pass p because FindRegAllocBackward
            immediately exits on an instruction). [Kit] }
          asml.InsertBefore(hp1, p);

        DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);

        { We can't trust UsedRegs because we're looking backwards, although we
          know the registers are allocated after p at the very least, so manually
          create tai_regalloc objects if needed }
        for X := 0 to taicpu(hp1).ops - 1 do
          case taicpu(hp1).oper[X]^.typ of
            top_reg:
              begin
                asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
                IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
                AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
              end;
            top_ref:
              begin
                if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
                  begin
                    asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
                    IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
                    AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
                  end;
                if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
                  begin
                    asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
                    IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
                    AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
                  end;
              end;
            else
              ;
          end;

        Result := True;
      end;
    function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
      var
        hp2: tai;
        X: Integer;
      begin
        { If we have something like:
            cmp ###,%reg1
            mov 0,%reg2

          And no modified registers are shared, move the instruction to before
          the comparison as this means it can be optimised without worrying
          about the FLAGS register.  (CMP/MOV is generated by
          "J(c)Mov1JmpMov0 -> Set(~c)", among other things.)

          As long as the second instruction doesn't use the flags or one of the
          registers used by CMP or TEST (also check any references that use the
          registers), then it can be moved prior to the comparison.
        }
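        { Sketch of the intended payoff: once "mov $0,%reg2" sits before the
          comparison, a later pass is free to turn it into "xor %reg2,%reg2",
          because the flags that the XOR clobbers are not yet live there }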
        Result := False;

        if not TrySwapMovOp(p, hp1) then
          Exit;

        if taicpu(hp1).opcode = A_LEA then
          { The flags will be overwritten by the CMP/TEST instruction }
          ConvertLEA(taicpu(hp1));

        Result := True;

        { Can we move it one further back? }
        if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
          { Check to see if CMP/TEST is a comparison against zero }
          (
            (
              (taicpu(p).opcode = A_CMP) and
              MatchOperand(taicpu(p).oper[0]^, 0)
            ) or
            (
              (taicpu(p).opcode = A_TEST) and
              (
                OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
                MatchOperand(taicpu(p).oper[0]^, -1)
              )
            )
          ) and
          { These instructions set the zero flag if the result is zero }
          MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
          OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then
          { Looks like we can - if successful, this benefits PostPeepholeOptTestOr }
          TrySwapMovOp(hp2, hp1);
      end;
    function TX86AsmOptimizer.OptPass1STCCLC(var p: tai): Boolean;
      var
        hp1, hp2, p_last, p_dist, hp1_dist: tai;
        JumpLabel: TAsmLabel;
        TmpBool: Boolean;
      begin
        Result := False;

        { Look for:
            stc/clc
            j(c)     .L1
            ...
          .L1:
            set(n)cb %reg
            (flags deallocated)
            j(c)     .L2

          Change to:
            mov  $0/$1,%reg  (depending on whether the carry bit is cleared or not)
            j(c) .L2
        }
        p_last := p;

        while GetNextInstruction(p_last, hp1) and
          (hp1.typ = ait_instruction) and
          IsJumpToLabel(taicpu(hp1)) do
          begin
            if DoJumpOptimizations(hp1, TmpBool) then
              { Re-evaluate from p_last.  Probably could be faster, but it's guaranteed to be correct }
              Continue;

            JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
            if not Assigned(JumpLabel) then
              InternalError(2024012801);

            { Optimise the J(c); stc/clc optimisation first, since this will
              get missed if the main optimisation takes place }
            if (taicpu(hp1).opcode = A_JCC) then
              begin
                if GetNextInstruction(hp1, hp2) and
                  MatchInstruction(hp2, A_CLC, A_STC, []) and
                  TryJccStcClcOpt(hp1, hp2) then
                  begin
                    Result := True;
                    Exit;
                  end;

                hp2 := nil; { Suppress compiler warning }
                if (taicpu(hp1).condition in [C_C, C_NC]) and
                  { Make sure the flags aren't used again }
                  SetAndTest(FindRegDealloc(NR_DEFAULTFLAGS, tai(hp1.Next)), hp2) then
                  begin
                    { clc + jc = False; clc + jnc = True; stc + jc = True; stc + jnc = False }
                    if ((taicpu(p).opcode = A_STC) xor (taicpu(hp1).condition = C_NC)) then
                      begin
                        if (taicpu(p).opcode = A_STC) then
                          DebugMsg(SPeepholeOptimization + 'STC; JC -> JMP (Deterministic jump) (StcJc2Jmp)', p)
                        else
                          DebugMsg(SPeepholeOptimization + 'CLC; JNC -> JMP (Deterministic jump) (ClcJnc2Jmp)', p);

                        MakeUnconditional(taicpu(hp1));

                        { Move the jump to after the flag deallocations }
                        Asml.Remove(hp1);
                        Asml.InsertAfter(hp1, hp2);

                        RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                        Result := True;
                        Exit;
                      end
                    else
                      begin
                        if (taicpu(p).opcode = A_STC) then
                          DebugMsg(SPeepholeOptimization + 'STC; JNC -> NOP (Deterministic jump) (StcJnc2Nop)', p)
                        else
                          DebugMsg(SPeepholeOptimization + 'CLC; JC -> NOP (Deterministic jump) (ClcJc2Nop)', p);

                        { In this case, the jump is deterministic in that it will never be taken }
                        JumpLabel.DecRefs;
                        RemoveInstruction(hp1);

                        RemoveCurrentP(p); { hp1 may not have been the immediate next instruction }
                        Result := True;
                        Exit;
                      end;
                  end;
              end;

            hp2 := nil; { Suppress compiler warning }
            if
              { Make sure the carry flag doesn't appear in the jump conditions }
              not (taicpu(hp1).condition in [C_AE, C_NB, C_NC, C_B, C_C, C_NAE, C_BE, C_NA]) and
              SetAndTest(getlabelwithsym(JumpLabel), hp2) and
              GetNextInstruction(hp2, p_dist) and
              MatchInstruction(p_dist, A_Jcc, A_SETcc, []) and
              (taicpu(p_dist).condition in [C_C, C_NC]) then
              begin
                case taicpu(p_dist).opcode of
                  A_Jcc:
                    begin
                      if DoJumpOptimizations(p_dist, TmpBool) then
                        { Re-evaluate from p_last.  Probably could be faster, but it's guaranteed to be correct }
                        Continue;

                      { clc + jc = False; clc + jnc = True; stc + jc = True; stc + jnc = False }
                      if ((taicpu(p).opcode = A_STC) xor (taicpu(p_dist).condition = C_NC)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP/Jcc; ... J(N)C -> JMP/Jcc (StcClcJ(c)2Jmp)', p);
                          JumpLabel.decrefs;
                          taicpu(hp1).loadsymbol(0, taicpu(p_dist).oper[0]^.ref^.symbol, 0);
                          RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                          Result := True;
                          Exit;
                        end
                      else if GetNextInstruction(p_dist, hp1_dist) and
                        (hp1_dist.typ = ait_label) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP/Jcc; ... J(N)C; .Lbl -> JMP/Jcc .Lbl (StcClcJ(~c)Lbl2Jmp)', p);
                          JumpLabel.decrefs;
                          taicpu(hp1).loadsymbol(0, tai_label(hp1_dist).labsym, 0);
                          RemoveCurrentP(p); { hp1 may not be the immediate next instruction }
                          Result := True;
                          Exit;
                        end;
                    end;
                  A_SETcc:
                    if { Make sure the flags aren't used again }
                      SetAndTest(FindRegDealloc(NR_DEFAULTFLAGS, tai(p_dist.Next)), hp2) and
                      GetNextInstruction(hp2, hp1_dist) and
                      (hp1_dist.typ = ait_instruction) and
                      IsJumpToLabel(taicpu(hp1_dist)) and
                      not (taicpu(hp1_dist).condition in [C_AE, C_NB, C_NC, C_B, C_C, C_NAE, C_BE, C_NA]) and
                      { This works if hp1_dist or both are regular JMP instructions }
                      condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) and
                      (
                        (taicpu(p_dist).oper[0]^.typ <> top_reg) or
                        { Make sure the register isn't still in use, otherwise it
                          may get corrupted (fixes #40659) }
                        not RegUsedBetween(taicpu(p_dist).oper[0]^.reg, p, p_dist)
                      ) then
                      begin
                        taicpu(p).allocate_oper(2);
                        taicpu(p).ops := 2;
                        { clc + setc = 0; clc + setnc = 1; stc + setc = 1; stc + setnc = 0 }
                        taicpu(p).loadconst(0, TCGInt((taicpu(p).opcode = A_STC) xor (taicpu(p_dist).condition = C_NC)));
                        taicpu(p).loadoper(1, taicpu(p_dist).oper[0]^);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).opsize := S_B;

                        if (taicpu(p_dist).oper[0]^.typ = top_reg) then
                          AllocRegBetween(taicpu(p_dist).oper[0]^.reg, p, hp1, UsedRegs);

                        DebugMsg(SPeepholeOptimization + 'STC/CLC; JMP; ... SET(N)C; JMP -> MOV; JMP (StcClcSet(c)2Mov)', p);
                        JumpLabel.decrefs;
                        taicpu(hp1).loadsymbol(0, taicpu(hp1_dist).oper[0]^.ref^.symbol, 0);

                        { If a flag allocation is found, try to move it to after the MOV so "mov $0,%reg" gets optimised to "xor %reg,%reg" }
                        if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) and
                          (tai_regalloc(hp2).ratype = ra_alloc) then
                          begin
                            Asml.Remove(hp2);
                            Asml.InsertAfter(hp2, p);
                          end;

                        Result := True;
                        Exit;
                      end;
                  else
                    ;
                end;
              end;

            p_last := hp1;
          end;
      end;
    function TX86AsmOptimizer.TryJccStcClcOpt(var p, hp1: tai): Boolean;
      var
        hp2, hp3: tai;
        TempBool: Boolean;
      begin
        Result := False;
        {
          Convert:
            j(c)    .L1
            stc/clc
          .L1:
            jc/jnc  .L2
            (flags deallocated)

          To:
            j(c)    .L1
            jmp     .L2
          .L1:
            jc/jnc  .L2

          Then call DoJumpOptimizations to convert to:
            j(~c)   .L2
          .L1:        (may become a dead label)
            jc/jnc  .L2
        }
        if GetNextInstruction(hp1, hp2) and
          (hp2.typ = ait_label) and
          (tai_label(hp2).labsym = TAsmLabel(taicpu(p).oper[0]^.ref^.symbol)) and
          GetNextInstruction(hp2, hp3) and
          MatchInstruction(hp3, A_Jcc, []) and
          (
            (
              (taicpu(hp3).condition = C_C) and
              (taicpu(hp1).opcode = A_STC)
            ) or (
              (taicpu(hp3).condition = C_NC) and
              (taicpu(hp1).opcode = A_CLC)
            )
          ) and
          { Make sure the flags aren't used again }
          Assigned(FindRegDealloc(NR_DEFAULTFLAGS, tai(hp3.Next))) then
          begin
            taicpu(hp1).allocate_oper(1);
            taicpu(hp1).ops := 1;
            taicpu(hp1).loadsymbol(0, TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol), 0);
            taicpu(hp1).opcode := A_JMP;
            taicpu(hp1).is_jmp := True;

            TempBool := True; { Prevent compiler warnings }
            if DoJumpOptimizations(p, TempBool) then
              Result := True
            else
              Include(OptsToCheck, aoc_ForceNewIteration);
          end;
      end;
    function TX86AsmOptimizer.OptPass2STCCLC(var p: tai): Boolean;
      begin
        { This generally only executes under -O3 and above }
        Result := (aoc_DoPass2JccOpts in OptsToCheck) and OptPass1STCCLC(p);
      end;
    function TX86AsmOptimizer.OptPass2CMOVcc(var p: tai): Boolean;
      var
        hp1, hp2: tai;
        FoundComparison: Boolean;
      begin
        { Run the pass 1 optimisations as well, since they may have some effect
          after the CMOV blocks are created in OptPass2Jcc }
        Result := False;
{
        Result := OptPass1CMOVcc(p);
        if Result then
          Exit;
}
        { Sometimes, the CMOV optimisations in OptPass2Jcc are a bit overzealous
          and produce a slightly inefficient result on branching-type blocks,
          notably when setting a function result and then jumping to the function
          epilogue.  In this case, change:

            cmov(c) %reg1,%reg2
            j(c)    @lbl
            (%reg2 deallocated)

          To:
            mov     %reg1,%reg2
            j(c)    @lbl

          Note: we can't use GetNextInstructionUsingReg to find the conditional
          jump, because if it's not present, we may end up with a jump that's
          completely unrelated.
        }
        hp1 := p;
        while GetNextInstruction(hp1, hp1) and
          MatchInstruction(hp1, A_MOV, A_CMOVcc, []) do
          { loop };

        if (hp1.typ = ait_instruction) and
          (taicpu(hp1).opcode = A_Jcc) and
          condition_in(taicpu(hp1).condition, taicpu(p).condition) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegsBetween(TmpUsedRegs, p, hp1);

            if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) or
              (
                { See if we can find a more distant instruction that overwrites
                  the destination register }
                (cs_opt_level3 in current_settings.optimizerswitches) and
                GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
                RegLoadedWithNewValue(taicpu(p).oper[1]^.reg, hp2)
              ) then
              begin
                if (taicpu(p).oper[0]^.typ = top_reg) then
                  begin
                    { Search backwards to see if the source register is set to a
                      constant }
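                    { Illustrative sketch: given "movl $5,%ecx ... cmovne %ecx,%eax"
                      (about to become a MOV), the constant can be taken directly,
                      giving "movl $5,%eax", and the earlier MOV disappears if
                      %ecx is no longer used }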
                    FoundComparison := False;
                    hp1 := p;

                    while GetLastInstruction(hp1, hp1) and (hp1.typ = ait_instruction) do
                      begin
                        if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp1) then
                          begin
                            FoundComparison := True;
                            Continue;
                          end;

                        { Once we find the CMP, TEST or similar instruction, we
                          have to stop if we find anything other than a MOV }
                        if FoundComparison and (taicpu(hp1).opcode <> A_MOV) then
                          Break;

                        if RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
                          { Destination register was modified }
                          Break;

                        if (taicpu(hp1).opcode = A_MOV) and MatchOpType(taicpu(hp1), top_const, top_reg)
                          and (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) then
                          begin
                            { Found a constant! }
                            taicpu(p).loadconst(0, taicpu(hp1).oper[0]^.val);

                            if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
                              { The source register is no longer in use }
                              RemoveInstruction(hp1);

                            Break;
                          end;

                        if RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1) then
                          { Some other instruction has modified the source register }
                          Break;
                      end;
                  end;

                DebugMsg(SPeepholeOptimization + 'CMOVcc/Jcc -> MOV/Jcc since register is not used if not branching', p);
                taicpu(p).opcode := A_MOV;
                taicpu(p).condition := C_None;

                { Rely on the post peephole stage to put the MOV before the
                  CMP/TEST instruction that appears prior }
                Result := True;
                Exit;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

      function IsXCHGAcceptable: Boolean; inline;
        begin
          { Always accept if optimising for size }
          Result := (cs_opt_size in current_settings.optimizerswitches) or
            { From the Pentium M onwards, XCHG only has a latency of 2 rather
              than 3, so it becomes a saving compared to three MOVs with two of
              them able to execute simultaneously. [Kit] }
            (CPUX86_HINT_FAST_XCHG in cpu_optimization_hints[current_settings.optimizecputype]);
        end;

      var
        NewRef: TReference;
        hp1, hp2, hp3: Tai;
        {$ifndef x86_64}
        hp4: tai;
        OperIdx: Integer;
        {$endif x86_64}
        NewInstr : Taicpu;
        DestLabel: TAsmLabel;
        TempTracking: TAllUsedRegs;

      function TryMovArith2Lea(InputInstr: tai): Boolean;
        var
          NextInstr: tai;
          NextPresent: Boolean;
        begin
          Result := False;
          { be lazy, checking separately for sub would be slightly better }
          if (taicpu(InputInstr).oper[0]^.typ = top_const) and
            (abs(taicpu(InputInstr).oper[0]^.val)<=$7fffffff) then
            begin
              NextPresent := GetNextInstruction(InputInstr, NextInstr);
              if NextPresent then
                begin
                  { Try to avoid using TmpUsedRegs if possible (it's slow!) }
                  TransferUsedRegs(TmpUsedRegs);
                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  UpdateUsedRegs(TmpUsedRegs, tai(InputInstr.Next));
                end;

              if (
                not NextPresent or
                (
                  { The FLAGS register isn't always tracked properly, so do not
                    perform this optimisation if a conditional statement follows }
                  not RegReadByInstruction(NR_DEFAULTFLAGS, NextInstr) and
                  not RegUsedAfterInstruction(NR_DEFAULTFLAGS, NextInstr, TmpUsedRegs)
                )
              ) then
                begin
                  reference_reset(NewRef, 1, []);
                  NewRef.base := taicpu(p).oper[0]^.reg;
                  NewRef.scalefactor := 1;

                  if taicpu(InputInstr).opcode = A_ADD then
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
                      NewRef.offset := taicpu(InputInstr).oper[0]^.val;
                    end
                  else
                    begin
                      DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
                      NewRef.offset := -taicpu(InputInstr).oper[0]^.val;
                    end;

                  taicpu(p).opcode := A_LEA;
                  taicpu(p).loadref(0, NewRef);

                  { For the sake of debugging, have the line info match the
                    arithmetic instruction rather than the MOV instruction }
                  taicpu(p).fileinfo := taicpu(InputInstr).fileinfo;

                  RemoveInstruction(InputInstr);
                  Result := True;
                end;
            end;
        end;
      begin
        Result:=false;

        { This optimisation adds an instruction, so only do it for speed }
        if not (cs_opt_size in current_settings.optimizerswitches) and
          MatchOpType(taicpu(p), top_const, top_reg) and
          (taicpu(p).oper[0]^.val = 0) then
          begin
            { To avoid compiler warning }
            DestLabel := nil;

            if (p.typ <> ait_instruction) or (taicpu(p).oper[1]^.typ <> top_reg) then
              InternalError(2021040750);

            if not GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) then
              Exit;

            case hp1.typ of
              ait_label:
                begin
                  { Change:                              Change:
                      mov  $0,%reg                         mov  $0,%reg
                    @Lbl1:                               @Lbl1:
                      test %reg,%reg / cmp $0,%reg         test %reg,%reg / cmp $0,%reg
                      je   @Lbl2                           jne  @Lbl2
                    To:                                  To:
                      mov  $0,%reg                         mov  $0,%reg
                      jmp  @Lbl2                           jmp  @Lbl3
                      (align)                              (align)
                    @Lbl1:                               @Lbl1:
                      test %reg,%reg / cmp $0,%reg         test %reg,%reg / cmp $0,%reg
                      je   @Lbl2                           je   @Lbl2
                                                         @Lbl3: <-- Only if label exists
                    (Not if it's optimised for size)
                  }
                  if not GetNextInstruction(hp1, hp2) then
                    Exit;

                  if (hp2.typ = ait_instruction) and
                    (
                      { Register sizes must exactly match }
                      (
                        (taicpu(hp2).opcode = A_CMP) and
                        MatchOperand(taicpu(hp2).oper[0]^, 0) and
                        MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                      ) or (
                        (taicpu(hp2).opcode = A_TEST) and
                        MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
                        MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                      )
                    ) and GetNextInstruction(hp2, hp3) and
                    (hp3.typ = ait_instruction) and
                    (taicpu(hp3).opcode = A_JCC) and
                    (taicpu(hp3).oper[0]^.typ=top_ref) and (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and (taicpu(hp3).oper[0]^.ref^.base=NR_NO) and
                    (taicpu(hp3).oper[0]^.ref^.index=NR_NO) and (taicpu(hp3).oper[0]^.ref^.symbol is tasmlabel) then
                    begin
                      { Check condition of jump }

                      { Always true? }
                      if condition_in(C_E, taicpu(hp3).condition) then
                        begin
                          { Copy label symbol and obtain matching label entry for the
                            conditional jump, as this will be our destination }
                          DestLabel := tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol);
                          DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Je -> Mov0JmpLblCmp0Je', p);
                          Result := True;
                        end
                      { Always false? }
                      else if condition_in(C_NE, taicpu(hp3).condition) and GetNextInstruction(hp3, hp2) then
                        begin
                          { This is only worth it if there's a jump to take }
                          case hp2.typ of
                            ait_instruction:
                              begin
                                if taicpu(hp2).opcode = A_JMP then
                                  begin
                                    DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                                    { An unconditional jump follows the conditional jump which will always be false,
                                      so use this jump's destination for the new jump }
                                    DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with JMP)', p);
                                    Result := True;
                                  end
                                else if taicpu(hp2).opcode = A_JCC then
                                  begin
                                    DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                                    if condition_in(C_E, taicpu(hp2).condition) then
                                      begin
                                        { A second conditional jump follows the conditional jump which will always be false,
                                          while the second jump is always True, so use this jump's destination for the new jump }
                                        DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with second Jcc)', p);
                                        Result := True;
                                      end;
                                    { Don't risk it if the jump isn't always true (Result remains False) }
                                  end;
                              end;
                            else
                              { If anything else, don't optimise };
                          end;
                        end;

                      if Result then
                        begin
                          { Just so we have something to insert as a parameter }
                          reference_reset(NewRef, 1, []);
                          NewInstr := taicpu.op_ref(A_JMP, S_NO, NewRef);

                          { Now actually load the correct parameter (this also
                            increases the reference count) }
                          NewInstr.loadsymbol(0, DestLabel, 0);

                          if (cs_opt_level3 in current_settings.optimizerswitches) then
                            begin
                              { Get instruction before original label (may not be p under -O3) }
                              if not GetLastInstruction(hp1, hp2) then
                                { Shouldn't fail here }
                                InternalError(2021040701);
                            end
                          else
                            hp2 := p;

                          taicpu(NewInstr).fileinfo := taicpu(hp2).fileinfo;

                          AsmL.InsertAfter(NewInstr, hp2);
                          { Add new alignment field }
                          (* AsmL.InsertAfter(
                               cai_align.create_max(
                                 current_settings.alignment.jumpalign,
                                 current_settings.alignment.jumpalignskipmax
                               ),
                               NewInstr
                             ); *)
                        end;

                      Exit;
                    end;
                end;
              else
                ;
            end;
          end;
  9762. if not GetNextInstruction(p, hp1) then
  9763. Exit;
  9764. if MatchInstruction(hp1, A_CMP, A_TEST, []) then
  9765. begin
  9766. if (taicpu(hp1).opsize = taicpu(p).opsize) and DoMovCmpMemOpt(p, hp1) then
  9767. begin
  9768. Result := True;
  9769. Exit;
  9770. end;
  9771. { This optimisation is only effective on a second run of Pass 2,
  9772. hence -O3 or above.
  9773. Change:
  9774. mov %reg1,%reg2
  9775. cmp/test (contains %reg1)
  9776. mov x, %reg1
  9777. (another mov or a j(c))
  9778. To:
  9779. mov %reg1,%reg2
  9780. mov x, %reg1
  9781. cmp (%reg1 replaced with %reg2)
  9782. (another mov or a j(c))
  9783. The requirement of an additional MOV or a jump ensures there
  9784. isn't performance loss, since a j(c) will permit macro-fusion
  9785. with the cmp instruction, while another MOV likely means it's
  9786. not all being executed in a single cycle due to parallelisation.
  9787. }
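{ A concrete instance of the above (registers and label invented):
      movl %eax,%ebx              movl %eax,%ebx
      cmpl $5,(%eax)       -->    movl $0,%eax
      movl $0,%eax                cmpl $5,(%ebx)
      jne @Lbl                    jne @Lbl
  The CMP is moved below the interfering MOV, with %eax replaced by its
  copy %ebx, so the CMP/JNE pair becomes adjacent and can macro-fuse. }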
  9788. if (cs_opt_level3 in current_settings.optimizerswitches) and
  9789. MatchOpType(taicpu(p), top_reg, top_reg) and
  9790. RegInInstruction(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
  9791. GetNextInstruction(hp1, hp2) and
  9792. MatchInstruction(hp2, A_MOV, []) and
  9793. (taicpu(hp2).oper[1]^.typ = top_reg) and
  9794. { Registers don't have to be the same size in this case }
  9795. SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
  9796. GetNextInstruction(hp2, hp3) and
  9797. MatchInstruction(hp3, A_MOV, A_Jcc, []) and
{ Make sure the operands in the comparison can be safely replaced }
  9799. (
  9800. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^) or
  9801. ReplaceRegisterInOper(taicpu(hp1), 0, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
  9802. ) and
  9803. (
  9804. not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
  9805. ReplaceRegisterInOper(taicpu(hp1), 1, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
  9806. ) then
  9807. begin
  9808. DebugMsg(SPeepholeOptimization + 'MOV/CMP/MOV -> MOV/MOV/CMP', p);
  9809. AsmL.Remove(hp2);
  9810. AsmL.InsertAfter(hp2, p);
  9811. Result := True;
  9812. Exit;
  9813. end;
  9814. end;
  9815. if MatchInstruction(hp1, A_JMP, [S_NO]) then
  9816. begin
  9817. { Sometimes the MOVs that OptPass2JMP produces can be improved
  9818. further, but we can't just put this jump optimisation in pass 1
  9819. because it tends to perform worse when conditional jumps are
  9820. nearby (e.g. when converting CMOV instructions). [Kit] }
  9821. CopyUsedRegs(TempTracking);
  9822. UpdateUsedRegs(tai(p.Next));
  9823. if OptPass2JMP(hp1) then
  9824. begin
  9825. { Restore register state }
  9826. RestoreUsedRegs(TempTracking);
  9827. ReleaseUsedRegs(TempTracking);
  9828. { call OptPass1MOV once to potentially merge any MOVs that were created }
  9829. OptPass1MOV(p);
  9830. Result := True;
  9831. Exit;
  9832. end;
  9833. { If OptPass2JMP returned False, no optimisations were done to
  9834. the jump and there are no further optimisations that can be done
  9835. to the MOV instruction on this pass other than FuncMov2Func }
  9836. { Restore register state }
  9837. RestoreUsedRegs(TempTracking);
  9838. ReleaseUsedRegs(TempTracking);
  9839. Result := FuncMov2Func(p, hp1);
  9840. Exit;
  9841. end;
  9842. if MatchOpType(taicpu(p),top_reg,top_reg) and
  9843. (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
  9844. MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
  9845. (taicpu(hp1).oper[1]^.typ = top_reg) and
  9846. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  9847. begin
  9848. { Change:
  9849. movl/q %reg1,%reg2 movl/q %reg1,%reg2
  9850. addl/q $x,%reg2 subl/q $x,%reg2
  9851. To:
  9852. leal/q x(%reg1),%reg2 leal/q -x(%reg1),%reg2
  9853. }
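{ For example (illustrative):
      movq %rax,%rdx
      addq $8,%rdx         -->    leaq 8(%rax),%rdx
  LEA performs the copy and the addition in one instruction and leaves
  the flags untouched. }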
  9854. if TryMovArith2Lea(hp1) then
  9855. begin
  9856. Result := True;
  9857. Exit;
  9858. end
  9859. else if
{ Same as above, but it also adds to or subtracts from %reg2 in between.
It's still valid as long as the flags aren't in use }
  9862. (
  9863. (
  9864. MatchInstruction(hp1,A_ADD,A_SUB,A_LEA,[]) and
  9865. not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
  9866. ) or
  9867. (
  9868. not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, taicpu(hp1)) and
  9869. { If it's not modified, make sure it isn't read as is }
  9870. not RegReadByInstruction(taicpu(p).oper[1]^.reg, taicpu(hp1))
  9871. )
  9872. ) and
  9873. GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
  9874. MatchInstruction(hp2,A_ADD,A_SUB,[taicpu(p).opsize]) and
  9875. MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg) and
  9876. TryMovArith2Lea(hp2) then
  9877. begin
  9878. Result := True;
  9879. Exit;
  9880. end;
  9881. end;
  9882. if MatchOpType(taicpu(p),top_reg,top_reg) and
  9883. {$ifdef x86_64}
  9884. MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
  9885. {$else x86_64}
  9886. MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
  9887. {$endif x86_64}
  9888. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  9889. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
{ mov reg1, reg2              mov reg1, reg2
movzx/sx reg2, reg3    to    movzx/sx reg1, reg3 }
  9892. begin
  9893. taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
  9894. DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
{ Don't remove the MOV command without first checking that reg2 isn't used afterwards,
unless supreg(reg3) = supreg(reg2). [Kit] }
  9897. TransferUsedRegs(TmpUsedRegs);
  9898. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  9899. if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
  9900. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
  9901. then
  9902. begin
  9903. RemoveCurrentP(p, hp1);
  9904. Result:=true;
  9905. end;
  9906. Exit;
  9907. end;
  9908. if MatchOpType(taicpu(p),top_reg,top_reg) and
  9909. IsXCHGAcceptable and
  9910. { XCHG doesn't support 8-bit registers }
  9911. (taicpu(p).opsize <> S_B) and
  9912. MatchInstruction(hp1, A_MOV, []) and
  9913. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  9914. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
  9915. GetNextInstruction(hp1, hp2) and
  9916. MatchInstruction(hp2, A_MOV, []) and
  9917. { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
  9918. MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
  9919. MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
  9920. begin
  9921. { mov %reg1,%reg2
  9922. mov %reg3,%reg1 -> xchg %reg3,%reg1
  9923. mov %reg2,%reg3
  9924. (%reg2 not used afterwards)
Note that XCHG takes 3 cycles to execute while MOVs generally take
only one cycle apiece, but the first two MOVs can be executed in
parallel, taking only 2 cycles overall. On older processors this
should therefore only be done when optimising for size. [Kit]
  9929. }
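{ Concrete instance (registers invented for illustration):
      movl %eax,%ecx
      movl %edx,%eax       -->    xchgl %edx,%eax
      movl %ecx,%edx
  valid only when %ecx (the temporary copy) is not used afterwards. }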
  9930. TransferUsedRegs(TmpUsedRegs);
  9931. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  9932. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  9933. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
  9934. begin
  9935. DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
  9936. AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
  9937. taicpu(hp1).opcode := A_XCHG;
  9938. RemoveCurrentP(p, hp1);
  9939. RemoveInstruction(hp2);
  9940. Result := True;
  9941. Exit;
  9942. end;
  9943. end;
  9944. if MatchOpType(taicpu(p),top_reg,top_reg) and
  9945. MatchInstruction(hp1, A_SAR, []) then
  9946. begin
  9947. if MatchOperand(taicpu(hp1).oper[0]^, 31) then
  9948. begin
  9949. { the use of %edx also covers the opsize being S_L }
  9950. if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
  9951. begin
  9952. { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
  9953. if (taicpu(p).oper[0]^.reg = NR_EAX) and
  9954. (taicpu(p).oper[1]^.reg = NR_EDX) then
  9955. begin
  9956. { Change:
  9957. movl %eax,%edx
  9958. sarl $31,%edx
  9959. To:
  9960. cltd
  9961. }
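{ Why this works: SAR $31 fills the register with copies of the sign
  bit, and CDQ/CLTD sign-extends %eax into %edx, so both leave %edx
  equal to 0 (for non-negative %eax) or $FFFFFFFF (for negative %eax);
  e.g. %eax = -11 gives %edx = $FFFFFFFF either way. }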
  9962. DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
  9963. RemoveInstruction(hp1);
  9964. taicpu(p).opcode := A_CDQ;
  9965. taicpu(p).opsize := S_NO;
  9966. taicpu(p).clearop(1);
  9967. taicpu(p).clearop(0);
  9968. taicpu(p).ops:=0;
  9969. Result := True;
  9970. Exit;
  9971. end
  9972. else if (cs_opt_size in current_settings.optimizerswitches) and
  9973. (taicpu(p).oper[0]^.reg = NR_EDX) and
  9974. (taicpu(p).oper[1]^.reg = NR_EAX) then
  9975. begin
  9976. { Change:
  9977. movl %edx,%eax
  9978. sarl $31,%edx
  9979. To:
  9980. movl %edx,%eax
  9981. cltd
  9982. Note that this creates a dependency between the two instructions,
  9983. so only perform if optimising for size.
  9984. }
  9985. DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
  9986. taicpu(hp1).opcode := A_CDQ;
  9987. taicpu(hp1).opsize := S_NO;
  9988. taicpu(hp1).clearop(1);
  9989. taicpu(hp1).clearop(0);
  9990. taicpu(hp1).ops:=0;
  9991. Include(OptsToCheck, aoc_ForceNewIteration);
  9992. Exit;
  9993. end;
  9994. {$ifndef x86_64}
  9995. end
  9996. { Don't bother if CMOV is supported, because a more optimal
  9997. sequence would have been generated for the Abs() intrinsic }
  9998. else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
  9999. { the use of %eax also covers the opsize being S_L }
  10000. MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
  10001. (taicpu(p).oper[0]^.reg = NR_EAX) and
  10002. (taicpu(p).oper[1]^.reg = NR_EDX) and
  10003. GetNextInstruction(hp1, hp2) and
  10004. MatchInstruction(hp2, A_XOR, [S_L]) and
  10005. MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
  10006. MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
  10007. GetNextInstruction(hp2, hp3) and
  10008. MatchInstruction(hp3, A_SUB, [S_L]) and
  10009. MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
  10010. MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
  10011. begin
  10012. { Change:
  10013. movl %eax,%edx
  10014. sarl $31,%eax
  10015. xorl %eax,%edx
  10016. subl %eax,%edx
  10017. (Instruction that uses %edx)
  10018. (%eax deallocated)
  10019. (%edx deallocated)
  10020. To:
  10021. cltd
  10022. xorl %edx,%eax <-- Note the registers have swapped
  10023. subl %edx,%eax
  10024. (Instruction that uses %eax) <-- %eax rather than %edx
  10025. }
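{ This is the classic branchless absolute value: with mask = x sar 31,
  abs(x) = (x xor mask) - mask. Worked example (illustrative): for
  x = -5, mask = -1, so (x xor mask) - mask = 4 - (-1) = 5; for x = 3,
  mask = 0 and the value is unchanged. The rewrite merely swaps which
  register carries the mask and which carries the value. }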
  10026. TransferUsedRegs(TmpUsedRegs);
  10027. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  10028. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  10029. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  10030. if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
  10031. begin
  10032. if GetNextInstruction(hp3, hp4) and
  10033. not RegModifiedByInstruction(NR_EDX, hp4) and
  10034. not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
  10035. begin
  10036. DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
  10037. taicpu(p).opcode := A_CDQ;
  10038. taicpu(p).clearop(1);
  10039. taicpu(p).clearop(0);
  10040. taicpu(p).ops:=0;
  10041. RemoveInstruction(hp1);
  10042. taicpu(hp2).loadreg(0, NR_EDX);
  10043. taicpu(hp2).loadreg(1, NR_EAX);
  10044. taicpu(hp3).loadreg(0, NR_EDX);
  10045. taicpu(hp3).loadreg(1, NR_EAX);
  10046. AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
  10047. { Convert references in the following instruction (hp4) from %edx to %eax }
  10048. for OperIdx := 0 to taicpu(hp4).ops - 1 do
  10049. with taicpu(hp4).oper[OperIdx]^ do
  10050. case typ of
  10051. top_reg:
  10052. if getsupreg(reg) = RS_EDX then
  10053. reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
top_ref:
  begin
    { For a reference, the base and index registers must be checked separately }
    if getsupreg(ref^.base) = RS_EDX then
      ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.base));
    if getsupreg(ref^.index) = RS_EDX then
      ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.index));
  end;
  10061. else
  10062. ;
  10063. end;
  10064. Result := True;
  10065. Exit;
  10066. end;
  10067. end;
  10068. {$else x86_64}
  10069. end;
  10070. end
  10071. else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
  10072. { the use of %rdx also covers the opsize being S_Q }
  10073. MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
  10074. begin
  10075. { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
  10076. if (taicpu(p).oper[0]^.reg = NR_RAX) and
  10077. (taicpu(p).oper[1]^.reg = NR_RDX) then
  10078. begin
  10079. { Change:
  10080. movq %rax,%rdx
  10081. sarq $63,%rdx
  10082. To:
  10083. cqto
  10084. }
  10085. DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
  10086. RemoveInstruction(hp1);
  10087. taicpu(p).opcode := A_CQO;
  10088. taicpu(p).opsize := S_NO;
  10089. taicpu(p).clearop(1);
  10090. taicpu(p).clearop(0);
  10091. taicpu(p).ops:=0;
  10092. Result := True;
  10093. Exit;
  10094. end
  10095. else if (cs_opt_size in current_settings.optimizerswitches) and
  10096. (taicpu(p).oper[0]^.reg = NR_RDX) and
  10097. (taicpu(p).oper[1]^.reg = NR_RAX) then
  10098. begin
  10099. { Change:
  10100. movq %rdx,%rax
  10101. sarq $63,%rdx
  10102. To:
  10103. movq %rdx,%rax
  10104. cqto
  10105. Note that this creates a dependency between the two instructions,
  10106. so only perform if optimising for size.
  10107. }
  10108. DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
  10109. taicpu(hp1).opcode := A_CQO;
  10110. taicpu(hp1).opsize := S_NO;
  10111. taicpu(hp1).clearop(1);
  10112. taicpu(hp1).clearop(0);
  10113. taicpu(hp1).ops:=0;
  10114. Include(OptsToCheck, aoc_ForceNewIteration);
  10115. Exit;
  10116. {$endif x86_64}
  10117. end;
  10118. end;
  10119. end;
  10120. if MatchInstruction(hp1, A_MOV, []) and
  10121. (taicpu(hp1).oper[1]^.typ = top_reg) then
  10122. { Though "GetNextInstruction" could be factored out, along with
  10123. the instructions that depend on hp2, it is an expensive call that
  10124. should be delayed for as long as possible, hence we do cheaper
  10125. checks first that are likely to be False. [Kit] }
  10126. begin
  10127. if (
  10128. (
  10129. MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
  10130. (taicpu(hp1).oper[1]^.reg = NR_EAX) and
  10131. (
  10132. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  10133. MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
  10134. )
  10135. ) or
  10136. (
  10137. MatchOperand(taicpu(p).oper[1]^, NR_EAX) and
  10138. (taicpu(hp1).oper[1]^.reg = NR_EDX) and
  10139. (
  10140. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  10141. MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
  10142. )
  10143. )
  10144. ) and
  10145. GetNextInstruction(hp1, hp2) and
  10146. MatchInstruction(hp2, A_SAR, []) and
  10147. MatchOperand(taicpu(hp2).oper[0]^, 31) then
  10148. begin
  10149. if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
  10150. begin
  10151. { Change:
  10152. movl r/m,%edx movl r/m,%eax movl r/m,%edx movl r/m,%eax
  10153. movl %edx,%eax or movl %eax,%edx or movl r/m,%eax or movl r/m,%edx
  10154. sarl $31,%edx sarl $31,%edx sarl $31,%edx sarl $31,%edx
  10155. To:
  10156. movl r/m,%eax <- Note the change in register
  10157. cltd
  10158. }
  10159. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
  10160. AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
  10161. taicpu(p).loadreg(1, NR_EAX);
  10162. taicpu(hp1).opcode := A_CDQ;
  10163. taicpu(hp1).clearop(1);
  10164. taicpu(hp1).clearop(0);
  10165. taicpu(hp1).ops:=0;
  10166. RemoveInstruction(hp2);
  10167. Include(OptsToCheck, aoc_ForceNewIteration);
  10168. (*
  10169. {$ifdef x86_64}
  10170. end
  10171. else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
  10172. { This code sequence does not get generated - however it might become useful
  10173. if and when 128-bit signed integer types make an appearance, so the code
  10174. is kept here for when it is eventually needed. [Kit] }
  10175. (
  10176. (
  10177. (taicpu(hp1).oper[1]^.reg = NR_RAX) and
  10178. (
  10179. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  10180. MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
  10181. )
  10182. ) or
  10183. (
  10184. (taicpu(hp1).oper[1]^.reg = NR_RDX) and
  10185. (
  10186. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  10187. MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
  10188. )
  10189. )
  10190. ) and
  10191. GetNextInstruction(hp1, hp2) and
  10192. MatchInstruction(hp2, A_SAR, [S_Q]) and
  10193. MatchOperand(taicpu(hp2).oper[0]^, 63) and
  10194. MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
  10195. begin
  10196. { Change:
  10197. movq r/m,%rdx movq r/m,%rax movq r/m,%rdx movq r/m,%rax
  10198. movq %rdx,%rax or movq %rax,%rdx or movq r/m,%rax or movq r/m,%rdx
  10199. sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx
  10200. To:
  10201. movq r/m,%rax <- Note the change in register
  10202. cqto
  10203. }
  10204. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
  10205. AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
  10206. taicpu(p).loadreg(1, NR_RAX);
  10207. taicpu(hp1).opcode := A_CQO;
  10208. taicpu(hp1).clearop(1);
  10209. taicpu(hp1).clearop(0);
  10210. taicpu(hp1).ops:=0;
  10211. RemoveInstruction(hp2);
  10212. Include(OptsToCheck, aoc_ForceNewIteration);
  10213. {$endif x86_64}
  10214. *)
  10215. end;
  10216. end;
  10217. {$ifdef x86_64}
  10218. end;
  10219. if (taicpu(p).opsize = S_L) and
  10220. (taicpu(p).oper[1]^.typ = top_reg) and
  10221. (
  10222. MatchInstruction(hp1, A_MOV,[]) and
  10223. (taicpu(hp1).opsize = S_L) and
  10224. (taicpu(hp1).oper[1]^.typ = top_reg)
  10225. ) and (
  10226. GetNextInstruction(hp1, hp2) and
  10227. (tai(hp2).typ=ait_instruction) and
  10228. (taicpu(hp2).opsize = S_Q) and
  10229. (
  10230. (
  10231. MatchInstruction(hp2, A_ADD,[]) and
  10232. (taicpu(hp2).opsize = S_Q) and
  10233. (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  10234. (
  10235. (
  10236. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
  10237. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  10238. ) or (
  10239. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  10240. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  10241. )
  10242. )
  10243. ) or (
  10244. MatchInstruction(hp2, A_LEA,[]) and
  10245. (taicpu(hp2).oper[0]^.ref^.offset = 0) and
  10246. (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
  10247. (
  10248. (
  10249. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
  10250. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
  10251. ) or (
  10252. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  10253. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
  10254. )
  10255. ) and (
  10256. (
  10257. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  10258. ) or (
  10259. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  10260. )
  10261. )
  10262. )
  10263. )
  10264. ) and (
  10265. GetNextInstruction(hp2, hp3) and
  10266. MatchInstruction(hp3, A_SHR,[]) and
  10267. (taicpu(hp3).opsize = S_Q) and
  10268. (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  10269. (taicpu(hp3).oper[0]^.val = 1) and
  10270. (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
  10271. ) then
  10272. begin
  10273. { Change movl x, reg1d movl x, reg1d
  10274. movl y, reg2d movl y, reg2d
  10275. addq reg2q,reg1q or leaq (reg1q,reg2q),reg1q
  10276. shrq $1, reg1q shrq $1, reg1q
  10277. ( reg1d and reg2d can be switched around in the first two instructions )
  10278. To movl x, reg1d
  10279. addl y, reg1d
  10280. rcrl $1, reg1d
  10281. This corresponds to the common expression (x + y) shr 1, where
  10282. x and y are Cardinals (replacing "shr 1" with "div 2" produces
  10283. smaller code, but won't account for x + y causing an overflow). [Kit]
  10284. }
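{ Soundness sketch: ADDL sets the carry flag when x + y overflows
  32 bits, and RCRL $1 rotates that carry back in as the new top bit,
  so the full 33-bit sum is halved correctly. E.g. (illustrative)
  x = y = $80000000: addl gives 0 with CF = 1, and rcrl $1 then yields
  $80000000, which is indeed ($80000000 + $80000000) shr 1. }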
  10285. DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p);
  10286. if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
  10287. begin
  10288. { Change first MOV command to have the same register as the final output }
  10289. taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
  10290. AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
  10291. Result := True;
  10292. end
  10293. else
  10294. begin
  10295. taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
  10296. Include(OptsToCheck, aoc_ForceNewIteration);
  10297. end;
{ Change the second MOV command to an ADD command. This is easier than
converting the existing ADD/LEA because it means we don't have to
touch 'y', which might be a complicated reference, and it also covers
the fact that the third command might be either ADD or LEA. [Kit] }
  10302. taicpu(hp1).opcode := A_ADD;
  10303. { Delete old ADD/LEA instruction }
  10304. RemoveInstruction(hp2);
  10305. { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
  10306. taicpu(hp3).opcode := A_RCR;
  10307. taicpu(hp3).changeopsize(S_L);
  10308. setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{ No need to Exit yet: p is still a MOV and hasn't been removed,
so FuncMov2Func below is safe to call }
  10311. {$endif x86_64}
  10312. end;
  10313. {$ifdef x86_64}
  10314. { Note, this optimisation was moved from Pass 1 because the CMOV
  10315. optimisations in OptPass2Jcc fall foul of the loss of information
  10316. about the upper 32 bits of the target register. Fixes #41317. }
  10317. { Change:
  10318. movl %reg1l,%reg2l
  10319. movq %reg2q,%reg3q (%reg1 <> %reg3)
  10320. To:
  10321. movl %reg1l,%reg2l
  10322. movl %reg1l,%reg3l (Upper 32 bits of %reg3q will be zero)
  10323. }
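{ This rests on the x86-64 rule that a 32-bit register write zeroes
  bits 63..32 of the destination, so after movl %reg1l,%reg2l the
  copies movq %reg2q,%reg3q and movl %reg1l,%reg3l leave the same
  value in %reg3q, and the movl form drops the dependency on %reg2. }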
  10324. if MatchOpType(taicpu(p), top_reg, top_reg) and
  10325. (taicpu(p).opsize = S_L) then
  10326. begin
{ If the movq instruction is followed by addq or subq, it
might be possible to convert them to a leaq instruction, an
opportunity that would be lost if the movq were changed to a
movl first, so don't do this optimisation on a first iteration }
  10331. if not (aoc_MovlMovq2MovlMovl in OptsToCheck) and
  10332. not NotFirstIteration and
{ At -O2 and below, do the optimisation anyway because Pass 2
  10334. won't run more than once }
  10335. (cs_opt_level3 in current_settings.optimizerswitches) then
  10336. begin
  10337. { Flag that we need to run Pass 2 again }
  10338. Include(OptsToCheck, aoc_ForceNewIteration);
  10339. end
  10340. else
  10341. begin
  10342. TransferUsedRegs(TmpUsedRegs);
  10343. { Mark the start point for sequential calls to
  10344. GetNextInstructionUsingReg, RegModifiedBetween and
  10345. UpdateUsedRegsBetween in case this optimisation is run multiple
  10346. times }
  10347. hp2 := p;
  10348. repeat
  10349. if (
  10350. not(cs_opt_level3 in current_settings.optimizerswitches) or
  10351. { Look further ahead for this one }
  10352. GetNextInstructionUsingReg(hp2, hp1, taicpu(p).oper[1]^.reg)
  10353. ) and
  10354. MatchInstruction(hp1,A_MOV,[S_Q]) and
  10355. not RegModifiedBetween(taicpu(p).oper[0]^.reg, hp2, hp1) and
  10356. MatchOpType(taicpu(hp1), top_reg, top_reg) and
  10357. SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^.reg) then
  10358. begin
  10359. UpdateUsedRegsBetween(TmpUsedRegs, tai(hp2.Next), hp1);
  10360. taicpu(hp1).opsize := S_L;
  10361. taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
  10362. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
  10363. AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, TmpUsedRegs);
  10364. DebugMsg(SPeepholeOptimization + 'Made 32-to-64-bit zero extension more efficient (MovlMovq2MovlMovl 1)', hp1);
  10365. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
  10366. begin
  10367. DebugMsg(SPeepholeOptimization + 'Mov2Nop 8 done', p);
  10368. RemoveCurrentP(p);
  10369. Result := True;
  10370. Exit;
  10371. end;
  10372. { Initial instruction wasn't actually changed }
  10373. Include(OptsToCheck, aoc_ForceNewIteration);
  10374. if (cs_opt_level3 in current_settings.optimizerswitches) then
  10375. begin
  10376. { GetNextInstructionUsingReg will return a different
  10377. instruction, so check this optimisation again }
  10378. { Update the start point for the next calls to
  10379. GetNextInstructionUsingReg, RegModifiedBetween and
  10380. UpdateUsedRegsBetween to grant a speed boost }
  10381. hp2 := hp1;
  10382. Continue; { Jump back to "repeat" }
  10383. end;
  10384. end;
  10385. Break;
  10386. until False;
  10387. end;
  10388. end;
  10389. {$endif x86_64}
  10390. if FuncMov2Func(p, hp1) then
  10391. begin
  10392. Result := True;
  10393. Exit;
  10394. end;
  10395. end;
  10396. {$push}
  10397. {$q-}{$r-}
  10398. function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean;
  10399. var
  10400. ThisReg: TRegister;
  10401. MinSize, MaxSize, TryShiftDown, TargetSize: TOpSize;
  10402. TargetSubReg: TSubRegister;
  10403. hp1, hp2: tai;
  10404. RegInUse, RegChanged, p_removed, hp1_removed: Boolean;
  10405. { Store list of found instructions so we don't have to call
  10406. GetNextInstructionUsingReg multiple times }
  10407. InstrList: array of taicpu;
  10408. InstrMax, Index: Integer;
  10409. UpperLimit, SignedUpperLimit, SignedUpperLimitBottom,
  10410. LowerLimit, SignedLowerLimit, SignedLowerLimitBottom,
  10411. TryShiftDownLimit, TryShiftDownSignedLimit, TryShiftDownSignedLimitLower,
  10412. WorkingValue: TCgInt;
  10413. PreMessage: string;
  10414. { Data flow analysis }
  10415. TestValMin, TestValMax, TestValSignedMax: TCgInt;
  10416. BitwiseOnly, OrXorUsed,
  10417. ShiftDownOverflow, UpperSignedOverflow, UpperUnsignedOverflow, LowerSignedOverflow, LowerUnsignedOverflow: Boolean;
  10418. function CheckOverflowConditions: Boolean;
  10419. begin
  10420. Result := True;
  10421. if (TestValSignedMax > SignedUpperLimit) then
  10422. UpperSignedOverflow := True;
  10423. if (TestValSignedMax > SignedLowerLimit) or (TestValSignedMax < SignedLowerLimitBottom) then
  10424. LowerSignedOverflow := True;
  10425. if (TestValMin > LowerLimit) or (TestValMax > LowerLimit) then
  10426. LowerUnsignedOverflow := True;
  10427. if (TestValMin > UpperLimit) or (TestValMax > UpperLimit) or (TestValSignedMax > UpperLimit) or
  10428. (TestValMin < SignedUpperLimitBottom) or (TestValMax < SignedUpperLimitBottom) or (TestValSignedMax < SignedUpperLimitBottom) then
  10429. begin
  10430. { Absolute overflow }
  10431. Result := False;
  10432. Exit;
  10433. end;
  10434. if not ShiftDownOverflow and (TryShiftDown <> S_NO) and
  10435. ((TestValMin > TryShiftDownLimit) or (TestValMax > TryShiftDownLimit)) then
  10436. ShiftDownOverflow := True;
  10437. if (TestValMin < 0) or (TestValMax < 0) then
  10438. begin
  10439. LowerUnsignedOverflow := True;
  10440. UpperUnsignedOverflow := True;
  10441. end;
  10442. end;
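{ A rough summary of the data-flow scheme used below (descriptive only):
  the candidate instruction chain is executed symbolically on probe
  values covering the input range: TestValMin = 0, TestValMax = the
  unsigned maximum and TestValSignedMax = the signed maximum of the
  source size (for movzbl: 0, $FF and $7F). If no probe ever escapes
  the range of a smaller operand size, the chain can be narrowed. }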
  10443. function AdjustInitialLoadAndSize: Boolean;
  10444. begin
  10445. Result := False;
  10446. if not p_removed then
  10447. begin
  10448. if TargetSize = MinSize then
  10449. begin
  10450. { Convert the input MOVZX to a MOV }
  10451. if (taicpu(p).oper[0]^.typ = top_reg) and
  10452. SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
  10453. begin
  10454. { Or remove it completely! }
  10455. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1', p);
  10456. RemoveCurrentP(p);
  10457. p_removed := True;
  10458. end
  10459. else
  10460. begin
  10461. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1', p);
  10462. taicpu(p).opcode := A_MOV;
  10463. taicpu(p).oper[1]^.reg := ThisReg;
  10464. taicpu(p).opsize := TargetSize;
  10465. end;
  10466. Result := True;
  10467. end
  10468. else if TargetSize <> MaxSize then
  10469. begin
  10470. case MaxSize of
  10471. S_L:
  10472. if TargetSize = S_W then
  10473. begin
  10474. DebugMsg(SPeepholeOptimization + 'movzbl2movzbw', p);
  10475. taicpu(p).opsize := S_BW;
  10476. taicpu(p).oper[1]^.reg := ThisReg;
  10477. Result := True;
  10478. end
  10479. else
  10480. InternalError(2020112341);
  10481. S_W:
  10482. if TargetSize = S_L then
  10483. begin
  10484. DebugMsg(SPeepholeOptimization + 'movzbw2movzbl', p);
  10485. taicpu(p).opsize := S_BL;
  10486. taicpu(p).oper[1]^.reg := ThisReg;
  10487. Result := True;
  10488. end
  10489. else
  10490. InternalError(2020112342);
  10491. else
  10492. ;
  10493. end;
  10494. end
  10495. else if not hp1_removed and not RegInUse then
  10496. begin
  10497. { If we have something like:
  10498. movzbl (oper),%regd
  10499. add x, %regd
  10500. movzbl %regb, %regd
  10501. We can reduce the register size to the input of the final
  10502. movzbl instruction. Overflows won't have any effect.
  10503. }
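{ e.g. (illustrative, assuming %al is the input of the final MOVZX):
      movzbl (%rsi),%eax          movb (%rsi),%al
      addl $3,%eax         -->    addb $3,%al
      movzbl %al,%eax             movzbl %al,%eax
  The intermediate arithmetic is confined to the low byte, so the
  leading zero-extension is redundant and becomes a plain byte load. }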
  10504. if (taicpu(p).opsize in [S_BW, S_BL]) and
  10505. (taicpu(hp1).opsize in [S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}]) then
  10506. begin
  10507. TargetSize := S_B;
  10508. setsubreg(ThisReg, R_SUBL);
  10509. Result := True;
  10510. end
  10511. else if (taicpu(p).opsize = S_WL) and
(taicpu(hp1).opsize in [S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}]) then
  10513. begin
  10514. TargetSize := S_W;
  10515. setsubreg(ThisReg, R_SUBW);
  10516. Result := True;
  10517. end;
  10518. if Result then
  10519. begin
  10520. { Convert the input MOVZX to a MOV }
  10521. if (taicpu(p).oper[0]^.typ = top_reg) and
  10522. SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
  10523. begin
  10524. { Or remove it completely! }
  10525. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1a', p);
  10526. RemoveCurrentP(p);
  10527. p_removed := True;
  10528. end
  10529. else
  10530. begin
  10531. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1a', p);
  10532. taicpu(p).opcode := A_MOV;
  10533. taicpu(p).oper[1]^.reg := ThisReg;
  10534. taicpu(p).opsize := TargetSize;
  10535. end;
  10536. end;
  10537. end;
  10538. end;
  10539. end;
  10540. procedure AdjustFinalLoad;
  10541. begin
  10542. if not LowerUnsignedOverflow then
  10543. begin
  10544. if ((TargetSize = S_L) and (taicpu(hp1).opsize in [S_L, S_BL, S_WL])) or
  10545. ((TargetSize = S_W) and (taicpu(hp1).opsize in [S_W, S_BW])) then
  10546. begin
  10547. { Convert the output MOVZX to a MOV }
  10548. if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
  10549. begin
{ Make sure the zero-extension covers at least the minimum size (fixes i40003) }
  10551. if (MinSize = S_B) or
  10552. (not ShiftDownOverflow and (TryShiftDown = S_B)) or
  10553. ((MinSize = S_W) and (taicpu(hp1).opsize = S_WL)) then
  10554. begin
  10555. { Remove it completely! }
  10556. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2', hp1);
  10557. { Be careful; if p = hp1 and p was also removed, p
  10558. will become a dangling pointer }
  10559. if p = hp1 then
  10560. begin
  10561. RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
  10562. p_removed := True;
  10563. end
  10564. else
  10565. RemoveInstruction(hp1);
  10566. hp1_removed := True;
  10567. end;
  10568. end
  10569. else
  10570. begin
  10571. DebugMsg(SPeepholeOptimization + 'Movzx2Mov 2', hp1);
  10572. taicpu(hp1).opcode := A_MOV;
  10573. taicpu(hp1).oper[0]^.reg := ThisReg;
  10574. taicpu(hp1).opsize := TargetSize;
  10575. end;
  10576. end
  10577. else if (TargetSize = S_B) and (MaxSize = S_W) and (taicpu(hp1).opsize = S_WL) then
  10578. begin
  10579. { Need to change the size of the output }
  10580. DebugMsg(SPeepholeOptimization + 'movzwl2movzbl 2', hp1);
  10581. taicpu(hp1).oper[0]^.reg := ThisReg;
  10582. taicpu(hp1).opsize := S_BL;
  10583. end;
  10584. end;
  10585. end;
  10586. function CompressInstructions: Boolean;
  10587. var
  10588. LocalIndex: Integer;
  10589. begin
  10590. Result := False;
  10591. { The objective here is to try to find a combination that
  10592. removes one of the MOV/Z instructions. }
  10593. if (
  10594. (taicpu(p).oper[0]^.typ <> top_reg) or
  10595. not SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg)
  10596. ) and
  10597. (taicpu(hp1).oper[1]^.typ = top_reg) and
  10598. SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
  10599. begin
{ Prefer to remove the second MOVZX instruction }
  10601. case taicpu(hp1).opsize of
  10602. S_BL, S_WL:
  10603. begin
  10604. TargetSize := S_L;
  10605. TargetSubReg := R_SUBD;
  10606. end;
  10607. S_BW:
  10608. begin
  10609. TargetSize := S_W;
  10610. TargetSubReg := R_SUBW;
  10611. end;
  10612. else
  10613. InternalError(2020112302);
  10614. end;
  10615. end
  10616. else
  10617. begin
  10618. if LowerUnsignedOverflow and not UpperUnsignedOverflow then
  10619. begin
  10620. { Exceeded lower bound but not upper bound }
  10621. TargetSize := MaxSize;
  10622. end
  10623. else if not LowerUnsignedOverflow then
  10624. begin
  10625. { Size didn't exceed lower bound }
  10626. TargetSize := MinSize;
  10627. end
  10628. else
  10629. Exit;
  10630. end;
  10631. case TargetSize of
  10632. S_B:
  10633. TargetSubReg := R_SUBL;
  10634. S_W:
  10635. TargetSubReg := R_SUBW;
  10636. S_L:
  10637. TargetSubReg := R_SUBD;
  10638. else
  10639. InternalError(2020112350);
  10640. end;
  10641. { Update the register to its new size }
  10642. setsubreg(ThisReg, TargetSubReg);
  10643. RegInUse := False;
  10644. if not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
  10645. begin
  10646. { Check to see if the active register is used afterwards;
  10647. if not, we can change it and make a saving. }
  10648. TransferUsedRegs(TmpUsedRegs);
  10649. { The target register may be marked as in use to cross
  10650. a jump to a distant label, so exclude it }
  10651. ExcludeRegFromUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs);
  10652. hp2 := p;
  10653. repeat
{ Explicitly check for the excluded register (don't include the first
instruction, as it may be reading from here) }
  10656. if ((p <> hp2) and (RegInInstruction(taicpu(hp1).oper[1]^.reg, hp2))) or
  10657. RegInUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs) then
  10658. begin
  10659. RegInUse := True;
  10660. Break;
  10661. end;
  10662. UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
  10663. if not GetNextInstruction(hp2, hp2) then
  10664. InternalError(2020112340);
  10665. until (hp2 = hp1);
  10666. if not RegInUse and RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
  10667. { We might still be able to get away with this }
  10668. RegInUse := not
  10669. (
  10670. GetNextInstructionUsingReg(hp1, hp2, ThisReg) and
  10671. (hp2.typ = ait_instruction) and
  10672. (
  10673. { Under -O1 and -O2, GetNextInstructionUsingReg may return an
  10674. instruction that doesn't actually contain ThisReg }
  10675. (cs_opt_level3 in current_settings.optimizerswitches) or
  10676. RegInInstruction(ThisReg, hp2)
  10677. ) and
  10678. RegLoadedWithNewValue(ThisReg, hp2)
  10679. );
  10680. if not RegInUse then
  10681. begin
{ Force the register size to match this instruction's so it can be removed }
  10683. if (taicpu(hp1).opsize in [S_L, S_BL, S_WL]) then
  10684. begin
  10685. TargetSize := S_L;
  10686. TargetSubReg := R_SUBD;
  10687. end
  10688. else if (taicpu(hp1).opsize in [S_W, S_BW]) then
  10689. begin
  10690. TargetSize := S_W;
  10691. TargetSubReg := R_SUBW;
  10692. end;
  10693. ThisReg := taicpu(hp1).oper[1]^.reg;
  10694. setsubreg(ThisReg, TargetSubReg);
  10695. RegChanged := True;
  10696. DebugMsg(SPeepholeOptimization + 'Simplified register usage so ' + debug_regname(ThisReg) + ' = ' + debug_regname(taicpu(p).oper[1]^.reg), p);
  10697. TransferUsedRegs(TmpUsedRegs);
  10698. AllocRegBetween(ThisReg, p, hp1, TmpUsedRegs);
  10699. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 3', hp1);
  10700. if p = hp1 then
  10701. begin
  10702. RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
  10703. p_removed := True;
  10704. end
  10705. else
  10706. RemoveInstruction(hp1);
  10707. hp1_removed := True;
  10708. { Instruction will become "mov %reg,%reg" }
  10709. if not p_removed and (taicpu(p).opcode = A_MOV) and
  10710. MatchOperand(taicpu(p).oper[0]^, ThisReg) then
  10711. begin
  10712. DebugMsg(SPeepholeOptimization + 'Movzx2Nop 6', p);
  10713. RemoveCurrentP(p);
  10714. p_removed := True;
  10715. end
  10716. else
  10717. taicpu(p).oper[1]^.reg := ThisReg;
  10718. Result := True;
  10719. end
  10720. else
  10721. begin
  10722. if TargetSize <> MaxSize then
  10723. begin
  10724. { Since the register is in use, we have to force it to
  10725. MaxSize otherwise part of it may become undefined later on }
  10726. TargetSize := MaxSize;
  10727. case TargetSize of
  10728. S_B:
  10729. TargetSubReg := R_SUBL;
  10730. S_W:
  10731. TargetSubReg := R_SUBW;
  10732. S_L:
  10733. TargetSubReg := R_SUBD;
  10734. else
  10735. InternalError(2020112351);
  10736. end;
  10737. setsubreg(ThisReg, TargetSubReg);
  10738. end;
  10739. AdjustFinalLoad;
  10740. end;
  10741. end
  10742. else
  10743. AdjustFinalLoad;
  10744. Result := AdjustInitialLoadAndSize or Result;
  10745. { Now go through every instruction we found and change the
  10746. size. If TargetSize = MaxSize, then almost no changes are
  10747. needed and Result can remain False if it hasn't been set
  10748. yet.
  10749. If RegChanged is True, then the register requires changing
  10750. and so the point about TargetSize = MaxSize doesn't apply. }
  10751. if ((TargetSize <> MaxSize) or RegChanged) and (InstrMax >= 0) then
  10752. begin
  10753. for LocalIndex := 0 to InstrMax do
  10754. begin
  10755. { If p_removed is true, then the original MOV/Z was removed
  10756. and removing the AND instruction may not be safe if it
  10757. appears first }
  10758. if (InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.typ <> top_reg) then
  10759. InternalError(2020112310);
  10760. if InstrList[LocalIndex].oper[0]^.typ = top_reg then
  10761. InstrList[LocalIndex].oper[0]^.reg := ThisReg;
  10762. InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.reg := ThisReg;
  10763. InstrList[LocalIndex].opsize := TargetSize;
  10764. end;
  10765. Result := True;
  10766. end;
  10767. end;
  10768. begin
  10769. Result := False;
  10770. p_removed := False;
  10771. hp1_removed := False;
  10772. ThisReg := taicpu(p).oper[1]^.reg;
  10773. { Check for:
  10774. movs/z ###,%ecx (or %cx or %rcx)
  10775. ...
  10776. shl/shr/sar/rcl/rcr/ror/rol %cl,###
  10777. (dealloc %ecx)
  10778. Change to:
  10779. mov ###,%cl (if ### = %cl, then remove completely)
  10780. ...
  10781. shl/shr/sar/rcl/rcr/ror/rol %cl,###
  10782. }
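{ Concrete instance (registers invented for illustration):
      movzbl %al,%ecx             movb %al,%cl
      ...                  -->    ...
      shrl %cl,%ebx               shrl %cl,%ebx
  The shift only ever reads %cl, so the zero-extension into the full
  register is wasted work provided %ecx isn't used afterwards. }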
  10783. if (getsupreg(ThisReg) = RS_ECX) and
  10784. GetNextInstructionUsingReg(p, hp1, NR_ECX) and
  10785. (hp1.typ = ait_instruction) and
  10786. (
  10787. { Under -O1 and -O2, GetNextInstructionUsingReg may return an
  10788. instruction that doesn't actually contain ECX }
  10789. (cs_opt_level3 in current_settings.optimizerswitches) or
  10790. RegInInstruction(NR_ECX, hp1) or
  10791. (
  10792. { It's common for the shift/rotate's read/write register to be
initialised in between, so at -O2 and below, search ahead
  10794. one more instruction
  10795. }
  10796. GetNextInstruction(hp1, hp1) and
  10797. (hp1.typ = ait_instruction) and
  10798. RegInInstruction(NR_ECX, hp1)
  10799. )
  10800. ) and
  10801. MatchInstruction(hp1, [A_SHL, A_SHR, A_SAR, A_ROR, A_ROL, A_RCR, A_RCL], []) and
  10802. (taicpu(hp1).oper[0]^.typ = top_reg) { This is enough to determine that it's %cl } then
  10803. begin
  10804. TransferUsedRegs(TmpUsedRegs);
  10805. hp2 := p;
  10806. repeat
  10807. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  10808. until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
  10809. if not RegUsedAfterInstruction(NR_CL, hp1, TmpUsedRegs) then
  10810. begin
  10811. case taicpu(p).opsize of
  10812. S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
  10813. if MatchOperand(taicpu(p).oper[0]^, NR_CL) then
  10814. begin
  10815. DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3a', p);
  10816. RemoveCurrentP(p);
  10817. end
  10818. else
  10819. begin
  10820. taicpu(p).opcode := A_MOV;
  10821. taicpu(p).opsize := S_B;
  10822. taicpu(p).oper[1]^.reg := NR_CL;
  10823. DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 1', p);
  10824. end;
  10825. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  10826. if MatchOperand(taicpu(p).oper[0]^, NR_CX) then
  10827. begin
  10828. DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3b', p);
  10829. RemoveCurrentP(p);
  10830. end
  10831. else
  10832. begin
  10833. taicpu(p).opcode := A_MOV;
  10834. taicpu(p).opsize := S_W;
  10835. taicpu(p).oper[1]^.reg := NR_CX;
  10836. DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 2', p);
  10837. end;
  10838. {$ifdef x86_64}
  10839. S_LQ:
  10840. if MatchOperand(taicpu(p).oper[0]^, NR_ECX) then
  10841. begin
  10842. DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3c', p);
  10843. RemoveCurrentP(p);
  10844. end
  10845. else
  10846. begin
  10847. taicpu(p).opcode := A_MOV;
  10848. taicpu(p).opsize := S_L;
  10849. taicpu(p).oper[1]^.reg := NR_ECX;
  10850. DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 3', p);
  10851. end;
  10852. {$endif x86_64}
  10853. else
  10854. InternalError(2021120401);
  10855. end;
  10856. Result := True;
  10857. Exit;
  10858. end;
  10859. end;
  10860. { This is anything but quick! }
  10861. if not(cs_opt_level2 in current_settings.optimizerswitches) then
  10862. Exit;
  10863. SetLength(InstrList, 0);
  10864. InstrMax := -1;
  10865. case taicpu(p).opsize of
  10866. S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
  10867. begin
  10868. {$if defined(i386) or defined(i8086)}
  10869. { If the target size is 8-bit, make sure we can actually encode it }
  10870. if not (GetSupReg(ThisReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
  10871. Exit;
  10872. {$endif i386 or i8086}
  10873. LowerLimit := $FF;
  10874. SignedLowerLimit := $7F;
  10875. SignedLowerLimitBottom := -128;
  10876. MinSize := S_B;
  10877. if taicpu(p).opsize = S_BW then
  10878. begin
  10879. MaxSize := S_W;
  10880. UpperLimit := $FFFF;
  10881. SignedUpperLimit := $7FFF;
  10882. SignedUpperLimitBottom := -32768;
  10883. end
  10884. else
  10885. begin
  10886. { Keep at a 32-bit limit for BQ as well since one can't really optimise otherwise }
  10887. MaxSize := S_L;
  10888. UpperLimit := $FFFFFFFF;
  10889. SignedUpperLimit := $7FFFFFFF;
  10890. SignedUpperLimitBottom := -2147483648;
  10891. end;
  10892. end;
  10893. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  10894. begin
  10895. { Keep at a 32-bit limit for WQ as well since one can't really optimise otherwise }
  10896. LowerLimit := $FFFF;
  10897. SignedLowerLimit := $7FFF;
  10898. SignedLowerLimitBottom := -32768;
  10899. UpperLimit := $FFFFFFFF;
  10900. SignedUpperLimit := $7FFFFFFF;
  10901. SignedUpperLimitBottom := -2147483648;
  10902. MinSize := S_W;
  10903. MaxSize := S_L;
  10904. end;
  10905. {$ifdef x86_64}
  10906. S_LQ:
  10907. begin
  10908. { Both the lower and upper limits are set to 32-bit. If a limit
  10909. is breached, then optimisation is impossible }
  10910. LowerLimit := $FFFFFFFF;
  10911. SignedLowerLimit := $7FFFFFFF;
  10912. SignedLowerLimitBottom := -2147483648;
  10913. UpperLimit := $FFFFFFFF;
  10914. SignedUpperLimit := $7FFFFFFF;
  10915. SignedUpperLimitBottom := -2147483648;
  10916. MinSize := S_L;
  10917. MaxSize := S_L;
  10918. end;
  10919. {$endif x86_64}
  10920. else
  10921. InternalError(2020112301);
  10922. end;
  10923. TestValMin := 0;
  10924. TestValMax := LowerLimit;
  10925. TestValSignedMax := SignedLowerLimit;
  10926. TryShiftDownLimit := LowerLimit;
  10927. TryShiftDown := S_NO;
  10928. ShiftDownOverflow := False;
  10929. RegChanged := False;
  10930. BitwiseOnly := True;
  10931. OrXorUsed := False;
  10932. UpperSignedOverflow := False;
  10933. LowerSignedOverflow := False;
  10934. UpperUnsignedOverflow := False;
  10935. LowerUnsignedOverflow := False;
  10936. hp1 := p;
  10937. while GetNextInstructionUsingReg(hp1, hp1, ThisReg) and
  10938. (hp1.typ = ait_instruction) and
  10939. (
  10940. { Under -O1 and -O2, GetNextInstructionUsingReg may return an
  10941. instruction that doesn't actually contain ThisReg }
  10942. (cs_opt_level3 in current_settings.optimizerswitches) or
  10943. { This allows this Movx optimisation to work through the SETcc instructions
  10944. inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
  10945. optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
  10946. skip over these SETcc instructions). }
  10947. (taicpu(hp1).opcode = A_SETcc) or
  10948. RegInInstruction(ThisReg, hp1)
  10949. ) do
  10950. begin
  10951. case taicpu(hp1).opcode of
  10952. A_INC,A_DEC:
  10953. begin
  10954. { Has to be an exact match on the register }
  10955. if not MatchOperand(taicpu(hp1).oper[0]^, ThisReg) then
  10956. Break;
  10957. if taicpu(hp1).opcode = A_INC then
  10958. begin
  10959. Inc(TestValMin);
  10960. Inc(TestValMax);
  10961. Inc(TestValSignedMax);
  10962. end
  10963. else
  10964. begin
  10965. Dec(TestValMin);
  10966. Dec(TestValMax);
  10967. Dec(TestValSignedMax);
  10968. end;
  10969. end;
  10970. A_TEST, A_CMP:
  10971. begin
  10972. if (
  10973. { Too high a risk of non-linear behaviour that breaks DFA
  10974. here, unless it's cmp $0,%reg, which is equivalent to
  10975. test %reg,%reg }
  10976. OrXorUsed and
  10977. (taicpu(hp1).opcode = A_CMP) and
  10978. not Matchoperand(taicpu(hp1).oper[0]^, 0)
  10979. ) or
  10980. (taicpu(hp1).oper[1]^.typ <> top_reg) or
  10981. { Has to be an exact match on the register }
  10982. (taicpu(hp1).oper[1]^.reg <> ThisReg) or
  10983. (
  10984. { Permit "test %reg,%reg" }
  10985. (taicpu(hp1).opcode = A_TEST) and
  10986. (taicpu(hp1).oper[0]^.typ = top_reg) and
  10987. (taicpu(hp1).oper[0]^.reg <> ThisReg)
  10988. ) or
  10989. (taicpu(hp1).oper[0]^.typ <> top_const) or
  10990. { Make sure the comparison value is not smaller than the
  10991. smallest allowed signed value for the minimum size (e.g.
  10992. -128 for 8-bit) }
  10993. not (
  10994. ((taicpu(hp1).oper[0]^.val and LowerLimit) = taicpu(hp1).oper[0]^.val) or
  10995. { Is it in the negative range? }
  10996. (
  10997. (taicpu(hp1).oper[0]^.val < 0) and
  10998. (taicpu(hp1).oper[0]^.val >= SignedLowerLimitBottom)
  10999. )
  11000. ) then
  11001. Break;
  11002. { Check to see if the active register is used afterwards }
  11003. TransferUsedRegs(TmpUsedRegs);
  11004. IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
  11005. if not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
  11006. begin
{ Make sure that neither the comparison nor any previous
instructions have pushed the test values outside the range of
MinSize }
  11010. if LowerUnsignedOverflow and not UpperUnsignedOverflow then
  11011. begin
  11012. { Exceeded lower bound but not upper bound }
  11013. Exit;
  11014. end
  11015. else if not LowerSignedOverflow or not LowerUnsignedOverflow then
  11016. begin
  11017. { Size didn't exceed lower bound }
  11018. TargetSize := MinSize;
  11019. end
  11020. else
  11021. Break;
  11022. case TargetSize of
  11023. S_B:
  11024. TargetSubReg := R_SUBL;
  11025. S_W:
  11026. TargetSubReg := R_SUBW;
  11027. S_L:
  11028. TargetSubReg := R_SUBD;
  11029. else
  11030. InternalError(2021051002);
  11031. end;
  11032. if TargetSize <> MaxSize then
  11033. begin
  11034. { Update the register to its new size }
  11035. setsubreg(ThisReg, TargetSubReg);
  11036. DebugMsg(SPeepholeOptimization + 'CMP instruction resized thanks to register size optimisation (see MOV/Z assignment above)', hp1);
  11037. taicpu(hp1).oper[1]^.reg := ThisReg;
  11038. taicpu(hp1).opsize := TargetSize;
  11039. { Convert the input MOVZX to a MOV if necessary }
  11040. AdjustInitialLoadAndSize;
  11041. if (InstrMax >= 0) then
  11042. begin
  11043. for Index := 0 to InstrMax do
  11044. begin
  11045. { If p_removed is true, then the original MOV/Z was removed
  11046. and removing the AND instruction may not be safe if it
  11047. appears first }
  11048. if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
  11049. InternalError(2020112311);
  11050. if InstrList[Index].oper[0]^.typ = top_reg then
  11051. InstrList[Index].oper[0]^.reg := ThisReg;
  11052. InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
  11053. InstrList[Index].opsize := MinSize;
  11054. end;
  11055. end;
  11056. Result := True;
  11057. end;
  11058. Exit;
  11059. end;
  11060. end;
  11061. A_SETcc:
  11062. begin
  11063. { This allows this Movx optimisation to work through the SETcc instructions
  11064. inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
  11065. optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
  11066. skip over these SETcc instructions). }
  11067. if (cs_opt_level3 in current_settings.optimizerswitches) or
  11068. { Of course, break out if the current register is used }
  11069. RegInOp(ThisReg, taicpu(hp1).oper[0]^) then
  11070. Break
  11071. else
  11072. { We must use Continue so the instruction doesn't get added
  11073. to InstrList }
  11074. Continue;
  11075. end;
  11076. A_ADD,A_SUB,A_AND,A_OR,A_XOR,A_SHL,A_SHR,A_SAR:
  11077. begin
  11078. if
  11079. (taicpu(hp1).oper[1]^.typ <> top_reg) or
  11080. { Has to be an exact match on the register }
  11081. (taicpu(hp1).oper[1]^.reg <> ThisReg) or not
  11082. (
  11083. (
  11084. (taicpu(hp1).oper[0]^.typ = top_const) and
  11085. (
  11086. (
  11087. (taicpu(hp1).opcode = A_SHL) and
  11088. (
  11089. ((MinSize = S_B) and (taicpu(hp1).oper[0]^.val < 8)) or
  11090. ((MinSize = S_W) and (taicpu(hp1).oper[0]^.val < 16)) or
  11091. ((MinSize = S_L) and (taicpu(hp1).oper[0]^.val < 32))
  11092. )
  11093. ) or (
  11094. (taicpu(hp1).opcode <> A_SHL) and
  11095. (
  11096. ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
  11097. { Is it in the negative range? }
  11098. (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
  11099. )
  11100. )
  11101. )
  11102. ) or (
  11103. MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) and
  11104. ((taicpu(hp1).opcode = A_ADD) or (taicpu(hp1).opcode = A_AND) or (taicpu(hp1).opcode = A_SUB))
  11105. )
  11106. ) then
  11107. Break;
  11108. { Only process OR and XOR if there are only bitwise operations,
  11109. since otherwise they can too easily fool the data flow
  11110. analysis (they can cause non-linear behaviour) }
  11111. case taicpu(hp1).opcode of
  11112. A_ADD:
  11113. begin
  11114. if OrXorUsed then
  11115. { Too high a risk of non-linear behaviour that breaks DFA here }
  11116. Break
  11117. else
  11118. BitwiseOnly := False;
  11119. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  11120. begin
  11121. TestValMin := TestValMin * 2;
  11122. TestValMax := TestValMax * 2;
  11123. TestValSignedMax := TestValSignedMax * 2;
  11124. end
  11125. else
  11126. begin
  11127. WorkingValue := taicpu(hp1).oper[0]^.val;
  11128. TestValMin := TestValMin + WorkingValue;
  11129. TestValMax := TestValMax + WorkingValue;
  11130. TestValSignedMax := TestValSignedMax + WorkingValue;
  11131. end;
  11132. end;
  11133. A_SUB:
  11134. begin
  11135. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  11136. begin
  11137. TestValMin := 0;
  11138. TestValMax := 0;
  11139. TestValSignedMax := 0;
  11140. end
  11141. else
  11142. begin
  11143. if OrXorUsed then
  11144. { Too high a risk of non-linear behaviour that breaks DFA here }
  11145. Break
  11146. else
  11147. BitwiseOnly := False;
  11148. WorkingValue := taicpu(hp1).oper[0]^.val;
  11149. TestValMin := TestValMin - WorkingValue;
  11150. TestValMax := TestValMax - WorkingValue;
  11151. TestValSignedMax := TestValSignedMax - WorkingValue;
  11152. end;
  11153. end;
  11154. A_AND:
  11155. if (taicpu(hp1).oper[0]^.typ = top_const) then
  11156. begin
  11157. { we might be able to go smaller if AND appears first }
  11158. if InstrMax = -1 then
  11159. case MinSize of
  11160. S_B:
  11161. ;
  11162. S_W:
  11163. if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
  11164. ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
  11165. begin
  11166. TryShiftDown := S_B;
  11167. TryShiftDownLimit := $FF;
  11168. end;
  11169. S_L:
  11170. if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
  11171. ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
  11172. begin
  11173. TryShiftDown := S_B;
  11174. TryShiftDownLimit := $FF;
  11175. end
  11176. else if ((taicpu(hp1).oper[0]^.val and $FFFF) = taicpu(hp1).oper[0]^.val) or
  11177. ((not(taicpu(hp1).oper[0]^.val) and $7FFF) = (not taicpu(hp1).oper[0]^.val)) then
  11178. begin
  11179. TryShiftDown := S_W;
  11180. TryShiftDownLimit := $FFFF;
  11181. end;
  11182. else
  11183. InternalError(2020112320);
  11184. end;
  11185. WorkingValue := taicpu(hp1).oper[0]^.val;
  11186. TestValMin := TestValMin and WorkingValue;
  11187. TestValMax := TestValMax and WorkingValue;
  11188. TestValSignedMax := TestValSignedMax and WorkingValue;
  11189. end;
  11190. A_OR:
  11191. begin
  11192. if not BitwiseOnly then
  11193. Break;
  11194. OrXorUsed := True;
  11195. WorkingValue := taicpu(hp1).oper[0]^.val;
  11196. TestValMin := TestValMin or WorkingValue;
  11197. TestValMax := TestValMax or WorkingValue;
  11198. TestValSignedMax := TestValSignedMax or WorkingValue;
  11199. end;
  11200. A_XOR:
  11201. begin
  11202. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  11203. begin
  11204. TestValMin := 0;
  11205. TestValMax := 0;
  11206. TestValSignedMax := 0;
  11207. end
  11208. else
  11209. begin
  11210. if not BitwiseOnly then
  11211. Break;
  11212. OrXorUsed := True;
  11213. WorkingValue := taicpu(hp1).oper[0]^.val;
  11214. TestValMin := TestValMin xor WorkingValue;
  11215. TestValMax := TestValMax xor WorkingValue;
  11216. TestValSignedMax := TestValSignedMax xor WorkingValue;
  11217. end;
  11218. end;
  11219. A_SHL:
  11220. begin
  11221. BitwiseOnly := False;
  11222. WorkingValue := taicpu(hp1).oper[0]^.val;
  11223. TestValMin := TestValMin shl WorkingValue;
  11224. TestValMax := TestValMax shl WorkingValue;
  11225. TestValSignedMax := TestValSignedMax shl WorkingValue;
  11226. end;
  11227. A_SHR,
  11228. { The first instruction was MOVZX, so the value won't be negative }
  11229. A_SAR:
  11230. begin
  11231. if InstrMax <> -1 then
  11232. BitwiseOnly := False
  11233. else
  11234. { we might be able to go smaller if SHR appears first }
                      case MinSize of
                        S_B:
                          ;
                        S_W:
                          if (taicpu(hp1).oper[0]^.val >= 8) then
                            begin
                              TryShiftDown := S_B;
                              TryShiftDownLimit := $FF;
                              TryShiftDownSignedLimit := $7F;
                              TryShiftDownSignedLimitLower := -128;
                            end;
                        S_L:
                          if (taicpu(hp1).oper[0]^.val >= 24) then
                            begin
                              TryShiftDown := S_B;
                              TryShiftDownLimit := $FF;
                              TryShiftDownSignedLimit := $7F;
                              TryShiftDownSignedLimitLower := -128;
                            end
                          else if (taicpu(hp1).oper[0]^.val >= 16) then
                            begin
                              TryShiftDown := S_W;
                              TryShiftDownLimit := $FFFF;
                              TryShiftDownSignedLimit := $7FFF;
                              TryShiftDownSignedLimitLower := -32768;
                            end;
                        else
                          InternalError(2020112321);
                      end;
                    WorkingValue := taicpu(hp1).oper[0]^.val;
                    if taicpu(hp1).opcode = A_SAR then
                      begin
                        TestValMin := SarInt64(TestValMin, WorkingValue);
                        TestValMax := SarInt64(TestValMax, WorkingValue);
                        TestValSignedMax := SarInt64(TestValSignedMax, WorkingValue);
                      end
                    else
                      begin
                        TestValMin := TestValMin shr WorkingValue;
                        TestValMax := TestValMax shr WorkingValue;
                        TestValSignedMax := TestValSignedMax shr WorkingValue;
                      end;
                  end;
                else
                  InternalError(2020112303);
              end;
            end;
            (*
            A_IMUL:
              case taicpu(hp1).ops of
                2:
                  begin
                    if not MatchOpType(hp1, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[0]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) then
                      Break;
                    TestValMin := TestValMin * TestValMin;
                    TestValMax := TestValMax * TestValMax;
                    TestValSignedMax := TestValSignedMax * TestValMax;
                  end;
                3:
                  begin
                    if not MatchOpType(hp1, top_const, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[2]^.reg <> ThisReg) or
                      ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
                      { Is it in the negative range? }
                      (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
                      Break;
                    TestValMin := TestValMin * taicpu(hp1).oper[0]^.val;
                    TestValMax := TestValMax * taicpu(hp1).oper[0]^.val;
                    TestValSignedMax := TestValSignedMax * taicpu(hp1).oper[0]^.val;
                  end;
                else
                  Break;
              end;
            A_IDIV:
              case taicpu(hp1).ops of
                3:
                  begin
                    if not MatchOpType(hp1, top_const, top_reg, top_reg) or
                      { Has to be an exact match on the register }
                      (taicpu(hp1).oper[1]^.reg <> ThisReg) or
                      (taicpu(hp1).oper[2]^.reg <> ThisReg) or
                      ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
                      { Is it in the negative range? }
                      (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
                      Break;
                    TestValMin := TestValMin div taicpu(hp1).oper[0]^.val;
                    TestValMax := TestValMax div taicpu(hp1).oper[0]^.val;
                    TestValSignedMax := TestValSignedMax div taicpu(hp1).oper[0]^.val;
                  end;
                else
                  Break;
              end;
            *)
            A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
              begin
                { If there are no instructions in between, then we might be able to make a saving }
                if UpperSignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) or (taicpu(hp1).oper[0]^.reg <> ThisReg) then
                  Break;
                { We have something like:
                    movzbw %dl,%dx
                    ...
                    movswl %dx,%edx
                  Change the latter to a zero-extension then enter the
                  A_MOVZX case branch.
                }
{$ifdef x86_64}
                if (taicpu(hp1).opsize = S_LQ) and SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
                  begin
                    { this becomes a zero extension from 32-bit to 64-bit, but
                      the upper 32 bits are already zero, so just delete the
                      instruction }
                    DebugMsg(SPeepholeOptimization + 'MovzMovsxd2MovzNop', hp1);
                    RemoveInstruction(hp1);
                    Result := True;
                    Exit;
                  end
                else
{$endif x86_64}
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzMovs2MovzMovz', hp1);
                    taicpu(hp1).opcode := A_MOVZX;
{$ifdef x86_64}
                    case taicpu(hp1).opsize of
                      S_BQ:
                        begin
                          taicpu(hp1).opsize := S_BL;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end;
                      S_WQ:
                        begin
                          taicpu(hp1).opsize := S_WL;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end;
                      S_LQ:
                        begin
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).opsize := S_L;
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                          { In this instance, we need to break out because the
                            instruction is no longer MOVZX or MOVSXD }
                          Result := True;
                          Exit;
                        end;
                      else
                        ;
                    end;
{$endif x86_64}
                    Result := CompressInstructions;
                    Exit;
                  end;
              end;
            A_MOVZX:
              begin
                if UpperUnsignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) then
                  Break;
                if (InstrMax = -1) then
                  begin
                    if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
                      begin
                        { Optimise around i40003 }
                        { Check to see if the active register is used afterwards }
                        TransferUsedRegs(TmpUsedRegs);
                        IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
                        if (
                             SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) or
                             not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs)
                           ) and
                           (taicpu(p).opsize = S_WL) and (taicpu(hp1).opsize = S_BL)
{$ifndef x86_64}
                           and (
                             (taicpu(p).oper[0]^.typ <> top_reg) or
                             { Cannot encode byte-sized ESI, EDI, EBP or ESP under i386 }
                             (GetSupReg(taicpu(p).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])
                           )
{$endif not x86_64}
                           then
                          begin
                            if (taicpu(p).oper[0]^.typ = top_reg) then
                              setsubreg(taicpu(p).oper[0]^.reg, R_SUBL);
                            DebugMsg(SPeepholeOptimization + 'movzwl2movzbl 1', p);
                            taicpu(p).opsize := S_BL;
                            { Only remove if the active register is overwritten }
                            if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
                              begin
                                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2a', hp1);
                                RemoveInstruction(hp1);
                              end;
                            Result := True;
                            Exit;
                          end;
                      end
                    else
                      begin
                        { Will return false if the second parameter isn't ThisReg
                          (can happen on -O2 and under) }
                        if Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ThisReg) then
                          begin
                            { The two MOVZX instructions are adjacent, so remove the first one }
                            DebugMsg(SPeepholeOptimization + 'Movzx2Nop 5', p);
                            RemoveCurrentP(p);
                            Result := True;
                            Exit;
                          end;
                        Break;
                      end;
                  end;
                Result := CompressInstructions;
                Exit;
              end;
            else
              { This includes ADC, SBB and IDIV }
              Break;
          end;
          if not CheckOverflowConditions then
            Break;
          { Contains highest index (so instruction count - 1) }
          Inc(InstrMax);
          if InstrMax > High(InstrList) then
            SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
          InstrList[InstrMax] := taicpu(hp1);
        end;
      end;
{$pop}

    function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        Result:=false;
        if (taicpu(p).ops >= 2) and
           ((taicpu(p).oper[0]^.typ = top_const) or
            ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
           (taicpu(p).oper[1]^.typ = top_reg) and
           ((taicpu(p).ops = 2) or
            ((taicpu(p).oper[2]^.typ = top_reg) and
             (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
           GetLastInstruction(p,hp1) and
           MatchInstruction(hp1,A_MOV,[]) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
           (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
               ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
              { change
                  mov reg1,reg2
                  imul y,reg2 to imul y,reg1,reg2 }
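              { e.g. (illustrative): "movl %esi,%ecx; imull $5,%ecx"
                becomes "imull $5,%esi,%ecx" }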
              begin
                taicpu(p).ops := 3;
                taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
                taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
                DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;

    procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
      var
        ThisLabel: TAsmLabel;
      begin
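        { Illustrative example (assumed): an unconditional "jmp .L1" whose
          target .L1 is immediately followed by "ret $16" is rewritten in
          place as "ret $16" - the RET operand is copied below }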
        ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
        ThisLabel.decrefs;
        taicpu(p).condition := C_None;
        taicpu(p).opcode := A_RET;
        taicpu(p).is_jmp := false;
        taicpu(p).ops := taicpu(ret_p).ops;
        case taicpu(ret_p).ops of
          0:
            taicpu(p).clearop(0);
          1:
            taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
          else
            internalerror(2016041301);
        end;
        { If the original label is now dead, it might turn out that the label
          immediately follows p.  As a result, everything beyond it, which will
          be just some final register configuration and a RET instruction, is
          now dead code. [Kit] }
        { NOTE: This is much faster than introducing a OptPass2RET routine and
          running RemoveDeadCodeAfterJump for each RET instruction, because
          this optimisation rarely happens and most RETs appear at the end of
          routines where there is nothing that can be stripped. [Kit] }
        if not ThisLabel.is_used then
          RemoveDeadCodeAfterJump(p);
      end;

    function TX86AsmOptimizer.OptPass2SETcc(var p: tai): boolean;
      var
        hp1,hp2,next: tai;
        SetC, JumpC: TAsmCond;
        Unconditional, PotentialModified: Boolean;
        OperPtr: POper;
        NewRef: TReference;
        InstrList: array of taicpu;
        InstrMax, Index: Integer;
      const
{$ifdef DEBUG_AOPTCPU}
        SNoFlags: shortstring = ' so the flags aren''t modified';
{$else DEBUG_AOPTCPU}
        SNoFlags = '';
{$endif DEBUG_AOPTCPU}
      begin
        Result:=false;
        if MatchOpType(taicpu(p),top_reg) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
          begin
            if MatchInstruction(hp1, A_TEST, [S_B]) and
               MatchOpType(taicpu(hp1),top_reg,top_reg) and
               (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
               (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
               GetNextInstruction(hp1, hp2) and
               MatchInstruction(hp2, A_Jcc, A_SETcc, []) then
              { Change from:                          To:
                  set(C) %reg                           j(~C) label
                  test %reg,%reg / cmp $0,%reg
                  je   label
                  set(C) %reg                           j(C) label
                  test %reg,%reg / cmp $0,%reg
                  jne  label
                (Also do something similar with sete/setne instead of je/jne)
              }
              begin
                { Before we do anything else, we need to check the instructions
                  in between SETcc and TEST to make sure they don't modify the
                  FLAGS register - if -O2 or under, there won't be any
                  instructions between SET and TEST }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if (cs_opt_level3 in current_settings.optimizerswitches) then
                  begin
                    next := p;
                    SetLength(InstrList, 0);
                    InstrMax := -1;
                    PotentialModified := False;
                    { Make a note of every instruction that modifies the FLAGS
                      register }
                    while GetNextInstruction(next, next) and (next <> hp1) do
                      begin
                        if next.typ <> ait_instruction then
                          { GetNextInstructionUsingReg should have returned False }
                          InternalError(2021051701);
                        if RegModifiedByInstruction(NR_DEFAULTFLAGS, next) then
                          begin
                            case taicpu(next).opcode of
                              A_SETcc,
                              A_CMOVcc,
                              A_Jcc:
                                begin
                                  if PotentialModified then
                                    { Not safe because the flags were modified earlier }
                                    Exit
                                  else
                                    { Condition is the same as the initial SETcc, so this is safe
                                      (don't add to instruction list though) }
                                    Continue;
                                end;
                              A_ADD:
                                begin
                                  if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                     (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                     { Must write to a register }
                                     (taicpu(next).oper[1]^.typ <> top_reg) or
                                     { Require a constant or a register }
                                     (taicpu(next).oper[0]^.typ = top_ref) then
                                    Exit;
                                  PotentialModified := True;
                                end;
                              A_SUB:
                                begin
                                  if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                     (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                     { Must write to a register }
                                     (taicpu(next).oper[1]^.typ <> top_reg) or
                                     { Can't subtract a register with LEA - also check
                                       that the value isn't -2^31, as this can't be negated }
                                     (taicpu(next).oper[0]^.typ <> top_const) or
                                     (taicpu(next).oper[0]^.val = $80000000) then
                                    Exit;
                                  PotentialModified := True;
                                end;
                              A_SAL,
                              A_SHL:
                                begin
                                  if { LEA doesn't support 8-bit operands in general, nor 16-bit operands on x86-64 }
                                     (taicpu(next).opsize in [S_B{$ifdef x86_64},S_W{$endif x86_64}]) or
                                     { Must write to a register }
                                     (taicpu(next).oper[1]^.typ <> top_reg) or
                                     (taicpu(next).oper[0]^.typ <> top_const) or
                                     (taicpu(next).oper[0]^.val < 0) or
                                     (taicpu(next).oper[0]^.val > 3) then
                                    Exit;
                                  PotentialModified := True;
                                end;
                              A_IMUL:
                                begin
                                  if (taicpu(next).ops <> 3) or
                                     { Must write to a register }
                                     (taicpu(next).oper[1]^.typ <> top_reg) or
                                     { The multiplier must be 2, 3, 4, 5, 8 or 9 }
                                     not (taicpu(next).oper[0]^.val in [2,3,4,5,8,9]) then
                                    { We can convert "imul x,%reg1,%reg2" (where x = 2, 4 or 8)
                                      to "lea (%reg1,x),%reg2".  If x = 3, 5 or 9, we can
                                      change this to "lea (%reg1,%reg1,(x-1)),%reg2" }
                                    Exit
                                  else
                                    PotentialModified := True;
                                end;
                              else
                                { Don't know how to change this, so abort }
                                Exit;
                            end;
                            { Contains highest index (so instruction count - 1) }
                            Inc(InstrMax);
                            if InstrMax > High(InstrList) then
                              SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
                            InstrList[InstrMax] := taicpu(next);
                          end;
                        UpdateUsedRegs(TmpUsedRegs, tai(next.next));
                      end;
                    if not Assigned(next) or (next <> hp1) then
                      { It should be equal to hp1 }
                      InternalError(2021051702);
                    { Cycle through each instruction and check to see if we can
                      change them to versions that don't modify the flags }
                    if (InstrMax >= 0) then
                      begin
                        for Index := 0 to InstrMax do
                          case InstrList[Index].opcode of
                            A_ADD:
                              begin
                                DebugMsg(SPeepholeOptimization + 'ADD -> LEA' + SNoFlags, InstrList[Index]);
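                                { e.g. (illustrative): "addl $8,%edx" -> "leal 8(%edx),%edx",
                                  and "addl %eax,%edx" -> "leal (%edx,%eax),%edx" }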
                                InstrList[Index].opcode := A_LEA;
                                reference_reset(NewRef, 1, []);
                                NewRef.base := InstrList[Index].oper[1]^.reg;
                                if InstrList[Index].oper[0]^.typ = top_reg then
                                  begin
                                    NewRef.index := InstrList[Index].oper[0]^.reg;
                                    NewRef.scalefactor := 1;
                                  end
                                else
                                  NewRef.offset := InstrList[Index].oper[0]^.val;
                                InstrList[Index].loadref(0, NewRef);
                              end;
                            A_SUB:
                              begin
                                DebugMsg(SPeepholeOptimization + 'SUB -> LEA' + SNoFlags, InstrList[Index]);
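                                { e.g. (illustrative): "subl $4,%edx" -> "leal -4(%edx),%edx" }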
                                InstrList[Index].opcode := A_LEA;
                                reference_reset(NewRef, 1, []);
                                NewRef.base := InstrList[Index].oper[1]^.reg;
                                NewRef.offset := -InstrList[Index].oper[0]^.val;
                                InstrList[Index].loadref(0, NewRef);
                              end;
                            A_SHL,
                            A_SAL:
                              begin
                                DebugMsg(SPeepholeOptimization + 'SHL -> LEA' + SNoFlags, InstrList[Index]);
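                                { e.g. (illustrative): "shll $2,%edx" -> "leal (,%edx,4),%edx" }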
                                InstrList[Index].opcode := A_LEA;
                                reference_reset(NewRef, 1, []);
                                NewRef.index := InstrList[Index].oper[1]^.reg;
                                NewRef.scalefactor := 1 shl (InstrList[Index].oper[0]^.val);
                                InstrList[Index].loadref(0, NewRef);
                              end;
                            A_IMUL:
                              begin
                                DebugMsg(SPeepholeOptimization + 'IMUL -> LEA' + SNoFlags, InstrList[Index]);
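                                { e.g. (illustrative): "imull $4,%eax,%edx" -> "leal (,%eax,4),%edx",
                                  and "imull $3,%eax,%edx" -> "leal (%eax,%eax,2),%edx" }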
                                InstrList[Index].opcode := A_LEA;
                                reference_reset(NewRef, 1, []);
                                NewRef.index := InstrList[Index].oper[1]^.reg;
                                case InstrList[Index].oper[0]^.val of
                                  2, 4, 8:
                                    NewRef.scalefactor := InstrList[Index].oper[0]^.val;
                                  else {3, 5 and 9}
                                    begin
                                      NewRef.scalefactor := InstrList[Index].oper[0]^.val - 1;
                                      NewRef.base := InstrList[Index].oper[1]^.reg;
                                    end;
                                end;
                                InstrList[Index].loadref(0, NewRef);
                              end;
                            else
                              InternalError(2021051710);
                          end;
                      end;
                    { Mark the FLAGS register as used across this whole block }
                    AllocRegBetween(NR_DEFAULTFLAGS, p, hp1, UsedRegs);
                  end;
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                JumpC := taicpu(hp2).condition;
                Unconditional := False;
                if conditions_equal(JumpC, C_E) then
                  SetC := inverse_cond(taicpu(p).condition)
                else if conditions_equal(JumpC, C_NE) then
                  SetC := taicpu(p).condition
                else
                  { We've got something weird here (and inefficient) }
                  begin
                    DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                    SetC := C_NONE;
                    { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                    if condition_in(C_AE, JumpC) then
                      Unconditional := True
                    else
                      { Not sure what to do with this jump - drop out }
                      Exit;
                  end;
                RemoveInstruction(hp1);
                if Unconditional then
                  MakeUnconditional(taicpu(hp2))
                else
                  begin
                    if SetC = C_NONE then
                      InternalError(2018061402);
                    taicpu(hp2).SetCondition(SetC);
                  end;
                { As hp2 is a jump, we cannot use RegUsedAfterInstruction but
                  we have to check if it is included in TmpUsedRegs }
                if not TmpUsedRegs[getregtype(taicpu(p).oper[0]^.reg)].IsUsed(taicpu(p).oper[0]^.reg) then
                  begin
                    RemoveCurrentp(p, hp2);
                    if taicpu(hp2).opcode = A_SETcc then
                      DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc',p)
                    else
                      begin
                        DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> Jcc',p);
                        if (cs_opt_level3 in current_settings.optimizerswitches) then
                          Include(OptsToCheck, aoc_DoPass2JccOpts);
                      end;
                  end
                else
                  if taicpu(hp2).opcode = A_SETcc then
                    DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc/SETcc',p)
                  else
                    DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> SETcc/Jcc',p);
                Result := True;
              end
            else if
              { Make sure the instructions are adjacent }
              (
                not (cs_opt_level3 in current_settings.optimizerswitches) or
                GetNextInstruction(p, hp1)
              ) and
              MatchInstruction(hp1, A_MOV, [S_B]) and
              { Writing to memory is allowed }
              MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) then
              begin
                {
                  Watch out for sequences such as:
                    set(c)b %regb
                    movb    %regb,(ref)
                    movb    $0,1(ref)
                    movb    $0,2(ref)
                    movb    $0,3(ref)
                  Much more efficient to turn it into:
                    movl    $0,%regl
                    set(c)b %regb
                    movl    %regl,(ref)
                  Or:
                    set(c)b %regb
                    movzbl  %regb,%regl
                    movl    %regl,(ref)
                }
                if (taicpu(hp1).oper[1]^.typ = top_ref) and
                   GetNextInstruction(hp1, hp2) and
                   MatchInstruction(hp2, A_MOV, [S_B]) and
                   (taicpu(hp2).oper[1]^.typ = top_ref) and
                   CheckMemoryWrite(taicpu(hp1), taicpu(hp2)) then
                  begin
                    { Don't do anything else except set Result to True }
                  end
                else
                  begin
                    if taicpu(p).oper[0]^.typ = top_reg then
                      begin
                        TransferUsedRegs(TmpUsedRegs);
                        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                      end;
                    { If it's not a register, it's a memory address }
                    if (taicpu(p).oper[0]^.typ <> top_reg) or RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
                      begin
                        { Even if the register is still in use, we can minimise the
                          pipeline stall by changing the MOV into another SETcc. }
                        taicpu(hp1).opcode := A_SETcc;
                        taicpu(hp1).condition := taicpu(p).condition;
                        if taicpu(hp1).oper[1]^.typ = top_ref then
                          begin
                            { Swapping the operand pointers like this is probably a
                              bit naughty, but it is far faster than using loadoper
                              to transfer the reference from oper[1] to oper[0] if
                              you take into account the extra procedure calls and
                              the memory allocation and deallocation required }
                            OperPtr := taicpu(hp1).oper[1];
                            taicpu(hp1).oper[1] := taicpu(hp1).oper[0];
                            taicpu(hp1).oper[0] := OperPtr;
                          end
                        else
                          taicpu(hp1).oper[0]^.reg := taicpu(hp1).oper[1]^.reg;
                        taicpu(hp1).clearop(1);
                        taicpu(hp1).ops := 1;
                        DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc/SETcc',p);
                      end
                    else
                      begin
                        if taicpu(hp1).oper[1]^.typ = top_reg then
                          AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
                        taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                        RemoveInstruction(hp1);
                        DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p);
                      end
                  end;
                Result := True;
              end;
          end;
      end;
    function TX86AsmOptimizer.TryCmpCMovOpts(var p, hp1: tai): Boolean;
      var
        hp2, pCond, pFirstMOV, pLastMOV, pCMOV: tai;
        TargetReg: TRegister;
        condition, inverted_condition: TAsmCond;
        FoundMOV: Boolean;
      begin
        Result := False;
        { In some situations, the CMOV optimisations in OptPass2Jcc can't
          create the most optimal instructions possible due to limited
          register availability, and there are situations where two
          complementary "simple" CMOV blocks are created which, after the
          fact, can be merged into a "double" block.  For example:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            testw   $16,18(%rcx)
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
            movw    $1283,%ax
            movw    $4,%r8w
            movl    $9,%r9d
            cmovnew %ax,%dx
            cmovnew %r8w,%bx
            cmovnel %r9d,%r14d
          The CMOVNE instructions at the end can be removed, and the
          destination registers copied into the MOV instructions directly
          above them, before finally being moved to before the first CMOVE
          instructions, to produce:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            testw   $16,18(%rcx)
            movw    $1283,%dx
            movw    $4,%bx
            movl    $9,%r14d
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
          Which can then be later optimised to:
            movw    $257,%ax
            movw    $2,%r8w
            xorl    %r9d,%r9d
            movw    $1283,%dx
            movw    $4,%bx
            movl    $9,%r14d
            testw   $16,18(%rcx)
            cmovew  %ax,%dx
            cmovew  %r8w,%bx
            cmovel  %r9d,%r14d
        }
        TargetReg := taicpu(hp1).oper[1]^.reg;
        condition := taicpu(hp1).condition;
        inverted_condition := inverse_cond(condition);
        pFirstMOV := nil;
        pLastMOV := nil;
        pCMOV := nil;
        if (p.typ = ait_instruction) then
          pCond := p
        else if not GetNextInstruction(p, pCond) then
          InternalError(2024012501);
        if not MatchInstruction(pCond, A_CMP, A_TEST, []) then
          { We should get the CMP or TEST instruction }
          InternalError(2024012502);
        if (
             (taicpu(hp1).oper[0]^.typ = top_reg) or
             IsRefSafe(taicpu(hp1).oper[0]^.ref)
           ) then
          begin
            { We have to tread carefully here, hence why we're not using
              GetNextInstructionUsingReg... we can only accept MOV and other
              CMOV instructions.  Anything else and we must drop out }
            hp2 := hp1;
            while GetNextInstruction(hp2, hp2) and (hp2 <> BlockEnd) do
              begin
                if (hp2.typ <> ait_instruction) then
                  Exit;
                case taicpu(hp2).opcode of
                  A_MOV:
                    begin
                      if not Assigned(pFirstMOV) then
                        pFirstMOV := hp2;
                      pLastMOV := hp2;
                      if not MatchOpType(taicpu(hp2), top_const, top_reg) then
                        { Something different - drop out }
                        Exit;
                      { Otherwise, leave it for now }
                    end;
                  A_CMOVcc:
                    begin
                      if taicpu(hp2).condition = inverted_condition then
                        begin
                          { We found what we're looking for }
                          if taicpu(hp2).oper[1]^.reg = TargetReg then
                            begin
                              if (taicpu(hp2).oper[0]^.typ = top_reg) or
                                 IsRefSafe(taicpu(hp2).oper[0]^.ref) then
                                begin
                                  pCMOV := hp2;
                                  Break;
                                end
                              else
                                { Unsafe reference - drop out }
                                Exit;
                            end;
                        end
                      else if taicpu(hp2).condition <> condition then
                        { Something weird - drop out }
                        Exit;
                    end;
                  else
                    { Invalid }
                    Exit;
                end;
              end;
            if not Assigned(pCMOV) then
              { No complementary CMOV found }
              Exit;
            if not Assigned(pFirstMOV) or (taicpu(pCMOV).oper[0]^.typ = top_ref) then
              begin
                { Don't need to do anything special or search for a matching MOV }
                Asml.Remove(pCMOV);
                if RegInInstruction(TargetReg, pCond) then
                  { Make sure we don't overwrite the register if it's being used in the condition }
                  Asml.InsertAfter(pCMOV, pCond)
                else
                  Asml.InsertBefore(pCMOV, pCond);
                taicpu(pCMOV).opcode := A_MOV;
                taicpu(pCMOV).condition := C_None;
                { Don't need to worry about allocating new registers in these cases }
                DebugMsg(SPeepholeOptimization + 'CMovCMov2MovCMov 2', pCMOV);
                Result := True;
                Exit;
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + 'CMovCMov2MovCMov 1', hp1);
                FoundMOV := False;
                { Search for the MOV that sets the target register }
                hp2 := pFirstMOV;
                repeat
                  if (taicpu(hp2).opcode = A_MOV) and
                     (taicpu(hp2).oper[1]^.typ = top_reg) and
                     SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(pCMOV).oper[0]^.reg) then
                    begin
                      { Change the destination }
                      taicpu(hp2).loadreg(1, newreg(R_INTREGISTER, getsupreg(TargetReg), getsubreg(taicpu(hp2).oper[1]^.reg)));
                      if not FoundMOV then
                        begin
                          FoundMOV := True;
                          { Make sure the register is allocated }
                          AllocRegBetween(TargetReg, p, hp2, UsedRegs);
                        end;
                      hp1 := tai(hp2.Previous);
                      Asml.Remove(hp2);
                      if RegInInstruction(TargetReg, pCond) then
                        { Make sure we don't overwrite the register if it's being used in the condition }
                        Asml.InsertAfter(hp2, pCond)
                      else
                        Asml.InsertBefore(hp2, pCond);
                      if (hp2 = pLastMOV) then
                        { Since this MOV was just moved, the "hp2 = pLastMOV" check
                          in the until clause below won't trigger, so break out here }
                        Break;
                      hp2 := hp1;
                    end;
                until (hp2 = pLastMOV) or not GetNextInstruction(hp2, hp2) or (hp2 = BlockEnd) or (hp2.typ <> ait_instruction);
                if FoundMOV then
                  { Delete the CMOV }
                  RemoveInstruction(pCMOV)
                else
                  begin
                    { If no MOV was found, we have to actually move and transmute the CMOV }
                    Asml.Remove(pCMOV);
                    if RegInInstruction(TargetReg, pCond) then
                      { Make sure we don't overwrite the register if it's being used in the condition }
                      Asml.InsertAfter(pCMOV, pCond)
                    else
                      Asml.InsertBefore(pCMOV, pCond);
                    taicpu(pCMOV).opcode := A_MOV;
                    taicpu(pCMOV).condition := C_None;
                  end;
                Result := True;
                Exit;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass2Cmp(var p: tai): Boolean;
      var
        hp1, hp2, pCond: tai;
      begin
        Result := False;
        { Search ahead for CMOV instructions }
        if (cs_opt_level2 in current_settings.optimizerswitches) then
          begin
            hp1 := p;
            hp2 := p;
            pCond := nil; { To prevent compiler warnings }
            { For TryCmpCMovOpts, try to insert MOVs before the allocation of
              DEFAULTFLAGS }
            if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, p), pCond) or
               (tai_regalloc(pCond).ratype = ra_dealloc) then
              pCond := p;
            while GetNextInstruction(hp1, hp1) and (hp1 <> BlockEnd) do
              begin
                if (hp1.typ <> ait_instruction) then
                  { Break out on markers and labels etc. }
                  Break;
                case taicpu(hp1).opcode of
                  A_MOV:
                    { Ignore regular MOVs unless they are obviously not related
                      to a CMOV block }
                    if taicpu(hp1).oper[1]^.typ <> top_reg then
                      Break;
                  A_CMOVcc:
                    if TryCmpCMovOpts(pCond, hp1) then
                      begin
                        hp1 := hp2;
                        { p itself isn't changed, and we're still inside a
                          while loop to catch subsequent CMOVs, so just flag
                          a new iteration }
                        Include(OptsToCheck, aoc_ForceNewIteration);
                        Continue;
                      end;
                  else
                    { Drop out if we find anything else }
                    Break;
                end;
                hp2 := hp1;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass2Test(var p: tai): Boolean;
      var
        hp1, hp2, pCond: tai;
        SourceReg, TargetReg: TRegister;
      begin
        Result := False;
        { In some situations, we end up with an inefficient arrangement of
          instructions in the form of:
            or   %reg1,%reg2
            (%reg1 deallocated)
            test %reg2,%reg2
            mov  x,%reg2
          We may be able to swap and rearrange the registers to produce:
            or   %reg2,%reg1
            mov  x,%reg2
            test %reg1,%reg1
            (%reg1 deallocated)
        }
        if (cs_opt_level3 in current_settings.optimizerswitches) and
           (taicpu(p).oper[1]^.typ = top_reg) and
           (
             MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) or
             MatchOperand(taicpu(p).oper[0]^, -1)
           ) and
           GetNextInstruction(p, hp1) and
           MatchInstruction(hp1, A_MOV, []) and
           (taicpu(hp1).oper[1]^.typ = top_reg) and
           SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
          begin
            TargetReg := taicpu(p).oper[1]^.reg;
            { Now look backwards to find a simple commutative operation: ADD,
              IMUL (2-register version), OR, AND or XOR - whose destination
              register is the same as TEST }
            hp2 := p;
            while GetLastInstruction(hp2, hp2) and (hp2.typ = ait_instruction) do
              if RegInInstruction(TargetReg, hp2) then
                begin
                  if MatchInstruction(hp2, [A_ADD, A_IMUL, A_OR, A_AND, A_XOR], [taicpu(p).opsize]) and
                     MatchOpType(taicpu(hp2), top_reg, top_reg) and
                     (taicpu(hp2).oper[1]^.reg = TargetReg) and
                     (taicpu(hp2).oper[0]^.reg <> TargetReg) then
                    begin
                      SourceReg := taicpu(hp2).oper[0]^.reg;
                      if
                         { Make sure the MOV doesn't use the other register }
                         not RegInOp(SourceReg, taicpu(hp1).oper[0]^) and
                         { And make sure the source register is not used afterwards }
                         not RegInUsedRegs(SourceReg, UsedRegs) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'OpTest2OpTest (register swap) done', hp2);
                          taicpu(hp2).oper[0]^.reg := TargetReg;
                          taicpu(hp2).oper[1]^.reg := SourceReg;
                          if taicpu(p).oper[0]^.typ = top_reg then
                            taicpu(p).oper[0]^.reg := SourceReg;
                          taicpu(p).oper[1]^.reg := SourceReg;
                          IncludeRegInUsedRegs(SourceReg, UsedRegs);
                          AllocRegBetween(SourceReg, hp2, p, UsedRegs);
                          Include(OptsToCheck, aoc_ForceNewIteration);
                          { We can still check the following optimisations since
                            the instruction is still a TEST }
                        end;
                    end;
                  Break;
                end;
          end;
        { Search ahead for CMOV instructions }
        if (cs_opt_level2 in current_settings.optimizerswitches) then
          begin
            hp1 := p;
            hp2 := p;
            pCond := nil; { To prevent compiler warnings }
            { For TryCmpCMovOpts, try to insert MOVs before the allocation of
              DEFAULTFLAGS }
            if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, p), pCond) or
               (tai_regalloc(pCond).ratype = ra_dealloc) then
              pCond := p;
            while GetNextInstruction(hp1, hp1) and (hp1 <> BlockEnd) do
              begin
                if (hp1.typ <> ait_instruction) then
                  { Break out on markers and labels etc. }
                  Break;
                case taicpu(hp1).opcode of
                  A_MOV:
                    { Ignore regular MOVs unless they are obviously not related
                      to a CMOV block }
                    if taicpu(hp1).oper[1]^.typ <> top_reg then
                      Break;
                  A_CMOVcc:
                    if TryCmpCMovOpts(pCond, hp1) then
                      begin
                        hp1 := hp2;
                        { p itself isn't changed, and we're still inside a
                          while loop to catch subsequent CMOVs, so just flag
                          a new iteration }
                        Include(OptsToCheck, aoc_ForceNewIteration);
                        Continue;
                      end;
                  else
                    { Drop out if we find anything else }
                    Break;
                end;
                hp2 := hp1;
              end;
          end;
      end;
    function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
      var
        hp1: tai;
        Count: Integer;
        OrigLabel: TAsmLabel;
      begin
        result := False;
        { Sometimes, the optimisations below can permit this }
        RemoveDeadCodeAfterJump(p);
        if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
           (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
          begin
            OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
            { Also a side-effect of optimisations }
            if CollapseZeroDistJump(p, OrigLabel) then
              begin
                Result := True;
                Exit;
              end;
            hp1 := GetLabelWithSym(OrigLabel);
            if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
              begin
                if taicpu(hp1).opcode = A_RET then
                  begin
                    { change
                        jmp .L1
                        ...
                      .L1:
                        ret
                      into
                        ret
                    }
                    ConvertJumpToRET(p, hp1);
                    result:=true;
                  end
                else if (cs_opt_level3 in current_settings.optimizerswitches) and
                   not (cs_opt_size in current_settings.optimizerswitches) and
                   CheckJumpMovTransferOpt(p, hp1, 0, Count) then
                  begin
                    Result := True;
                    Exit;
                  end;
              end;
          end;
      end;
    class function TX86AsmOptimizer.CanBeCMOV(p, cond_p: tai; var RefModified: Boolean) : boolean;
      begin
        Result := assigned(p) and
          MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
          (taicpu(p).oper[1]^.typ = top_reg) and
          (
            (taicpu(p).oper[0]^.typ = top_reg) or
            { Allow references, but only pure symbols or GOT-relative
              addressing with RIP as base; it is not expected that this can
              cause a segmentation violation }
            (
              (taicpu(p).oper[0]^.typ = top_ref) and
              { TODO: Can we detect which references become constants at this
                stage so we don't have to do a blanket ban? }
              (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
              (
                IsRefSafe(taicpu(p).oper[0]^.ref) or
                (
                  { Don't use the reference in the condition if one of its registers got modified by a previous MOV }
                  not RefModified and
                  { If the reference also appears in the condition, then we know it's safe, otherwise
                    any kind of access violation would have occurred already }
                  Assigned(cond_p) and
                  { Make sure the sizes match too so we're reading and writing the same number of bytes }
                  (cond_p.typ = ait_instruction) and
                  (taicpu(cond_p).opsize = taicpu(p).opsize) and
                  { Just consider 2-operand comparison instructions for now to be safe }
                  (taicpu(cond_p).ops = 2) and
                  (
                    ((taicpu(cond_p).oper[1]^.typ = top_ref) and RefsEqual(taicpu(cond_p).oper[1]^.ref^, taicpu(p).oper[0]^.ref^)) or
                    (
                      (taicpu(cond_p).oper[0]^.typ = top_ref) and
                      { Don't risk identical registers but different offsets, as we may have constructs
                        such as buffer streams with things like length fields that indicate whether
                        any more data follows.  And there are probably some contrived examples where
                        writing to offsets behind the one being read also leads to access violations }
                      RefsEqual(taicpu(cond_p).oper[0]^.ref^, taicpu(p).oper[0]^.ref^) and
                      (
                        { Check that we're not modifying a register that appears in the reference }
                        (InsProp[taicpu(cond_p).opcode].Ch * [Ch_Mop2, Ch_RWop2, Ch_Wop2] = []) or
                        (taicpu(cond_p).oper[1]^.typ <> top_reg) or
                        not RegInRef(taicpu(cond_p).oper[1]^.reg, taicpu(cond_p).oper[0]^.ref^)
                      )
                    )
                  )
                )
              )
            )
          );
      end;
    class procedure TX86AsmOptimizer.UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai);
      begin
        { Update integer registers, ignoring deallocations }
        repeat
          while assigned(p) and
                ((p.typ in (SkipInstr - [ait_RegAlloc])) or
                 (p.typ = ait_label) or
                 ((p.typ = ait_marker) and
                  (tai_Marker(p).Kind in [mark_AsmBlockEnd,mark_NoLineInfoStart,mark_NoLineInfoEnd]))) do
            p := tai(p.next);
          while assigned(p) and
                (p.typ=ait_RegAlloc) do
            begin
              if (getregtype(tai_regalloc(p).reg) = R_INTREGISTER) then
                begin
                  case tai_regalloc(p).ratype of
                    ra_alloc :
                      IncludeRegInUsedRegs(tai_regalloc(p).reg, AUsedRegs);
                    else
                      ;
                  end;
                end;
              p := tai(p.next);
            end;
        until not(assigned(p)) or
              (not(p.typ in SkipInstr) and
               not((p.typ = ait_label) and
                   labelCanBeSkipped(tai_label(p))));
      end;
{$ifndef 8086}
    function TCMOVTracking.InitialiseBlock(BlockStart, OneBeforeBlock: tai; out BlockStop: tai; out EndJump: tai): Boolean;
      begin
        Result := False;
        EndJump := nil;
        BlockStop := nil;
        while (BlockStart <> fOptimizer.BlockEnd) and
              { stop on labels }
              (BlockStart.typ <> ait_label) do
          begin
            { Keep track of all integer registers that are used }
            fOptimizer.UpdateIntRegsNoDealloc(RegisterTracking, tai(OneBeforeBlock.Next));
            if BlockStart.typ = ait_instruction then
              begin
                if (taicpu(BlockStart).opcode = A_JMP) then
                  begin
                    if not IsJumpToLabel(taicpu(BlockStart)) or
                       (JumpTargetOp(taicpu(BlockStart))^.ref^.index <> NR_NO) then
                      Exit;
                    EndJump := BlockStart;
                    Break;
                  end
                { Check to see if we have a valid MOV instruction instead }
                else if (taicpu(BlockStart).opcode <> A_MOV) or
                   (taicpu(BlockStart).oper[1]^.typ <> top_reg) or
                   not (taicpu(BlockStart).opsize in [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
                  begin
                    Exit;
                  end
                else
                  { This will be a valid MOV }
                  fAllocationRange := BlockStart;
              end;
            OneBeforeBlock := BlockStart;
            fOptimizer.GetNextInstruction(BlockStart, BlockStart);
          end;
        if (BlockStart = fOptimizer.BlockEnd) then
          Exit;
        BlockStop := BlockStart;
        Result := True;
      end;
    function TCMOVTracking.AnalyseMOVBlock(BlockStart, BlockStop, SearchStart: tai): LongInt;
      var
        hp1: tai;
        RefModified: Boolean;
      begin
        Result := 0;
        hp1 := BlockStart;
        RefModified := False; { As long as the condition is inverted, this can be reset }
        while assigned(hp1) and
              (hp1 <> BlockStop) do
          begin
            case hp1.typ of
              ait_instruction:
                if MatchInstruction(hp1, A_MOV, [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
                  begin
                    if fOptimizer.CanBeCMOV(hp1, fCondition, RefModified) then
                      begin
                        Inc(Result);
                        if { Make sure the sizes match too so we're reading and writing the same number of bytes }
                           Assigned(fCondition) and
                           { Will have 2 operands }
                           (
                             (
                               (taicpu(fCondition).oper[0]^.typ = top_ref) and
                               fOptimizer.RegInRef(taicpu(hp1).oper[1]^.reg, taicpu(fCondition).oper[0]^.ref^)
                             ) or
                             (
                               (taicpu(fCondition).oper[1]^.typ = top_ref) and
                               fOptimizer.RegInRef(taicpu(hp1).oper[1]^.reg, taicpu(fCondition).oper[1]^.ref^)
                             )
                           ) then
                          { It is no longer safe to use the reference in the condition.
                            This prevents problems such as:
                              mov (%reg),%reg
                              mov (%reg),...
                            when the comparison is "cmp (%reg),0" and is guarding against
                            a null pointer dereference (fixes #40165).
                            Note: "mov (%reg1),%reg2; mov (%reg2),..." won't be optimised
                            this way since at least one of (%reg1) and (%reg2) won't be in
                            the condition and is hence unsafe.
                          }
                          RefModified := True;
                      end
                    else if not (cs_opt_size in current_settings.optimizerswitches) and
                       { CMOV with constants grows the code size }
                       TryCMOVConst(hp1, SearchStart, BlockStop, Result) then
                      begin
                        { Register was reserved by TryCMOVConst and
                          stored on ConstRegs }
                      end
                    else
                      begin
                        Result := -1;
                        Exit;
                      end;
                  end
                else
                  begin
                    Result := -1;
                    Exit;
                  end;
              else
                { Most likely an align };
            end;
            fOptimizer.GetNextInstruction(hp1, hp1);
          end;
      end;
    constructor TCMOVTracking.Init(Optimizer: TX86AsmOptimizer; var p_initialjump, p_initialmov: tai; var AFirstLabel: TAsmLabel);

      { For the tsBranching type, increase the weighting score to account for the new conditional jump
        (this is done as a separate stage because the double types are extensions of the branching type,
        but we can't discount the conditional jump until the last step) }
      procedure EvaluateBranchingType;
        begin
          Inc(CMOVScore);
          if (CMOVScore > MAX_CMOV_INSTRUCTIONS) then
            { Too many instructions to be worthwhile }
            fState := tsInvalid;
        end;

      var
        hp1: tai;
        Count: Integer;
      begin
        { Table of valid CMOV block types:
            Block type               2nd Jump   Mid-label  2nd MOVs  3rd Jump  End-label
            -----------------------  ---------  ---------  --------  --------  ---------
            tsSimple                 X          Yes        X         X         X
            tsDetour                 = 1st      X          X         X         X
            tsBranching              <> Mid     Yes        X         X         X
            tsDouble                 End-label  Yes *      Yes       X         Yes
            tsDoubleBranchSame       <> Mid     Yes *      Yes       = 2nd     X
            tsDoubleBranchDifferent  <> Mid     Yes *      Yes       <> 2nd    X
            tsDoubleSecondBranching  End-label  Yes *      Yes       <> 2nd    Yes
          * Only one reference allowed
        }
        hp1 := nil; { To prevent compiler warnings }
        Optimizer.CopyUsedRegs(RegisterTracking);
        fOptimizer := Optimizer;
        fLabel := AFirstLabel;
        CMOVScore := 0;
        ConstCount := 0;
        { Initialise RegWrites, ConstRegs, ConstVals, ConstSizes, ConstWriteSizes and ConstMovs }
        FillChar(RegWrites[0], MAX_CMOV_INSTRUCTIONS * 2 * SizeOf(TRegister), 0);
        FillChar(ConstRegs[0], MAX_CMOV_REGISTERS * SizeOf(TRegister), 0);
        FillChar(ConstVals[0], MAX_CMOV_REGISTERS * SizeOf(TCGInt), 0);
        FillChar(ConstSizes[0], MAX_CMOV_REGISTERS * SizeOf(TSubRegister), 0);
        FillChar(ConstWriteSizes[0], first_int_imreg * SizeOf(TOpSize), 0);
        FillChar(ConstMovs[0], MAX_CMOV_REGISTERS * SizeOf(taicpu), 0);
        fInsertionPoint := p_initialjump;
        fCondition := nil;
        fInitialJump := p_initialjump;
        fFirstMovBlock := p_initialmov;
        fFirstMovBlockStop := nil;
        fSecondJump := nil;
        fSecondMovBlock := nil;
        fSecondMovBlockStop := nil;
        fMidLabel := nil;
        fSecondJump := nil;
        fSecondMovBlock := nil;
        fEndLabel := nil;
        fAllocationRange := nil;
        { Assume it all goes horribly wrong! }
        fState := tsInvalid;
        { Look backwards at the comparisons to get an accurate picture of register usage and a better position for any MOV const,reg insertions }
        if Optimizer.GetLastInstruction(p_initialjump, fCondition) and
           MatchInstruction(fCondition, [A_CMP, A_TEST, A_BSR, A_BSF, A_COMISS, A_COMISD, A_UCOMISS, A_UCOMISD, A_VCOMISS, A_VCOMISD, A_VUCOMISS, A_VUCOMISD], []) then
          begin
            { Mark all the registers in the comparison as 'in use', even if they've just been deallocated }
            for Count := 0 to 1 do
              with taicpu(fCondition).oper[Count]^ do
                case typ of
                  top_reg:
                    if getregtype(reg) = R_INTREGISTER then
                      Optimizer.IncludeRegInUsedRegs(reg, RegisterTracking);
                  top_ref:
                    begin
                      if
{$ifdef x86_64}
                         (ref^.base <> NR_RIP) and
{$endif x86_64}
                         (ref^.base <> NR_NO) then
                        Optimizer.IncludeRegInUsedRegs(ref^.base, RegisterTracking);
                      if (ref^.index <> NR_NO) then
                        Optimizer.IncludeRegInUsedRegs(ref^.index, RegisterTracking);
                    end
                  else
                    ;
                end;
            { When inserting instructions before hp_prev, try to insert them
              before the allocation of the FLAGS register }
            if not SetAndTest(Optimizer.FindRegAllocBackward(NR_DEFAULTFLAGS, tai(fCondition.Previous)), fInsertionPoint) or
               (tai_regalloc(fInsertionPoint).ratype = ra_dealloc) then
              { If not found, set it equal to the condition so it's something sensible }
              fInsertionPoint := fCondition;
            { When dealing with a comparison against zero, take note of the
              instruction before it to see if we can move instructions further
              back in order to benefit PostPeepholeOptTestOr. }
            if (
                 (
                   (taicpu(fCondition).opcode = A_CMP) and
                   MatchOperand(taicpu(fCondition).oper[0]^, 0)
                 ) or
                 (
                   (taicpu(fCondition).opcode = A_TEST) and
                   (
                     Optimizer.OpsEqual(taicpu(fCondition).oper[0]^, taicpu(fCondition).oper[1]^) or
                     MatchOperand(taicpu(fCondition).oper[0]^, -1)
                   )
                 )
               ) and
               Optimizer.GetLastInstruction(fCondition, hp1) then
              begin
                { These instructions set the zero flag if the result is zero }
                if MatchInstruction(hp1, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) then
                  begin
                    fInsertionPoint := hp1;
                    { Also mark all the registers in this previous instruction
                      as 'in use', even if they've just been deallocated }
                    for Count := 0 to 1 do
                      with taicpu(hp1).oper[Count]^ do
                        case typ of
                          top_reg:
                            if getregtype(reg) = R_INTREGISTER then
                              Optimizer.IncludeRegInUsedRegs(reg, RegisterTracking);
                          top_ref:
                            begin
                              if
{$ifdef x86_64}
                                 (ref^.base <> NR_RIP) and
{$endif x86_64}
                                 (ref^.base <> NR_NO) then
                                Optimizer.IncludeRegInUsedRegs(ref^.base, RegisterTracking);
                              if (ref^.index <> NR_NO) then
                                Optimizer.IncludeRegInUsedRegs(ref^.index, RegisterTracking);
                            end
                          else
                            ;
                        end;
                  end;
              end;
          end
        else
          fCondition := nil;
        { When inserting instructions, try to insert them before the allocation of the FLAGS register }
        if SetAndTest(Optimizer.FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p_initialjump.Previous)), hp1) and
           (tai_regalloc(hp1).ratype <> ra_dealloc) then
          fInsertionPoint := hp1;
        hp1 := p_initialmov;
        if not InitialiseBlock(p_initialmov, p_initialjump, fFirstMovBlockStop, fSecondJump) then
          Exit;
        hp1 := fFirstMovBlockStop; { Will either be on a label or a jump }
        if (hp1.typ <> ait_label) then { should be on a jump }
          begin
            if not Optimizer.GetNextInstruction(hp1, fMidLabel) or not (fMidLabel.typ = ait_label) then
              { Need a label afterwards }
              Exit;
          end
        else
          fMidLabel := hp1;
        if tai_label(fMidLabel).labsym <> AFirstLabel then
          { Not the correct label }
          fMidLabel := nil;
        if not Assigned(fSecondJump) and not Assigned(fMidLabel) then
          { If there's neither a 2nd jump nor correct label, then it's invalid
            (see above table) }
          Exit;
        { Analyse the first block of MOVs more closely }
        CMOVScore := AnalyseMOVBlock(fFirstMovBlock, fFirstMovBlockStop, fInsertionPoint);
        if Assigned(fSecondJump) then
          begin
            if (JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol = AFirstLabel) then
              fState := tsDetour
            else
              begin
                { Need the correct mid-label for this one }
                if not Assigned(fMidLabel) then
                  Exit;
                fState := tsBranching;
              end;
          end
        else
          { No jump, but mid-label is present }
          fState := tsSimple;
        if (CMOVScore > MAX_CMOV_INSTRUCTIONS) or (CMOVScore <= 0) then
          begin
            { Invalid or too many instructions to be worthwhile }
            fState := tsInvalid;
            Exit;
          end;
        { Check further for:
            jCC xxx
            <several movs 1>
            jmp yyy
          xxx:
            <several movs 2>
          yyy:
          etc. }
        if (fState = tsBranching) and
           { Estimate for required savings for extra jump }
           (CMOVScore <= MAX_CMOV_INSTRUCTIONS - 1) and
           { Only one reference is allowed for double blocks }
           (AFirstLabel.getrefs = 1) then
          begin
            Optimizer.GetNextInstruction(fMidLabel, hp1);
            fSecondMovBlock := hp1;
            if not InitialiseBlock(fSecondMovBlock, fMidLabel, fSecondMovBlockStop, fThirdJump) then
              begin
                EvaluateBranchingType;
                Exit;
              end;
            hp1 := fSecondMovBlockStop; { Will either be on a label or a jump }
            if (hp1.typ <> ait_label) then { should be on a jump }
              begin
                if not Optimizer.GetNextInstruction(hp1, fEndLabel) or not (fEndLabel.typ = ait_label) then
                  begin
                    { Need a label afterwards }
                    EvaluateBranchingType;
                    Exit;
                  end;
              end
            else
              fEndLabel := hp1;
            if tai_label(fEndLabel).labsym <> JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol then
              { Second jump doesn't go to the end }
              fEndLabel := nil;
            if not Assigned(fThirdJump) and not Assigned(fEndLabel) then
              begin
                { If there's neither a 3rd jump nor correct end label, then it's
                  not a valid double block, but it is a valid single branching
                  block (see above table) }
                EvaluateBranchingType;
                Exit;
              end;
            Count := AnalyseMOVBlock(fSecondMovBlock, fSecondMovBlockStop, fMidLabel);
            if (Count > MAX_CMOV_INSTRUCTIONS) or (Count <= 0) then
              { Invalid or too many instructions to be worthwhile }
              Exit;
            Inc(CMOVScore, Count);
            if Assigned(fThirdJump) then
              begin
                if not Assigned(fSecondJump) then
                  fState := tsDoubleSecondBranching
                else if (JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol = JumpTargetOp(taicpu(fThirdJump))^.ref^.symbol) then
                  fState := tsDoubleBranchSame
                else
                  fState := tsDoubleBranchDifferent;
              end
            else
              fState := tsDouble;
          end;
        if fState = tsBranching then
          EvaluateBranchingType;
      end;
    { Tries to convert a mov const,%reg instruction into a CMOV by reserving a
      new register to store the constant }
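    { Illustrative example (assumed; %r10d is a hypothetical spare register):
        "mov $5,%eax" inside the block can't become "cmov $5,%eax" because
      CMOV has no immediate form, so the constant is given a register:
        mov    $5,%r10d      <- hoisted to before the comparison
        cmovCC %r10d,%eax
    }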
  12672. function TCMOVTracking.TryCMOVConst(p, start, stop: tai; var Count: LongInt): Boolean;
  12673. var
  12674. RegSize: TSubRegister;
  12675. CurrentVal: TCGInt;
  12676. ANewReg: TRegister;
  12677. X: ShortInt;
  12678. begin
  12679. Result := False;
  12680. if not MatchOpType(taicpu(p), top_const, top_reg) then
  12681. Exit;
  12682. if ConstCount >= MAX_CMOV_REGISTERS then
  12683. { Arrays are full }
  12684. Exit;
  12685. { Remember that CMOV can't encode 8-bit registers }
  12686. case taicpu(p).opsize of
  12687. S_W:
  12688. RegSize := R_SUBW;
  12689. S_L:
  12690. RegSize := R_SUBD;
  12691. {$ifdef x86_64}
  12692. S_Q:
  12693. RegSize := R_SUBQ;
  12694. {$endif x86_64}
  12695. else
  12696. InternalError(2021100401);
  12697. end;
  12698. { See if the value has already been reserved for another CMOV instruction }
  12699. CurrentVal := taicpu(p).oper[0]^.val;
  12700. for X := 0 to ConstCount - 1 do
  12701. if ConstVals[X] = CurrentVal then
  12702. begin
  12703. ConstRegs[ConstCount] := ConstRegs[X];
  12704. ConstSizes[ConstCount] := RegSize;
  12705. ConstVals[ConstCount] := CurrentVal;
  12706. Inc(ConstCount);
  12707. Inc(Count);
  12708. Result := True;
  12709. Exit;
  12710. end;
  12711. ANewReg := fOptimizer.GetIntRegisterBetween(R_SUBWHOLE, RegisterTracking, start, stop, True);
  12712. if ANewReg = NR_NO then
  12713. { No free registers }
  12714. Exit;
  12715. { Reserve the register so subsequent TryCMOVConst calls don't all end
  12716. up vying for the same register }
  12717. fOptimizer.IncludeRegInUsedRegs(ANewReg, RegisterTracking);
  12718. ConstRegs[ConstCount] := ANewReg;
  12719. ConstSizes[ConstCount] := RegSize;
  12720. ConstVals[ConstCount] := CurrentVal;
  12721. Inc(ConstCount);
  12722. Inc(Count);
  12723. Result := True;
  12724. end;
  12725. destructor TCMOVTracking.Done;
  12726. begin
  12727. TAOptObj.ReleaseUsedRegs(RegisterTracking);
  12728. end;
procedure TCMOVTracking.Process(out new_p: tai);
  var
    Count, Writes: LongInt;
    RegMatch: Boolean;
    hp1, hp_new: tai;
    inverted_condition, condition: TAsmCond;
  begin
    if (fState in [tsInvalid, tsProcessed]) then
      InternalError(2023110701);

    { Repurpose RegisterTracking to mark registers that we've defined }
    RegisterTracking[R_INTREGISTER].Clear;

    Count := 0;
    Writes := 0;

    condition := taicpu(fInitialJump).condition;
    inverted_condition := inverse_cond(condition);

    { Exclude tsDoubleBranchDifferent from this check, as the second block
      doesn't get CMOVs in this case }
    if (fState in [tsDouble, tsDoubleBranchSame, tsDoubleSecondBranching]) then
      begin
        { Include the jump in the flag tracking }
        if Assigned(fThirdJump) then
          begin
            if (fState = tsDoubleBranchSame) then
              begin
                { Will be an unconditional jump, so track to the instruction before it }
                if not fOptimizer.GetLastInstruction(fThirdJump, hp1) then
                  InternalError(2023110710);
              end
            else
              hp1 := fThirdJump;
          end
        else
          hp1 := fSecondMovBlockStop;
      end
    else
      begin
        { Include a conditional jump in the flag tracking }
        if Assigned(fSecondJump) then
          begin
            if (fState = tsDetour) then
              begin
                { Will be an unconditional jump, so track to the instruction before it }
                if not fOptimizer.GetLastInstruction(fSecondJump, hp1) then
                  InternalError(2023110711);
              end
            else
              hp1 := fSecondJump;
          end
        else
          hp1 := fFirstMovBlockStop;
      end;

    fOptimizer.AllocRegBetween(NR_DEFAULTFLAGS, fInitialJump, hp1, fOptimizer.UsedRegs);

    { Process the second set of MOVs first, because if a destination
      register is shared between the first and second MOV sets, it is more
      efficient to turn the first one into a MOV instruction and place it
      before the CMP if possible, but we won't know which registers are
      shared until we've processed at least one list, so we might as well
      make it the second one since that won't be modified again. }
    if (fState in [tsDouble, tsDoubleBranchSame, tsDoubleBranchDifferent, tsDoubleSecondBranching]) then
      begin
        hp1 := fSecondMovBlock;
        repeat
          if not Assigned(hp1) then
            InternalError(2018062902);

          if (hp1.typ = ait_instruction) then
            begin
              { Extra safeguard }
              if (taicpu(hp1).opcode <> A_MOV) then
                InternalError(2018062903);

              { Note: tsDoubleBranchDifferent is essentially identical to
                tsBranching and the 2nd block is best left largely
                untouched, but we need to evaluate which registers the MOVs
                write to in order to track what would be complementary CMOV
                pairs that can be further optimised. [Kit] }
              if fState <> tsDoubleBranchDifferent then
                begin
                  if taicpu(hp1).oper[0]^.typ = top_const then
                    begin
                      RegMatch := False;

                      for Count := 0 to ConstCount - 1 do
                        if (ConstVals[Count] = taicpu(hp1).oper[0]^.val) and
                          (getsubreg(taicpu(hp1).oper[1]^.reg) = ConstSizes[Count]) then
                          begin
                            RegMatch := True;

                            { If it's in RegisterTracking, then this register
                              is being used more than once and hence has
                              already had its value defined (it gets added to
                              UsedRegs through AllocRegBetween below) }
                            if not RegisterTracking[R_INTREGISTER].IsUsed(ConstRegs[Count]) then
                              begin
                                hp_new := taicpu.op_const_reg(A_MOV, subreg2opsize(R_SUBWHOLE), taicpu(hp1).oper[0]^.val, ConstRegs[Count]);
                                taicpu(hp_new).fileinfo := taicpu(fInitialJump).fileinfo;

                                fOptimizer.asml.InsertBefore(hp_new, fInsertionPoint);
                                fOptimizer.IncludeRegInUsedRegs(ConstRegs[Count], RegisterTracking);

                                ConstMovs[Count] := hp_new;
                              end
                            else
                              { We just need an instruction between hp_prev and hp1
                                where we know the register is marked as in use }
                              hp_new := fSecondMovBlock;

                            { Keep track of largest write for this register so it can be optimised later }
                            if (getsubreg(taicpu(hp1).oper[1]^.reg) > ConstWriteSizes[getsupreg(ConstRegs[Count])]) then
                              ConstWriteSizes[getsupreg(ConstRegs[Count])] := getsubreg(taicpu(hp1).oper[1]^.reg);

                            fOptimizer.AllocRegBetween(ConstRegs[Count], hp_new, hp1, fOptimizer.UsedRegs);
                            taicpu(hp1).loadreg(0, newreg(R_INTREGISTER, getsupreg(ConstRegs[Count]), ConstSizes[Count]));
                            Break;
                          end;

                      if not RegMatch then
                        InternalError(2021100411);
                    end;

                  taicpu(hp1).opcode := A_CMOVcc;
                  taicpu(hp1).condition := condition;
                end;

              { Store these writes to search for duplicates later on }
              RegWrites[Writes] := taicpu(hp1).oper[1]^.reg;
              Inc(Writes);
            end;

          fOptimizer.GetNextInstruction(hp1, hp1);
        until (hp1 = fSecondMovBlockStop);
      end;

    { Now do the first set of MOVs }
    hp1 := fFirstMovBlock;
    repeat
      if not Assigned(hp1) then
        InternalError(2018062904);

      if (hp1.typ = ait_instruction) then
        begin
          RegMatch := False;

          { Extra safeguard }
          if (taicpu(hp1).opcode <> A_MOV) then
            InternalError(2018062905);

          { Search through the RegWrites list to see if there are any
            opposing CMOV pairs that write to the same register }
          for Count := 0 to Writes - 1 do
            if (RegWrites[Count] = taicpu(hp1).oper[1]^.reg) then
              begin
                { We have a match.  Keep this as a MOV }

                { Move ahead in preparation }
                fOptimizer.GetNextInstruction(hp1, hp1);

                RegMatch := True;
                Break;
              end;

          if RegMatch then
            Continue;

          if taicpu(hp1).oper[0]^.typ = top_const then
            begin
              for Count := 0 to ConstCount - 1 do
                if (ConstVals[Count] = taicpu(hp1).oper[0]^.val) and
                  (getsubreg(taicpu(hp1).oper[1]^.reg) = ConstSizes[Count]) then
                  begin
                    RegMatch := True;

                    { If it's in RegisterTracking, then this register is
                      being used more than once and hence has already had
                      its value defined (it gets added to UsedRegs through
                      AllocRegBetween below) }
                    if not RegisterTracking[R_INTREGISTER].IsUsed(ConstRegs[Count]) then
                      begin
                        hp_new := taicpu.op_const_reg(A_MOV, subreg2opsize(R_SUBWHOLE), taicpu(hp1).oper[0]^.val, ConstRegs[Count]);
                        taicpu(hp_new).fileinfo := taicpu(fInitialJump).fileinfo;

                        fOptimizer.asml.InsertBefore(hp_new, fInsertionPoint);
                        fOptimizer.IncludeRegInUsedRegs(ConstRegs[Count], RegisterTracking);

                        ConstMovs[Count] := hp_new;
                      end
                    else
                      { We just need an instruction between hp_prev and hp1
                        where we know the register is marked as in use }
                      hp_new := fFirstMovBlock;

                    { Keep track of largest write for this register so it can be optimised later }
                    if (getsubreg(taicpu(hp1).oper[1]^.reg) > ConstWriteSizes[getsupreg(ConstRegs[Count])]) then
                      ConstWriteSizes[getsupreg(ConstRegs[Count])] := getsubreg(taicpu(hp1).oper[1]^.reg);

                    fOptimizer.AllocRegBetween(ConstRegs[Count], hp_new, hp1, fOptimizer.UsedRegs);
                    taicpu(hp1).loadreg(0, newreg(R_INTREGISTER, getsupreg(ConstRegs[Count]), ConstSizes[Count]));
                    Break;
                  end;

              if not RegMatch then
                InternalError(2021100412);
            end;

          taicpu(hp1).opcode := A_CMOVcc;
          taicpu(hp1).condition := inverted_condition;

          if (fState = tsDoubleBranchDifferent) then
            begin
              { Store these writes to search for duplicates later on }
              RegWrites[Writes] := taicpu(hp1).oper[1]^.reg;
              Inc(Writes);
            end;
        end;

      fOptimizer.GetNextInstruction(hp1, hp1);
    until (hp1 = fFirstMovBlockStop);

    { Update initialisation MOVs to the smallest possible size }
    for Count := 0 to ConstCount - 1 do
      if Assigned(ConstMovs[Count]) then
        begin
          taicpu(ConstMovs[Count]).opsize := subreg2opsize(ConstWriteSizes[Word(ConstRegs[Count])]);
          setsubreg(taicpu(ConstMovs[Count]).oper[1]^.reg, ConstWriteSizes[Word(ConstRegs[Count])]);
        end;

    case fState of
      tsSimple:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Simple type)', fInitialJump);
          { No branch to delete }
        end;
      tsDetour:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Detour type)', fInitialJump);
          { Preserve jump }
        end;
      tsBranching, tsDoubleBranchDifferent:
        begin
          if (fState = tsBranching) then
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Branching type)', fInitialJump)
          else
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double branching (different) type)', fInitialJump);

          taicpu(fSecondJump).opcode := A_JCC;
          taicpu(fSecondJump).condition := inverted_condition;
        end;
      tsDouble, tsDoubleBranchSame:
        begin
          if (fState = tsDouble) then
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double type)', fInitialJump)
          else
            fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double branching (same) type)', fInitialJump);

          { Delete second jump }
          JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol.decrefs;
          fOptimizer.RemoveInstruction(fSecondJump);
        end;
      tsDoubleSecondBranching:
        begin
          fOptimizer.DebugMsg(SPeepholeOptimization + 'CMOV Block (Double, second branching type)', fInitialJump);

          { Delete second jump, preserve third jump as conditional }
          JumpTargetOp(taicpu(fSecondJump))^.ref^.symbol.decrefs;
          fOptimizer.RemoveInstruction(fSecondJump);

          taicpu(fThirdJump).opcode := A_JCC;
          taicpu(fThirdJump).condition := condition;
        end;
      else
        InternalError(2023110720);
    end;

    { Now we can safely decrement the reference count }
    tasmlabel(fLabel).decrefs;

    fOptimizer.UpdateUsedRegs(tai(fInitialJump.next));

    { Remove the original jump }
    fOptimizer.RemoveInstruction(fInitialJump); { Note, the choice to not use RemoveCurrentp is deliberate }

    new_p := fFirstMovBlock; { Appears immediately after the initial jump }

    fState := tsProcessed;
  end;
{$endif 8086}


function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  var
    hp1,hp2: tai;
    carryadd_opcode : TAsmOp;
    symbol: TAsmSymbol;
    increg, tmpreg: TRegister;
{$ifndef i8086}
    CMOVTracking: PCMOVTracking;
    hp3,hp4,hp5: tai;
{$endif i8086}
    TempBool: Boolean;
  begin
    if (aoc_DoPass2JccOpts in OptsToCheck) and
      DoJumpOptimizations(p, TempBool) then
      Exit(True);

    result:=false;
    if GetNextInstruction(p,hp1) then
      begin
        if (hp1.typ=ait_label) then
          begin
            Result := DoSETccLblRETOpt(p, tai_label(hp1));
            Exit;
          end
        else if (hp1.typ<>ait_instruction) then
          Exit;

        symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);

        if (
            (
              ((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
              MatchOptype(Taicpu(hp1),top_const,top_reg) and
              (Taicpu(hp1).oper[0]^.val=1)
            ) or
            ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
          ) and
          GetNextInstruction(hp1,hp2) and
          FindLabel(TAsmLabel(symbol), hp2) then
          { jb @@1                              cmc
            inc/dec operand            -->      adc/sbb operand,0
            @@1:

            ... and ...

            jnb @@1
            inc/dec operand            -->      adc/sbb operand,0
            @@1: }
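          { A concrete instance (illustrative):

                jb    @@1                   cmc
                incl  %eax          -->     adcl  $0,%eax
              @@1:

            "jb" jumps when CF=1, so the INC originally ran only when CF=0;
            after CMC inverts the carry, ADC adds exactly 1 in that case. }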
          begin
            if Taicpu(p).condition in [C_NAE,C_B,C_C] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011001);
                end;
                Taicpu(p).clearop(0);
                Taicpu(p).ops:=0;
                Taicpu(p).is_jmp:=false;
                Taicpu(p).opcode:=A_CMC;
                Taicpu(p).condition:=C_NONE;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                Taicpu(hp1).ops:=2;
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                result:=true;
                exit;
              end
            else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011002);
                end;
                Taicpu(hp1).ops:=2;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                RemoveCurrentP(p, hp1);
                result:=true;
                exit;
              end
            {
              jcc @@1                             setcc   tmpreg
              inc/dec/add/sub operand     ->      (movzx  tmpreg)
              @@1:                                add/sub tmpreg,operand

              While this increases code size slightly, it makes the code much faster if the
              jump is unpredictable
            }
            else if not(cs_opt_size in current_settings.optimizerswitches) then
              begin
                { search for an available register which is volatile }
                increg := GetIntRegisterBetween(R_SUBL, UsedRegs, p, hp1);
                if increg <> NR_NO then
                  begin
                    { We don't need to check if tmpreg is in hp1 or not, because
                      it will be marked as in use at p (if not, this is
                      indicative of a compiler bug). }
                    TAsmLabel(symbol).decrefs;
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=1;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_SETcc;
                    DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                    Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                    Taicpu(p).loadreg(0,increg);

                    if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                      begin
                        case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                          R_SUBW:
                            begin
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBW);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,increg,tmpreg);
                            end;
                          R_SUBD:
                            begin
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBD);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
                            end;
{$ifdef x86_64}
                          R_SUBQ:
                            begin
                              { MOVZX doesn't have a 64-bit variant, because
                                the 32-bit version implicitly zeroes the
                                upper 32-bits of the destination register }
                              tmpreg := newreg(R_INTREGISTER,getsupreg(increg),R_SUBD);
                              hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
                              setsubreg(tmpreg, R_SUBQ);
                            end;
{$endif x86_64}
                          else
                            Internalerror(2020030601);
                        end;
                        taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                        asml.InsertAfter(hp2,p);
                      end
                    else
                      tmpreg := increg;

                    if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                      begin
                        Taicpu(hp1).ops:=2;
                        Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                      end;
                    Taicpu(hp1).loadreg(0,tmpreg);
                    AllocRegBetween(tmpreg,p,hp1,UsedRegs);
                    Result := True;

                    { p is no longer a Jcc instruction, so exit }
                    Exit;
                  end;
              end;
          end;

        { Detect the following:
            jmp<cond>     @Lbl1
            jmp           @Lbl2
            ...
          @Lbl1:
            ret

          Change to:
            jmp<inv_cond> @Lbl2
            ret
        }
        if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2:=getlabelwithsym(TAsmLabel(symbol));
            if Assigned(hp2) and SkipLabels(hp2,hp2) and
              MatchInstruction(hp2,A_RET,[S_NO]) then
              begin
                taicpu(p).condition := inverse_cond(taicpu(p).condition);

                { Change label address to that of the unconditional jump }
                taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);

                TAsmLabel(symbol).DecRefs;
                taicpu(hp1).opcode := A_RET;
                taicpu(hp1).is_jmp := false;
                taicpu(hp1).ops := taicpu(hp2).ops;
                DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                case taicpu(hp2).ops of
                  0:
                    taicpu(hp1).clearop(0);
                  1:
                    taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                  else
                    internalerror(2016041302);
                end;
              end;
{$ifndef i8086}
          end
        {
          convert
            j<c>  .L1
            mov   1,reg
            jmp   .L2
          .L1
            mov   0,reg
          .L2

          into
            mov   0,reg
            set<not(c)> reg

          take care of alignment and that the mov 0,reg is not converted into a xor as this
          would destroy the flag contents
        }
        else if MatchInstruction(hp1,A_MOV,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
{$ifdef i386}
          (
            { Under i386, ESI, EDI, EBP and ESP
              don't have an 8-bit representation }
            not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
          ) and
{$endif i386}
          (taicpu(hp1).oper[0]^.val=1) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
          GetNextInstruction(hp2,hp3) and
          (hp3.typ=ait_label) and
          (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
          (tai_label(hp3).labsym.getrefs=1) and
          GetNextInstruction(hp3,hp4) and
          MatchInstruction(hp4,A_MOV,[]) and
          MatchOpType(taicpu(hp4),top_const,top_reg) and
          (taicpu(hp4).oper[0]^.val=0) and
          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
          GetNextInstruction(hp4,hp5) and
          (hp5.typ=ait_label) and
          (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) and
          (tai_label(hp5).labsym.getrefs=1) then
          begin
            AllocRegBetween(NR_FLAGS,p,hp4,UsedRegs);
            DebugMsg(SPeepholeOptimization+'JccMovJmpMov2MovSetcc',p);

            { remove last label }
            RemoveInstruction(hp5);

            { remove second label }
            RemoveInstruction(hp3);

            { remove jmp }
            RemoveInstruction(hp2);

            if taicpu(hp1).opsize=S_B then
              RemoveInstruction(hp1)
            else
              taicpu(hp1).loadconst(0,0);

            taicpu(hp4).opcode:=A_SETcc;
            taicpu(hp4).opsize:=S_B;
            taicpu(hp4).condition:=inverse_cond(taicpu(p).condition);
            taicpu(hp4).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(hp4).oper[1]^.reg),R_SUBL));
            taicpu(hp4).opercnt:=1;
            taicpu(hp4).ops:=1;
            taicpu(hp4).freeop(1);

            RemoveCurrentP(p);
            Result:=true;
            exit;
          end
        else if (CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
          MatchInstruction(hp1,A_MOV,[S_W,S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
          begin
            { check for
                jCC   xxx
                <several movs>
                xxx:

              Also spot:
                Jcc   xxx
                <several movs>
                jmp   xxx

              Change to:
                <several cmovs with inverted condition>
                jmp   xxx  (only for the 2nd case)
            }
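            { e.g. (illustrative, registers arbitrary):

                  je    @lbl                 cmovnel %edx,%eax
                  movl  %edx,%eax    -->     cmovnel %ecx,%ebx
                  movl  %ecx,%ebx
                @lbl:
            }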
            CMOVTracking := New(PCMOVTracking, Init(Self, p, hp1, TAsmLabel(symbol)));

            if CMOVTracking^.State <> tsInvalid then
              begin
                CMovTracking^.Process(p);
                Result := True;
              end;

            CMOVTracking^.Done;
{$endif i8086}
          end;
      end;
  end;


function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  var
    hp1,hp2,hp3: tai;
    reg_and_hp1_is_instr, RegUsed, AndTest: Boolean;
    NewSize: TOpSize;
    NewRegSize: TSubRegister;
    Limit: TCgInt;
    SwapOper: POper;
  begin
    result:=false;
    reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
      GetNextInstruction(p,hp1) and
      (hp1.typ = ait_instruction);

    if reg_and_hp1_is_instr and
      (
        (taicpu(hp1).opcode <> A_LEA) or
        { If the LEA instruction can be converted into an arithmetic instruction,
          it may be possible to then fold it. }
        (
          { If the flags register is in use, don't change the instruction
            to an ADD otherwise this will scramble the flags. [Kit] }
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
          ConvertLEA(taicpu(hp1))
        )
      ) and
      IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
      GetNextInstruction(hp1,hp2) and
      MatchInstruction(hp2,A_MOV,[]) and
      (taicpu(hp2).oper[0]^.typ = top_reg) and
      OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
      ((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B) or
       (taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W)) and
{$ifdef i386}
      { not all registers have byte size sub registers on i386 }
      ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
      (((taicpu(hp1).ops=2) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
       ((taicpu(hp1).ops=1) and
        (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
      not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
      begin
        { change movsX/movzX reg/ref, reg2
                 add/sub/or/... reg3/$const, reg2
                 mov reg2 reg/ref
          to     add/sub/or/... reg3/$const, reg/ref }

        { by example:
            movswl  %si,%eax        movswl  %si,%eax      p
            decl    %eax            addl    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %eax            addw    %edx,%eax     hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        taicpu(hp1).changeopsize(taicpu(hp2).opsize);
        {
          ->
            movswl  %si,%eax        movswl  %si,%eax      p
            decw    %si             addw    %dx,%si       hp1
            movw    %ax,%si         movw    %ax,%si       hp2
        }
        case taicpu(hp1).ops of
          1:
            taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
          2:
            begin
              taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
              if (taicpu(hp1).oper[0]^.typ = top_reg) then
                setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
            end;
          else
            internalerror(2008042702);
        end;
        {
          ->
            decw    %si             addw    %dx,%si       p
        }
        DebugMsg(SPeepholeOptimization + 'var3',p);
        RemoveCurrentP(p, hp1);
        RemoveInstruction(hp2);
        Result := True;
        Exit;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(hp1).opcode = A_MOV) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
{$ifdef x86_64}
       { check for implicit extension to 64 bit }
       or
       ((taicpu(p).opsize in [S_BL,S_WL]) and
        (taicpu(hp1).opsize=S_Q) and
        SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
       )
{$endif x86_64}
      )
      then
      begin
        { change
            movx    %reg1,%reg2
            mov     %reg2,%reg3
            dealloc %reg2

            into

            movx    %reg,%reg3
        }
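
        { e.g. "movzbl %al,%edx; movl %edx,%ecx" with %edx subsequently
          unused becomes "movzbl %al,%ecx" (illustrative registers) }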
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
{$ifdef x86_64}
            if (taicpu(p).opsize in [S_BL,S_WL]) and
              (taicpu(hp1).opsize=S_Q) then
              taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
            else
{$endif x86_64}
              taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
            RemoveInstruction(hp1);
            Result := True;
            Exit;
          end;
      end;

    if reg_and_hp1_is_instr and
      ((taicpu(hp1).opcode=A_MOV) or
       (taicpu(hp1).opcode=A_ADD) or
       (taicpu(hp1).opcode=A_SUB) or
       (taicpu(hp1).opcode=A_CMP) or
       (taicpu(hp1).opcode=A_OR) or
       (taicpu(hp1).opcode=A_XOR) or
       (taicpu(hp1).opcode=A_AND)
      ) and
      (taicpu(hp1).oper[1]^.typ = top_reg) then
      begin
        AndTest := (taicpu(hp1).opcode=A_AND) and
          GetNextInstruction(hp1, hp2) and
          (hp2.typ = ait_instruction) and
          (
            (
              (taicpu(hp2).opcode=A_TEST) and
              (
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[1]^.reg) or
                MatchOperand(taicpu(hp2).oper[0]^, -1) or
                (
                  { If the AND and TEST instructions share a constant, this is also valid }
                  (taicpu(hp1).oper[0]^.typ = top_const) and
                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[0]^.val)
                )
              ) and
              MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
            ) or
            (
              (taicpu(hp2).opcode=A_CMP) and
              MatchOperand(taicpu(hp2).oper[0]^, 0) and
              MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
            )
          );

        { change
            movx    (oper),%reg2
            and     $x,%reg2
            test    %reg2,%reg2
            dealloc %reg2

            into

            test    $x,(oper)
            if the constant fits within the smaller size
        }
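
        { e.g. (illustrative, assuming %eax is deallocated afterwards):

              movzbl (ref),%eax
              andl   $0x40,%eax      -->     testb  $0x40,(ref)
              testl  %eax,%eax               je     @lbl
              je     @lbl
        }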
        if ((taicpu(p).oper[0]^.typ=top_reg) or
          ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
          (taicpu(hp1).oper[0]^.typ = top_const) and
          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
          AndTest then
          begin
            { Check if the AND constant is in range }
            case taicpu(p).opsize of
              S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                begin
                  NewSize := S_B;
                  Limit := $FF;
                end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                begin
                  NewSize := S_W;
                  Limit := $FFFF;
                end;
{$ifdef x86_64}
              S_LQ:
                begin
                  NewSize := S_L;
                  Limit := $FFFFFFFF;
                end;
{$endif x86_64}
              else
                InternalError(2021120303);
            end;

            if (
                ((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val) or
                { Check for negative operands }
                (((not taicpu(hp1).oper[0]^.val) and Limit) = (not taicpu(hp1).oper[0]^.val))
              ) and
              GetNextInstruction(hp2,hp3) and
              MatchInstruction(hp3,A_Jcc,A_Setcc,A_CMOVcc,[]) and
              (taicpu(hp3).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovxAndTest2Test done',p);
                    taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
                    taicpu(hp1).opcode := A_TEST;
                    taicpu(hp1).opsize := NewSize;
                    RemoveInstruction(hp2);
                    RemoveCurrentP(p, hp1);
                    Result:=true;
                    exit;
                  end;
              end;
          end;

        if (taicpu(hp1).oper[0]^.typ = top_reg) and
          (((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
            (taicpu(hp1).opsize=S_B)) or
           ((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
            (taicpu(hp1).opsize=S_W))
{$ifdef x86_64}
           or ((taicpu(p).opsize=S_LQ) and
            (taicpu(hp1).opsize=S_L))
{$endif x86_64}
          ) and
          SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
          begin
            { change
                movx    %reg1,%reg2
                op      %reg2,%reg3
                dealloc %reg2

                into

                op      %reg1,%reg3
                if the second op accesses only the bits stored in reg1
            }
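
            { e.g. "movzbl %al,%edx; andb %dl,%bl" with %edx subsequently
              unused becomes "andb %al,%bl" (illustrative registers) }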
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if AndTest then
              begin
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
              end
            else
              RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);

            if not RegUsed then
              begin
                DebugMsg(SPeepholeOptimization + 'MovxOp2Op 1',p);
                if taicpu(p).oper[0]^.typ=top_reg then
                  begin
                    case taicpu(hp1).opsize of
                      S_B:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
                      S_W:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
                      S_L:
                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
                      else
                        Internalerror(2020102301);
                    end;
                    AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
                  end
                else
                  taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
                RemoveCurrentP(p);
                if AndTest then
                  RemoveInstruction(hp2);
                result:=true;
                exit;
              end;
          end
        else if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
          (
            { Bitwise operations only }
            (taicpu(hp1).opcode=A_AND) or
            (taicpu(hp1).opcode=A_TEST) or
            (
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (
                (taicpu(hp1).opcode=A_OR) or
                (taicpu(hp1).opcode=A_XOR)
              )
            )
          ) and
          (
            (taicpu(hp1).oper[0]^.typ = top_const) or
            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
            not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
          ) then
          begin
            { change
                movx    %reg2,%reg2
                op      const,%reg2

                into

                op      const,%reg2 (smaller version)
                movx    %reg2,%reg2

              also change
                movx    %reg1,%reg2
                and/test (oper),%reg2
                dealloc %reg2

                into

                and/test (oper),%reg1
            }
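
            { e.g. (illustrative):
                movzbl %al,%eax; andl $0x0f,%eax
                  -->  andb $0x0f,%al; movzbl %al,%eax
              and, when the target register is deallocated afterwards:
                movzbl %cl,%eax; testl $1,%eax
                  -->  testb $1,%cl }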
            case taicpu(p).opsize of
              S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                begin
                  NewSize := S_B;
                  NewRegSize := R_SUBL;
                  Limit := $FF;
                end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                begin
                  NewSize := S_W;
                  NewRegSize := R_SUBW;
                  Limit := $FFFF;
                end;
{$ifdef x86_64}
              S_LQ:
                begin
                  NewSize := S_L;
                  NewRegSize := R_SUBD;
                  Limit := $FFFFFFFF;
                end;
{$endif x86_64}
              else
                Internalerror(2021120302);
            end;

            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if AndTest then
              begin
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
              end
            else
              RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);

            if
              (
                (taicpu(p).opcode = A_MOVZX) and
                (
                  (taicpu(hp1).opcode=A_AND) or
                  (taicpu(hp1).opcode=A_TEST)
                ) and
                not (
                  { If both are references, then the final instruction will have
                    both operands as references, which is not allowed }
                  (taicpu(p).oper[0]^.typ = top_ref) and
                  (taicpu(hp1).oper[0]^.typ = top_ref)
                ) and
                not RegUsed
              ) or
              (
                (
                  SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) or
                  not RegUsed
                ) and
                (taicpu(p).oper[0]^.typ = top_reg) and
                SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.typ = top_const) and
                ((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val)
              ) then
              begin
{$if defined(i386) or defined(i8086)}
                { If the target size is 8-bit, make sure we can actually encode it }
                if (NewRegSize = R_SUBL) and (taicpu(hp1).oper[0]^.typ = top_reg) and not (GetSupReg(taicpu(hp1).oper[0]^.reg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
                  Exit;
{$endif i386 or i8086}

                DebugMsg(SPeepholeOptimization + 'MovxOp2Op 2',p);

                taicpu(hp1).opsize := NewSize;
                taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
                if AndTest then
                  begin
                    RemoveInstruction(hp2);
                    if not RegUsed then
                      begin
                        taicpu(hp1).opcode := A_TEST;
                        if (taicpu(hp1).oper[0]^.typ = top_ref) then
                          begin
                            { Make sure the reference is the second operand }
                            SwapOper := taicpu(hp1).oper[0];
                            taicpu(hp1).oper[0] := taicpu(hp1).oper[1];
                            taicpu(hp1).oper[1] := SwapOper;
                          end;
                      end;
                  end;

                case taicpu(hp1).oper[0]^.typ of
                  top_reg:
                    setsubreg(taicpu(hp1).oper[0]^.reg, NewRegSize);
                  top_const:
                    { For the AND/TEST case }
                    taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and Limit;
                  else
                    ;
                end;

                if RegUsed then
                  begin
                    AsmL.Remove(p);
                    AsmL.InsertAfter(p, hp1);
                    p := hp1;
                  end
                else
                  RemoveCurrentP(p, hp1);

                result:=true;
                exit;
              end;
          end;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (
        (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
      ) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      { Minimum shift value allowed is the bit difference between the sizes }
      (taicpu(hp1).oper[0]^.val >=
        { Multiply by 8 because tcgsize2size returns bytes, not bits }
        8 * (
          tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
          tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
        )
      ) then
      begin
        { For:
            movsx/movzx %reg1,%reg1 (same register, just different sizes)
            shl/sal     ##,%reg1

          Remove the movsx/movzx instruction if the shift overwrites the
          extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax)
        }
        DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
        RemoveCurrentP(p, hp1);
        Result := True;
        Exit;
      end
    else if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (
        ((taicpu(hp1).opcode = A_SHR) and (taicpu(p).opcode = A_MOVZX)) or
        ((taicpu(hp1).opcode = A_SAR) and (taicpu(p).opcode <> A_MOVZX))
      ) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      { Minimum shift value allowed is the bit size of the smallest register - 1 }
      (taicpu(hp1).oper[0]^.val <
        { Multiply by 8 because tcgsize2size returns bytes, not bits }
        8 * (
          tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
        )
      ) then
      begin
        { For:
            movsx %reg1,%reg1       movzx %reg1,%reg1 (same register, just different sizes)
            sar   ##,%reg1          shr   ##,%reg1

          Move the shift to before the movx instruction if the shift value
          is not too large.
        }
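
        { e.g. "movzwl %ax,%eax; shrl $5,%eax" becomes
          "shrw $5,%ax; movzwl %ax,%eax" (illustrative) }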
        asml.Remove(hp1);
        asml.InsertBefore(hp1, p);

        taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;

        case taicpu(p).opsize of
          S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif}:
            taicpu(hp1).opsize := S_B;
          S_WL{$ifdef x86_64}, S_WQ{$endif}:
            taicpu(hp1).opsize := S_W;
{$ifdef x86_64}
          S_LQ:
            taicpu(hp1).opsize := S_L;
{$endif}
          else
            InternalError(2020112401);
        end;

        if (taicpu(hp1).opcode = A_SHR) then
          DebugMsg(SPeepholeOptimization + 'MovzShr2ShrMovz', hp1)
        else
          DebugMsg(SPeepholeOptimization + 'MovsSar2SarMovs', hp1);

        Result := True;
      end;

    if reg_and_hp1_is_instr and
      (taicpu(p).oper[0]^.typ = top_reg) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      (
        (taicpu(hp1).opcode = taicpu(p).opcode)
        or ((taicpu(p).opcode = A_MOVZX) and ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}))
{$ifdef x86_64}
        or ((taicpu(p).opcode = A_MOVSX) and (taicpu(hp1).opcode = A_MOVSXD))
{$endif x86_64}
      ) then
      begin
        if MatchOpType(taicpu(hp1), top_reg, top_reg) and
          (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[0]^.reg) and
          SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
          begin
            {
              For example:
                movzbw %al,%ax
                movzwl %ax,%eax

              Compress into:
                movzbl %al,%eax
            }
            RegUsed := False;
            case taicpu(p).opsize of
              S_BW:
                case taicpu(hp1).opsize of
                  S_WL:
                    begin
                      taicpu(p).opsize := S_BL;
                      RegUsed := True;
                    end;
{$ifdef x86_64}
                  S_WQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_BL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_BQ;
                      RegUsed := True;
                    end;
{$endif x86_64}
                  else
                    ;
                end;
{$ifdef x86_64}
              S_BL:
                case taicpu(hp1).opsize of
                  S_LQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_BL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_BQ;
                      RegUsed := True;
                    end;
                  else
                    ;
                end;
              S_WL:
                case taicpu(hp1).opsize of
                  S_LQ:
                    begin
                      if taicpu(p).opcode = A_MOVZX then
                        begin
                          taicpu(p).opsize := S_WL;
                          { 64-bit zero extension is implicit, so change to the 32-bit register }
                          setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
                        end
                      else
                        taicpu(p).opsize := S_WQ;
                      RegUsed := True;
                    end;
                  else
                    ;
                end;
{$endif x86_64}
              else
                ;
            end;

            if RegUsed then
              begin
                DebugMsg(SPeepholeOptimization + 'MovxMovx2Movx', p);
                taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
                RemoveInstruction(hp1);
                Result := True;
                Exit;
              end;
          end;

        if (taicpu(hp1).opsize = taicpu(p).opsize) and
          not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, [A_AND, A_OR, A_XOR, A_TEST], []) and
          (
            ((taicpu(hp2).opsize = S_W) and (taicpu(p).opsize = S_BW)) or
            ((taicpu(hp2).opsize = S_L) and (taicpu(p).opsize in [S_BL, S_WL]))
{$ifdef x86_64}
            or ((taicpu(hp2).opsize = S_Q) and (taicpu(p).opsize in [S_BL, S_BQ, S_WL, S_WQ, S_LQ]))
{$endif x86_64}
          ) and
          MatchOpType(taicpu(hp2), top_reg, top_reg) and
          (
            (
              (taicpu(hp2).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
              (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg)
            ) or
            (
              { Only allow the operands in reverse order for TEST instructions }
              (taicpu(hp2).opcode = A_TEST) and
              (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
              (taicpu(hp2).oper[1]^.reg = taicpu(hp1).oper[1]^.reg)
            )
          ) then
          begin
            {
              For example:
                movzbl %al,%eax
                movzbl (ref),%edx
                andl   %edx,%eax
                (%edx deallocated)

              Change to:
                andb   (ref),%al
                movzbl %al,%eax

              Rules are:
                - First two instructions have the same opcode and opsize
                - First instruction's operands are the same super-register
                - Second instruction operates on a different register
                - Third instruction is AND, OR, XOR or TEST
                - Third instruction's operands are the destination registers of the first two instructions
                - Third instruction writes to the destination register of the first instruction (except with TEST)
                - Second instruction's destination register is deallocated afterwards
            }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));

            if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs) then
              begin
                case taicpu(p).opsize of
                  S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
                    NewSize := S_B;
                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                    NewSize := S_W;
{$ifdef x86_64}
                  S_LQ:
                    NewSize := S_L;
{$endif x86_64}
                  else
                    InternalError(2021120301);
                end;

                taicpu(hp2).loadoper(0, taicpu(hp1).oper[0]^);
                taicpu(hp2).loadreg(1, taicpu(p).oper[0]^.reg);
                taicpu(hp2).opsize := NewSize;

                RemoveInstruction(hp1);

                { With TEST, it's best to keep the MOVX instruction at the top }
                if (taicpu(hp2).opcode <> A_TEST) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovxMovxTest2MovxTest', p);
                    asml.Remove(p);
                    { If the third instruction uses the flags, the MOVX instruction won't modify them }
                    asml.InsertAfter(p, hp2);
                    p := hp2;
                  end
                else
                  DebugMsg(SPeepholeOptimization + 'MovxMovxOp2OpMovx', p);

                Result := True;
                Exit;
              end;
          end;
      end;

    if taicpu(p).opcode=A_MOVZX then
      begin
        { removes superfluous And's after movzx's }
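        { e.g. "movzbl %al,%ebx; andl $0xff,%ebx" - the AND is a no-op and
          is removed (illustrative registers) }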
        if reg_and_hp1_is_instr and
          (taicpu(hp1).opcode = A_AND) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          ((taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)
{$ifdef x86_64}
           { check for implicit extension to 64 bit }
           or
           ((taicpu(p).opsize in [S_BL,S_WL]) and
            (taicpu(hp1).opsize=S_Q) and
            SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg)
           )
{$endif x86_64}
          )
          then
          begin
            case taicpu(p).opsize Of
              S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz1',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
              S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                if (taicpu(hp1).oper[0]^.val = $ffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz2',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
{$ifdef x86_64}
              S_LQ:
                if (taicpu(hp1).oper[0]^.val = $ffffffff) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz3',p);
                    RemoveInstruction(hp1);
                    Result:=true;
                    exit;
                  end;
{$endif x86_64}
              else
                ;
            end;

            { we cannot get rid of the and, but can we get rid of the movz? }
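            { e.g. "movzbl %al,%eax; andl $0x7f,%eax" - the mask already
              clears the upper bits, so only "andl $0x7f,%eax" remains
              (illustrative) }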
            if SuperRegistersEqual(taicpu(p).oper[0]^.reg,taicpu(p).oper[1]^.reg) then
              begin
                case taicpu(p).opsize Of
                  S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val and $ff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And1',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
                    if (taicpu(hp1).oper[0]^.val and $ffff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And2',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
{$ifdef x86_64}
                  S_LQ:
                    if (taicpu(hp1).oper[0]^.val and $ffffffff)=taicpu(hp1).oper[0]^.val then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovzAnd2And3',p);
                        RemoveCurrentP(p,hp1);
                        Result:=true;
                        exit;
                      end;
{$endif x86_64}
                  else
                    ;
                end;
              end;
          end;

        { changes some movzx constructs to faster synonyms (all examples
          are given with eax/ax, but are also valid for other registers) }
        if MatchOpType(taicpu(p),top_reg,top_reg) then
          begin
            case taicpu(p).opsize of
              { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
                (the machine code is equivalent to movzbl %al,%eax), but the
                code generator still generates that assembler instruction and
                it is silently converted.  This should probably be checked.
                [Kit] }
              S_BW:
                begin
                  if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
                    (
                      not IsMOVZXAcceptable
                      { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
                      or (
                        (cs_opt_size in current_settings.optimizerswitches) and
                        (taicpu(p).oper[1]^.reg = NR_AX)
                      )
                    ) then
                    { Change "movzbw %al, %ax" to "andw $0x0ffh, %ax" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var7',p);
                      taicpu(p).opcode := A_AND;
                      taicpu(p).changeopsize(S_W);
                      taicpu(p).loadConst(0,$ff);
                      Result := True;
                    end
                  else if not IsMOVZXAcceptable and
                    GetNextInstruction(p, hp1) and
                    (tai(hp1).typ = ait_instruction) and
                    (taicpu(hp1).opcode = A_AND) and
                    MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                    { Change "movzbw %reg1, %reg2; andw $const, %reg2"
                      to "movw %reg1, reg2; andw $(const1 and $ff), %reg2" }
                    begin
                      DebugMsg(SPeepholeOptimization + 'var8',p);
                      taicpu(p).opcode := A_MOV;
                      taicpu(p).changeopsize(S_W);
                      setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
                      taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                      Result := True;
                    end;
                end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax) }
              S_BL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var9',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                      (tai(hp1).typ = ait_instruction) and
                      (taicpu(hp1).opcode = A_AND) and
                      MatchOpType(taicpu(hp1),top_const,top_reg) and
                      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzbl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, reg2; andl $(const1 and $ff), %reg2" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var10',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Result := True;
                      end;
                  end;
{$endif i8086}
              S_WL:
                if not IsMOVZXAcceptable then
                  begin
                    if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
                      { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var11',p);
                        taicpu(p).opcode := A_AND;
                        taicpu(p).changeopsize(S_L);
                        taicpu(p).loadConst(0,$ffff);
                        Result := True;
                      end
                    else if GetNextInstruction(p, hp1) and
                      (tai(hp1).typ = ait_instruction) and
                      (taicpu(hp1).opcode = A_AND) and
                      (taicpu(hp1).oper[0]^.typ = top_const) and
                      (taicpu(hp1).oper[1]^.typ = top_reg) and
                      (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                      { Change "movzwl %reg1, %reg2; andl $const, %reg2"
                        to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2" }
                      begin
                        DebugMsg(SPeepholeOptimization + 'var12',p);
                        taicpu(p).opcode := A_MOV;
                        taicpu(p).changeopsize(S_L);
                        { do not use R_SUBWHOLE
                          as movl %rdx,%eax
                          is invalid in assembler PM }
                        setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Result := True;
                      end;
                  end;
              else
                InternalError(2017050705);
            end;
          end
        else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
          begin
            if GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              (taicpu(hp1).opcode = A_AND) and
              MatchOpType(taicpu(hp1),top_const,top_reg) and
              (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
              begin
                case taicpu(p).opsize Of
                  S_BL:
                    if (taicpu(hp1).opsize <> S_L) or
                      (taicpu(hp1).oper[0]^.val > $FF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var13',p);
                        taicpu(hp1).changeopsize(S_L);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  S_WL:
                    if (taicpu(hp1).opsize <> S_L) or
                      (taicpu(hp1).oper[0]^.val > $FFFF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var14',p);
                        taicpu(hp1).changeopsize(S_L);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  S_BW:
                    if (taicpu(hp1).opsize <> S_W) or
                      (taicpu(hp1).oper[0]^.val > $FF) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'var15',p);
                        taicpu(hp1).changeopsize(S_W);
                        taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
                        Include(OptsToCheck, aoc_ForceNewIteration);
                      end;
                  else
                    Internalerror(2017050704)
                end;
              end;
          end;
      end;
  end;


{$ifdef x86_64}
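{ Note: on x86_64, writing to a 32-bit register implicitly zeroes bits
  32..63 of the full 64-bit register; e.g. "movl %eax,%edx" also clears
  the upper half of %rdx.  The routine below relies on this to let
  DeepMOVOpt temporarily treat the MOV as a full 64-bit copy. }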
function TX86AsmOptimizer.DoZeroUpper32Opt(var mov_p: tai; var and_p: tai): Boolean;
  var
    hp1, old_hp1: tai;
    FullSourceReg, FullTargetReg: TRegister;
  begin
    if (mov_p.typ<>ait_instruction) or
      (taicpu(mov_p).opsize<>S_L) or
      not MatchOpType(taicpu(mov_p),top_reg,top_reg) then
      InternalError(2025062801);

    Result:=False;

    FullSourceReg:=taicpu(mov_p).oper[0]^.reg; setsubreg(FullSourceReg, R_SUBQ);
    FullTargetReg:=taicpu(mov_p).oper[1]^.reg; setsubreg(FullTargetReg, R_SUBQ);

    { Mark the registers in the MOV command as "used" }
    IncludeRegInUsedRegs(FullSourceReg,UsedRegs);
    IncludeRegInUsedRegs(FullTargetReg,UsedRegs);

    { This is a little hack to get DeepMOVOpt to replace the full 64-bit
      registers.  The MOV instruction will be put back as it was afterwards
      (unless it got removed). }
    taicpu(mov_p).oper[0]^.reg:=FullSourceReg;
    taicpu(mov_p).oper[1]^.reg:=FullTargetReg;

    { Start after the and_p otherwise that instruction will be considered
      to have modified the source register }
    old_hp1:=and_p;

    while GetNextInstructionUsingReg(old_hp1,hp1,FullTargetReg) and
      (hp1.typ=ait_instruction) do
      begin
        if RegReadByInstruction(FullTargetReg,hp1) and
          not RegModifiedBetween(FullSourceReg,old_hp1,hp1) and
          DeepMOVOpt(taicpu(mov_p),taicpu(hp1)) then
          begin
            { A change has occurred, just not in mov_p }
            Include(OptsToCheck, aoc_ForceNewIteration);

            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegsBetween(TmpUsedRegs,tai(mov_p.Next), hp1);
            if not RegUsedAfterInstruction(FullTargetReg,hp1,TmpUsedRegs) and
              { Just in case something didn't get modified (e.g. an
                implicit register) }
              not RegReadByInstruction(FullTargetReg,hp1) then
              begin
                { We can remove the original MOV }
                DebugMsg(SPeepholeOptimization + 'Mov2Nop 3d done',mov_p);
                RemoveCurrentP(mov_p);
                Result := True;
                Exit;
              end;
          end
        else
          Break;

        old_hp1:=hp1;
      end;

    { Put the MOV instruction back as it was }
    setsubreg(taicpu(mov_p).oper[0]^.reg,R_SUBD);
    setsubreg(taicpu(mov_p).oper[1]^.reg,R_SUBD);
  end;
{$endif x86_64}


function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
  var
    hp1, hp2 : tai;
    MaskLength : Cardinal;
    MaskedBits : TCgInt;
    ActiveReg : TRegister;
  begin
    Result:=false;

    { There are no optimisations for reference targets }
    if (taicpu(p).oper[1]^.typ <> top_reg) then
      Exit;

    { Saves on a bunch of dereferences }
    ActiveReg := taicpu(p).oper[1]^.reg;

    while GetNextInstruction(p, hp1) and
      (hp1.typ = ait_instruction) do
      begin
        if (taicpu(p).oper[0]^.typ = top_const) then
          begin
            case taicpu(hp1).opcode of
              A_AND:
                if MatchOpType(taicpu(hp1),top_const,top_reg) and
                  (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
                  { the second register must contain the first one, so compare their subreg types }
                  (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
                  (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
                  { change
                      and const1, reg
                      and const2, reg
                    to
                      and (const1 and const2), reg
                  }
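                  { e.g. "andl $0xff0,%eax; andl $0x0ff,%eax"
                    becomes "andl $0x0f0,%eax" (illustrative) }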
                  begin
                    taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
                    DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
                    RemoveCurrentP(p, hp1);
                    Result:=true;
                    exit;
                  end;
              A_CMP:
                if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
                  MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
                  { Just check that the condition on the next instruction is compatible }
                  GetNextInstruction(hp1, hp2) and
                  (hp2.typ = ait_instruction) and
                  (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
                  then
                  { change
                      and  2^n, reg
                      cmp  2^n, reg
                      j(c) / set(c) / cmov(c)  (c is equal or not equal)
                    to
                      and  2^n, reg
                      test reg, reg
                      j(~c) / set(~c) / cmov(~c)
                  }
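                  { e.g. (illustrative):

                        andl $4,%eax               andl  $4,%eax
                        cmpl $4,%eax       -->     testl %eax,%eax
                        je   @lbl                  jne   @lbl

                    After the AND, the register is either 4 or 0, so
                    "equal to 4" is the same as "non-zero". }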
                  begin
                    { Keep TEST instruction in, rather than remove it, because
                      it may trigger other optimisations such as MovAndTest2Test }
                    taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
                    taicpu(hp1).opcode := A_TEST;
                    DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
                    taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
                    Result := True;
                    Exit;
                  end
                else if ((taicpu(p).oper[0]^.val=$ff) or (taicpu(p).oper[0]^.val=$ffff) or (taicpu(p).oper[0]^.val=$ffffffff)) and
                  MatchOpType(taicpu(hp1),top_const,top_reg) and
                  (taicpu(p).oper[0]^.val>=taicpu(hp1).oper[0]^.val) and
                  SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) then
                  { change
                      and $ff/$ffff/$ffffffff, reg
                      cmp val<=$ff/val<=$ffff/val<=$ffffffff, reg
                      dealloc reg
                    to
                      cmp val<=$ff/val<=$ffff/val<=$ffffffff, resized reg
                  }
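                  { e.g. "andl $0xff,%eax; cmpl $3,%eax" with %eax
                    deallocated afterwards becomes "cmpb $3,%al"
                    (illustrative) }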
  14296. begin
  14297. TransferUsedRegs(TmpUsedRegs);
  14298. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  14299. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
  14300. begin
  14301. DebugMsg(SPeepholeOptimization + 'AND/CMP -> CMP', p);
  14302. case taicpu(p).oper[0]^.val of
  14303. $ff:
  14304. begin
  14305. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBL);
  14306. taicpu(hp1).opsize:=S_B;
  14307. end;
  14308. $ffff:
  14309. begin
  14310. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBW);
  14311. taicpu(hp1).opsize:=S_W;
  14312. end;
  14313. $ffffffff:
  14314. begin
  14315. setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
  14316. taicpu(hp1).opsize:=S_L;
  14317. end;
  14318. else
  14319. Internalerror(2023030401);
  14320. end;
  14321. RemoveCurrentP(p);
  14322. Result := True;
  14323. Exit;
  14324. end;
  14325. end;
                A_MOVZX:
                  if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                    SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
                    (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
                    (
                      (
                        (taicpu(p).opsize=S_W) and
                        (taicpu(hp1).opsize=S_BW)
                      ) or
                      (
                        (taicpu(p).opsize=S_L) and
                        (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
                      )
                      {$ifdef x86_64}
                      or
                      (
                        (taicpu(p).opsize=S_Q) and
                        (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
                      )
                      {$endif x86_64}
                    ) then
                    begin
                      if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                        ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
                        ) or
                        (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                        ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
                      then
                        begin
                          { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                            32-bit register to a 64-bit register, or even a version called MOVZXD, so
                            code that tests for the presence of AND 0xffffffff followed by MOVZX is
                            wasted, and is indicative of a compiler bug if it were triggered. [Kit]

                            NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                          }
                          DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                          RemoveInstruction(hp1);
                          { See if there are other optimisations possible }
                          Continue;
                        end;
                    end;
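                { Hypothetical example of AndMovzToAnd: in
                    andl   $0x7f,%eax
                    movzbl %al,%eax
                  the MOVZX is redundant because the AND already cleared
                  bits 8..31, so only the AND is kept. }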
                A_SHL:
                  if MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
                    begin
                      {$ifopt R+}
                      {$define RANGE_WAS_ON}
                      {$R-}
                      {$endif}
                      { get length of potential and mask }
                      MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
                      { really a mask? }
                      {$ifdef RANGE_WAS_ON}
                      {$R+}
                      {$endif}
                      if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
                        { unmasked part shifted out? }
                        ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
                          RemoveCurrentP(p, hp1);
                          Result:=true;
                          exit;
                        end;
                    end;
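                { The intent of the two guards above, illustrated with
                  hypothetical values: in
                    andl $0x00ffffff,%eax
                    shll $8,%eax
                  the AND clears only bits 24..31 and the SHL then shifts
                  those same bits out, so the AND has no effect on the final
                  result and can be removed. }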
                A_SHR:
                  if MatchOpType(taicpu(hp1),top_const,top_reg) and
                    (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
                    (taicpu(hp1).oper[0]^.val <= 63) then
                    begin
                      { Does SHR combined with the AND cover all the bits?

                        e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
                      MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
                      if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
                        ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
                        ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
                          RemoveCurrentP(p, hp1);
                          Result := True;
                          Exit;
                        end;
                    end;
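                { Worked instance of the MaskedBits test above, using the
                  example from the comment: for
                    andb $252,%al
                    shrb $2,%al
                  MaskedBits = $FC or ((1 shl 2) - 1) = $FC or $03 = $FF, so
                  every bit is either kept by the mask or shifted out, and the
                  AND is removed, leaving "shrb $2,%al". }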
                A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
                  if (taicpu(hp1).oper[0]^.typ = top_reg) and
                    SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
                    begin
                      if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
                        (
                          (
                            (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                            ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
                          ) or (
                            (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                            ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
                          {$ifdef x86_64}
                          ) or (
                            (taicpu(hp1).opsize = S_LQ) and
                            ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
                          {$endif x86_64}
                          )
                        ) then
                        begin
                          if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
                            begin
                              DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                              RemoveInstruction(hp1);
                              { See if there are other optimisations possible }
                              Continue;
                            end;

                          { The super-registers are the same though.

                            Note that this change by itself doesn't improve
                            code speed, but it opens up other optimisations. }
                          {$ifdef x86_64}
                          { Convert 64-bit register to 32-bit }
                          case taicpu(hp1).opsize of
                            S_BQ:
                              begin
                                taicpu(hp1).opsize := S_BL;
                                taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
                              end;
                            S_WQ:
                              begin
                                taicpu(hp1).opsize := S_WL;
                                taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
                              end
                            else
                              ;
                          end;
                          {$endif x86_64}

                          DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
                          taicpu(hp1).opcode := A_MOVZX;
                          { See if there are other optimisations possible }
                          Continue;
                        end;
                    end;
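                { Hypothetical example of AndMovsxToAnd: in
                    andl   $0x40,%eax
                    movsbl %al,%eax
                  the AND has already cleared bit 7, so the value in %al is
                  never negative and the sign extension is a no-op; the MOVSX
                  is removed.  When the operand sizes differ, e.g.
                    andw   $0x40,%ax
                    movsbl %al,%eax
                  the MOVSX is instead turned into the equivalent MOVZX
                  (AndMovsxToAndMovzx), which opens up further optimisations. }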
                else
                  ;
              end;
            end
          else if MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) and
            not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
            begin
              {$ifdef x86_64}
              if (taicpu(p).opsize = S_Q) then
                begin
                  { Never necessary }
                  DebugMsg(SPeepholeOptimization + 'Andq2Nop', p);
                  RemoveCurrentP(p, hp1);
                  Result := True;
                  Exit;
                end;
              {$endif x86_64}

              { Forward check to determine necessity of and %reg,%reg }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.Next));

              case taicpu(hp1).opcode of
                A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
                  if (
                      (taicpu(hp1).oper[0]^.typ <> top_ref) or
                      not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                    ) and
                    (
                      (taicpu(hp1).opcode <> A_MOV) or
                      (taicpu(hp1).oper[1]^.typ <> top_ref) or
                      not RegInRef(ActiveReg, taicpu(hp1).oper[1]^.ref^)
                    ) and
                    not (
                      { If mov %reg,%reg is present, remove that instruction instead in OptPass1MOV }
                      (taicpu(hp1).opcode = A_MOV) and
                      MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) and
                      MatchOperand(taicpu(hp1).oper[1]^, ActiveReg)
                    ) and
                    (
                      (
                        (taicpu(hp1).oper[0]^.typ = top_reg) and
                        (taicpu(hp1).oper[0]^.reg = ActiveReg) and
                        SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg)
                      ) or
                      (
                        {$ifdef x86_64}
                        (
                          { If we read from the register, make sure it's not dependent on the upper 32 bits }
                          (taicpu(hp1).oper[0]^.typ <> top_reg) or
                          not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ActiveReg) or
                          (GetSubReg(taicpu(hp1).oper[0]^.reg) <> R_SUBQ)
                        ) and
                        {$endif x86_64}
                        not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs)
                      )
                    ) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AndMovx2Movx', p);
                      RemoveCurrentP(p, hp1);
                      Result := True;
                      Exit;
                    end;
                A_ADD,
                A_AND,
                A_BSF,
                A_BSR,
                A_BTC,
                A_BTR,
                A_BTS,
                A_OR,
                A_SUB,
                A_XOR:
                  { Register is written to, so this will clear the upper 32 bits (2-operand instructions) }
                  if (
                      (taicpu(hp1).oper[0]^.typ <> top_ref) or
                      not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                    ) and
                    MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AndOp2Op 2', p);
                      RemoveCurrentP(p, hp1);
                      Result := True;
                      Exit;
                    end;
                A_CMP,
                A_TEST:
                  if (
                      (taicpu(hp1).oper[0]^.typ <> top_ref) or
                      not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
                    ) and
                    MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) and
                    not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AND; CMP/TEST -> CMP/TEST', p);
                      RemoveCurrentP(p, hp1);
                      Result := True;
                      Exit;
                    end;
                A_BSWAP,
                A_NEG,
                A_NOT:
                  { Register is written to, so this will clear the upper 32 bits (1-operand instructions) }
                  if MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'AndOp2Op 1', p);
                      RemoveCurrentP(p, hp1);
                      Result := True;
                      Exit;
                    end;
                else
                  ;
              end;
            end;
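          { Hypothetical example of the forward check: in
              andl %eax,%eax       (flags not in use)
              movl %eax,%edx       (%eax not used afterwards)
            the AND serves no purpose once the value has been copied, so it
            is removed (AndMovx2Movx).  Likewise, any instruction that fully
            rewrites the register (ADD, XOR, NEG, BSWAP, ...) makes the AND
            redundant (AndOp2Op). }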
          if (taicpu(hp1).is_jmp) and
            (taicpu(hp1).opcode<>A_JMP) and
            not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
            begin
              { change
                  and x, reg
                  jxx
                to
                  test x, reg
                  jxx
                if reg is deallocated before the
                jump, but only if it's a conditional jump (PFV)
              }
              DebugMsg(SPeepholeOptimization + 'AndJcc2TestJcc', p);
              taicpu(p).opcode := A_TEST;
              Exit;
            end;

          Break;
        end;

      { Lone AND tests }
      if (taicpu(p).oper[0]^.typ = top_const) then
        begin
          {
            - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
            - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
            - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
          }
          if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
            ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
            ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
            begin
              taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
              if taicpu(p).opsize = S_L then
                begin
                  Include(OptsToCheck,aoc_MovAnd2Mov_3);
                  Result := True;
                end;
            end;
        end;

      { Backward check to determine necessity of and %reg,%reg }
      if (taicpu(p).oper[0]^.typ = top_reg) and
        (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
        not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
        begin
          hp2:=p;
          while GetLastInstruction(hp2, hp2) and
            (cs_opt_level3 in current_settings.optimizerswitches) and
            (hp2.typ=ait_instruction) and
            not RegModifiedByInstruction(ActiveReg,hp2) do
            { loop };

          if Assigned(hp2) and
            RegModifiedByInstruction(ActiveReg,hp2) and { Also checks if hp2 is an instruction }
            { Check size of instruction to determine if the AND is effectively
              a null operation }
            (
              (taicpu(p).opsize = taicpu(hp2).opsize) or
              { Note: Don't include S_Q }
              ((taicpu(p).opsize = S_L) and (taicpu(hp2).opsize in [S_BL, S_WL])) or
              ((taicpu(p).opsize = S_W) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_L])) or
              ((taicpu(p).opsize = S_B) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_W, S_L]))
            ) then
            begin
              { AND %reg,%reg is unnecessary to zero the upper 32 bits. }
              DebugMsg(SPeepholeOptimization + 'AND %reg,%reg proven unnecessary after backward search (And2Nop)', p);
              RemoveCurrentP(p, hp1);
              Result:=True;
              Exit;
            end;
        end;
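      { Hypothetical example of the backward check: in
          movzbl (%rdx),%eax
          andl   %eax,%eax
        the MOVZBL already wrote a full 32-bit result to %eax, so the AND
        cannot change the register and is removed (And2Nop). }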
    end;


  function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
    var
      hp1, hp2: tai;
      NewRef: TReference;
      Distance: Cardinal;
      TempTracking: TAllUsedRegs;
      DoAddMov2Lea: Boolean;

    { This entire nested function is used in an if-statement below, but we
      want to avoid all the used reg transfers and GetNextInstruction calls
      until we really have to check }
    function MemRegisterNotUsedLater: Boolean; inline;
      var
        hp2: tai;
      begin
        TransferUsedRegs(TmpUsedRegs);
        if (cs_opt_level3 in current_settings.optimizerswitches) then
          UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
        else
          { p and hp1 will be adjacent }
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
      end;

    begin
      Result := False;
      DoAddMov2Lea:=false;
      if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
        (taicpu(p).oper[1]^.typ = top_reg) then
        begin
          Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
          if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
            (hp1.typ <> ait_instruction) or
            not
            (
              (cs_opt_level3 in current_settings.optimizerswitches) or
              { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
              RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
            ) then
            Exit;

          { Some of the MOV optimisations are much more in-depth.  For example, if we have:
              addq $x, %rax
              movq %rax, %rdx
              sarq $63, %rdx
              (%rax still in use)

            ...letting OptPass2ADD run its course (and without -Os) will produce:
              leaq $x(%rax),%rdx
              addq $x, %rax
              sarq $63, %rdx

            ...which is okay since it breaks the dependency chain between
            addq and movq, but if OptPass2MOV is called first:
              addq $x, %rax
              cqto

            ...which is better in all ways, taking only 2 cycles to execute
            and much smaller in code size.
          }
          { The extra register tracking is quite strenuous }
          if (cs_opt_level2 in current_settings.optimizerswitches) and
            MatchInstruction(hp1, A_MOV, []) then
            begin
              { Update the register tracking to the MOV instruction }
              CopyUsedRegs(TempTracking);
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                UpdateUsedRegsBetween(UsedRegs, p, hp1)
              else
                { p and hp1 will be adjacent }
                UpdateUsedRegs(UsedRegs, tai(p.Next));

              hp2 := hp1;
              Include(OptsToCheck, aoc_MovlMovq2MovlMovl);
              if OptPass2MOV(hp1) then
                Include(OptsToCheck, aoc_ForceNewIteration);
              Exclude(OptsToCheck, aoc_MovlMovq2MovlMovl);

              { Reset the tracking to the current instruction }
              RestoreUsedRegs(TempTracking);
              ReleaseUsedRegs(TempTracking);

              { if hp1 <> hp2 after the call, then hp1 got removed, so let
                OptPass2ADD get called again }
              if (hp1 <> hp2) then
                begin
                  Result := True;
                  Exit;
                end;
            end;

          { Change:
              add %reg2,%reg1
              (%reg2 not modified in between)
              mov/s/z #(%reg1),%reg1  (%reg1 superregisters must be the same)

            To:
              mov/s/z #(%reg1,%reg2),%reg1
          }
          if (taicpu(p).oper[0]^.typ = top_reg) and
            MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
            MatchOpType(taicpu(hp1), top_ref, top_reg) and
            (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
            (
              (
                (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.ref^.index = NR_NO) and
                { r/esp cannot be an index }
                (taicpu(p).oper[0]^.reg<>NR_STACK_POINTER_REG)
              ) or (
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
              )
            ) and (
              Reg1WriteOverwritesReg2Entirely(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) or
              (
                { If the super registers ARE equal, then this MOV/S/Z does a partial write }
                not SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
                MemRegisterNotUsedLater
              )
            ) then
            begin
              if (
                { Instructions are guaranteed to be adjacent on -O2 and under }
                (cs_opt_level3 in current_settings.optimizerswitches) and
                RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)
              ) then
                begin
                  { If the other register is used in between, move the MOV
                    instruction to right after the ADD instruction so a
                    saving can still be made }
                  Asml.Remove(hp1);
                  Asml.InsertAfter(hp1, p);
                  taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
                  taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
                  DebugMsg(SPeepholeOptimization + 'AddMov2Mov done (instruction moved)', p);
                  RemoveCurrentp(p, hp1);
                end
              else
                begin
                  AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
                  taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
                  taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
                  DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
                  if (cs_opt_level3 in current_settings.optimizerswitches) then
                    { hp1 may not be the immediate next instruction under -O3 }
                    RemoveCurrentp(p)
                  else
                    RemoveCurrentp(p, hp1);
                end;
              Result := True;
              Exit;
            end;

          { Change:
              addl/q $x,%reg1
              movl/q %reg1,%reg2

            To:
              leal/q $x(%reg1),%reg2
              addl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)

            Breaks the dependency chain.
          }
          if (taicpu(p).oper[0]^.typ = top_const) and
            MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
            (taicpu(hp1).oper[1]^.typ = top_reg) and
            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
            (
              { Instructions are guaranteed to be adjacent on -O2 and under }
              not (cs_opt_level3 in current_settings.optimizerswitches) or
              (
                { If the flags are used, don't make the optimisation,
                  otherwise they will be scrambled. Fixes #41148 }
                (
                  not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) or
                  not RegUsedBetween(NR_DEFAULTFLAGS, p, hp1)
                ) and
                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
              )
            ) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
              else
                { p and hp1 will be adjacent }
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));

              if (
                SetAndTest(
                  (
                    not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
                    not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
                  ),
                  DoAddMov2Lea
                ) or
                { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
                not (cs_opt_size in current_settings.optimizerswitches)
              ) then
                begin
                  { Change the MOV instruction to a LEA instruction, and update the
                    first operand }
                  reference_reset(NewRef, 1, []);
                  NewRef.base := taicpu(p).oper[1]^.reg;
                  NewRef.scalefactor := 1;
                  { if the destination reg is the same as the ADD register,
                    and we keep the ADD instruction, do not add the offset to
                    the LEA instruction, otherwise the reg gets increased by
                    2 times the offset value }
                  if DoAddMov2Lea or not MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^.reg) then
                    NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
                  taicpu(hp1).opcode := A_LEA;
                  taicpu(hp1).loadref(0, NewRef);

                  if DoAddMov2Lea then
                    begin
                      { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
                      DebugMsg(SPeepholeOptimization + 'AddMov2Lea', hp1);
                      if (cs_opt_level3 in current_settings.optimizerswitches) then
                        { hp1 may not be the immediate next instruction under -O3 }
                        RemoveCurrentp(p)
                      else
                        RemoveCurrentp(p, hp1);
                    end
                  else
                    begin
                      hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
                      { Move what is now the LEA instruction to before the ADD instruction }
                      Asml.Remove(hp1);
                      Asml.InsertBefore(hp1, p);
                      AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
                      DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
                      p := hp1;
                    end;

                  Result := True;
                end;
            end;
        end;
    end;
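  { Hypothetical example of AddMov2Lea/AddMov2LeaAdd from OptPass2ADD above:
      addq $8,%rax
      movq %rax,%rdx
    becomes, when %rax or the flags are still live afterwards,
      leaq 8(%rax),%rdx
      addq $8,%rax
    and collapses to just "leaq 8(%rax),%rdx" when they are not. }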
  function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
    var
      SubReg: TSubRegister;
      hp1, hp2: tai;
      CallJmp: Boolean;
    begin
      Result := False;
      CallJmp := False;
      SubReg := getsubreg(taicpu(p).oper[1]^.reg);

      if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
        with taicpu(p).oper[0]^.ref^ do
          if not Assigned(symbol) and not Assigned(relsymbol) and (index <> NR_NO) then
            if (offset = 0) then
              begin
                if (scalefactor <= 1) and SuperRegistersEqual(base, taicpu(p).oper[1]^.reg) then
                  begin
                    taicpu(p).loadreg(0, newreg(R_INTREGISTER, getsupreg(index), SubReg));
                    taicpu(p).opcode := A_ADD;
                    DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
                    Result := True;
                  end
                else if SuperRegistersEqual(index, taicpu(p).oper[1]^.reg) then
                  begin
                    if (base <> NR_NO) then
                      begin
                        if (scalefactor <= 1) then
                          begin
                            taicpu(p).loadreg(0, newreg(R_INTREGISTER, getsupreg(base), SubReg));
                            taicpu(p).opcode := A_ADD;
                            DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
                            Result := True;
                          end;
                      end
                    else
                      { Convert lea (%reg,2^x),%reg to shl x,%reg }
                      if (scalefactor in [2, 4, 8]) then
                        begin
                          { BsrByte is, in essence, the base-2 logarithm of the scale factor }
                          taicpu(p).loadconst(0, BsrByte(scalefactor));
                          taicpu(p).opcode := A_SHL;
                          DebugMsg(SPeepholeOptimization + 'Lea2Shl done',p);
                          Result := True;
                        end;
                  end;
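                { Hypothetical examples of the conversions above (flags free):
                    lea (%eax,%edx),%eax -> addl %edx,%eax   (Lea2AddBase)
                    lea (,%eax,4),%eax   -> shll $2,%eax     (Lea2Shl)
                  since BsrByte(4) = 2. }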
              end
            { lea x(%reg1,%reg2),%reg3 and lea x(symbol,%reg2),%reg3 have a
              lot of latency, so break off the offset if %reg3 is used soon
              afterwards }
            else if not (cs_opt_size in current_settings.optimizerswitches) and
              { If 3-component addresses don't have additional latency, don't
                perform this optimisation }
              not (CPUX86_HINT_FAST_3COMP_ADDR in cpu_optimization_hints[current_settings.optimizecputype]) and
              GetNextInstruction(p, hp1) and
              (hp1.typ = ait_instruction) and
              (
                (
                  { Permit jumps and calls since they have a larger degree of overhead }
                  (
                    not SetAndTest(is_calljmp(taicpu(hp1).opcode), CallJmp) or
                    (
                      { ... unless the register specifies the location }
                      (taicpu(hp1).ops > 0) and
                      RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
                    )
                  ) and
                  (
                    not CallJmp and { Use the Boolean result to avoid calling "is_calljmp" twice }
                    RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
                  )
                )
                or
                (
                  { Check up to two instructions ahead }
                  GetNextInstruction(hp1, hp2) and
                  (hp2.typ = ait_instruction) and
                  (
                    not SetAndTest(is_calljmp(taicpu(hp2).opcode), CallJmp) or
                    (
                      { Same as above }
                      (taicpu(hp2).ops > 0) and
                      RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp2).oper[0]^)
                    )
                  ) and
                  (
                    not CallJmp and { Use the Boolean result to avoid calling "is_calljmp" twice }
                    RegInInstruction(taicpu(p).oper[1]^.reg, hp2)
                  )
                )
              ) then
              begin
                { Offset will be a 32-bit signed integer, so it's safe to use in the 64-bit version of ADD }
                hp2 := taicpu.op_const_reg(A_ADD, taicpu(p).opsize, offset, taicpu(p).oper[1]^.reg);
                taicpu(hp2).fileinfo := taicpu(p).fileinfo;
                offset := 0;

                if Assigned(symbol) or Assigned(relsymbol) then
                  DebugMsg(SPeepholeOptimization + 'lea x(sym,%reg1),%reg2 -> lea(sym,%reg1),%reg2; add $x,%reg2 to minimise instruction latency (Lea2LeaAdd)', p)
                else
                  DebugMsg(SPeepholeOptimization + 'lea x(%reg1,%reg2),%reg3 -> lea(%reg1,%reg2),%reg3; add $x,%reg3 to minimise instruction latency (Lea2LeaAdd)', p);

                { Inserting before the next instruction rather than after the
                  current instruction gives more accurate register tracking }
                asml.InsertBefore(hp2, hp1);
                AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp2, UsedRegs);

                Result := True;
              end;
    end;


  function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
    var
      hp1, hp2: tai;
      NewRef: TReference;
      Distance: Cardinal;
      TempTracking: TAllUsedRegs;
      DoSubMov2Lea: Boolean;
    begin
      Result := False;
      DoSubMov2Lea:=false;
      if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
        MatchOpType(taicpu(p),top_const,top_reg) then
        begin
          Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
          if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
            (hp1.typ <> ait_instruction) or
            not
            (
              (cs_opt_level3 in current_settings.optimizerswitches) or
              { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
              RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
            ) then
            Exit;

          { Some of the MOV optimisations are much more in-depth.  For example, if we have:
              subq $x, %rax
              movq %rax, %rdx
              sarq $63, %rdx
              (%rax still in use)

            ...letting OptPass2SUB run its course (and without -Os) will produce:
              leaq $-x(%rax),%rdx
              subq $x, %rax
              sarq $63, %rdx

            ...which is okay since it breaks the dependency chain between
            subq and movq, but if OptPass2MOV is called first:
              subq $x, %rax
              cqto

            ...which is better in all ways, taking only 2 cycles to execute
            and much smaller in code size.
          }
          { The extra register tracking is quite strenuous }
          if (cs_opt_level2 in current_settings.optimizerswitches) and
            MatchInstruction(hp1, A_MOV, []) then
            begin
              { Update the register tracking to the MOV instruction }
              CopyUsedRegs(TempTracking);
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                UpdateUsedRegsBetween(UsedRegs, p, hp1)
              else
                { p and hp1 will be adjacent }
                UpdateUsedRegs(UsedRegs, tai(p.Next));

              hp2 := hp1;
              Include(OptsToCheck, aoc_MovlMovq2MovlMovl);
              if OptPass2MOV(hp1) then
                Include(OptsToCheck, aoc_ForceNewIteration);
              Exclude(OptsToCheck, aoc_MovlMovq2MovlMovl);

              { Reset the tracking to the current instruction }
              RestoreUsedRegs(TempTracking);
              ReleaseUsedRegs(TempTracking);

              { if hp1 <> hp2 after the call, then hp1 got removed, so let
                OptPass2SUB get called again }
              if (hp1 <> hp2) then
                begin
                  Result := True;
                  Exit;
                end;
            end;

          { Change:
              subl/q $x,%reg1
              movl/q %reg1,%reg2

            To:
              leal/q $-x(%reg1),%reg2
              subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)

            Breaks the dependency chain and potentially permits the removal of
            a CMP instruction if one follows.
          }
          if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
            (taicpu(hp1).oper[1]^.typ = top_reg) and
            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
            (
              { Instructions are guaranteed to be adjacent on -O2 and under }
              not (cs_opt_level3 in current_settings.optimizerswitches) or
              (
                { If the flags are used, don't make the optimisation,
                  otherwise they will be scrambled. Fixes #41148 }
                (
                  not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) or
                  not RegUsedBetween(NR_DEFAULTFLAGS, p, hp1)
                ) and
                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
              )
            ) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              if (cs_opt_level3 in current_settings.optimizerswitches) then
                UpdateUsedRegsBetween(TmpUsedRegs, p, hp1)
              else
                { p and hp1 will be adjacent }
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));

              if (
                SetAndTest(
                  (
                    not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
                    not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
                  ),
                  DoSubMov2Lea
                ) or
                { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
                not (cs_opt_size in current_settings.optimizerswitches)
              ) then
                begin
                  { Change the MOV instruction to a LEA instruction, and update the
                    first operand }
                  reference_reset(NewRef, 1, []);
                  NewRef.base := taicpu(p).oper[1]^.reg;
                  NewRef.scalefactor := 1;
                  { if the destination reg is the same as the SUB register,
                    and we keep the SUB instruction, do not subtract the offset
                    in the LEA instruction, otherwise the reg gets decreased by
                    2 times the offset value }
                  if DoSubMov2Lea or not MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^.reg) then
                    NewRef.offset := -taicpu(p).oper[0]^.val;
                  taicpu(hp1).opcode := A_LEA;
                  taicpu(hp1).loadref(0, NewRef);

                  if DoSubMov2Lea then
                    begin
                      { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
                      DebugMsg(SPeepholeOptimization + 'SubMov2Lea', hp1);
                      if (cs_opt_level3 in current_settings.optimizerswitches) then
                        { hp1 may not be the immediate next instruction under -O3 }
                        RemoveCurrentp(p)
                      else
                        RemoveCurrentp(p, hp1);
                    end
                  else
                    begin
                      hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
                      { Move what is now the LEA instruction to before the SUB instruction }
                      Asml.Remove(hp1);
                      Asml.InsertBefore(hp1, p);
                      AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
                      DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
                      p := hp1;
                    end;

                  Result := True;
                end;
            end;
        end;
    end;
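  { Hypothetical example of SubMov2Lea/SubMov2LeaSub from OptPass2SUB above:
      subq $8,%rax
      movq %rax,%rdx
    becomes, when %rax or the flags are still live afterwards,
      leaq -8(%rax),%rdx
      subq $8,%rax
    and collapses to just "leaq -8(%rax),%rdx" when they are not. }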
  function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
    begin
      { we can skip all instructions not messing with the stack pointer }
      while assigned(hp1) and {MatchInstruction(hp1,[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
        A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
        A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
        A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
        ({(taicpu(hp1).ops=0) or }
         ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
           (MatchOpType(taicpu(hp1),top_ref,top_reg))
          ) and }
          not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
         )
        ) do
        GetNextInstruction(hp1,hp1);
      Result:=assigned(hp1);
    end;


  function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
    var
      hp1, hp2, hp3, hp4, hp5, hp6, hp7, hp8: tai;
    begin
      Result:=false;

      {$ifdef x86_64}
      { Change:
          lea x(%reg1d,%reg2d),%reg3d

        To:
          lea x(%reg1q,%reg2q),%reg3d

        Reduces the number of bytes of machine code
      }
      if (getsubreg(taicpu(p).oper[1]^.reg)=R_SUBD) and
        (
          (getsubreg(taicpu(p).oper[0]^.ref^.base)=R_SUBD) or
          (getsubreg(taicpu(p).oper[0]^.ref^.index)=R_SUBD)
        ) then
        begin
          DebugMsg(SPeepholeOptimization + 'Changed 32-bit registers in reference to 64-bit (reduces instruction size)', p);
          if (getsubreg(taicpu(p).oper[0]^.ref^.base)=R_SUBD) then
            setsubreg(taicpu(p).oper[0]^.ref^.base,R_SUBQ);
          if (getsubreg(taicpu(p).oper[0]^.ref^.index)=R_SUBD) then
            setsubreg(taicpu(p).oper[0]^.ref^.index,R_SUBQ);
          { No reason to set Result to true }
        end;
      {$endif x86_64}

      hp5:=nil;
      hp6:=nil;
      hp7:=nil;
      hp8:=nil;
      { replace
          leal(q) x(<stackpointer>),<stackpointer>
          <optional .seh_stackalloc ...>
          <optional .seh_endprologue ...>
          call procname
          <optional NOP>
          leal(q) -x(<stackpointer>),<stackpointer>
          <optional VZEROUPPER>
          ret
        by
          jmp procname

        but do it only on level 4 because it destroys stack back traces
      }
      if (cs_opt_level4 in current_settings.optimizerswitches) and
        (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
        (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(p).oper[0]^.ref^.index=NR_NO) and
        { the -8, -24, -40 are not required, but bail out early if possible,
          higher values are unlikely }
        ((taicpu(p).oper[0]^.ref^.offset=-8) or
         (taicpu(p).oper[0]^.ref^.offset=-24) or
         (taicpu(p).oper[0]^.ref^.offset=-40)) and
        (taicpu(p).oper[0]^.ref^.symbol=nil) and
        (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
        GetNextInstruction(p, hp1) and
        { Take a copy of hp1 }
        SetAndTest(hp1, hp4) and
        { trick to skip label }
        ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp7) and GetNextInstruction(hp1, hp1))) and
        { skip directives, .seh_stackalloc and .seh_endprologue on windows
        ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp7) and GetNextInstruction(hp1, hp1))) and
        ((hp1.typ=ait_instruction) or (SetAndTest(hp1, hp8) and GetNextInstruction(hp1, hp1))) and }
        SkipSimpleInstructions(hp1) and
        MatchInstruction(hp1,A_CALL,[S_NO]) and
        GetNextInstruction(hp1, hp2) and
        (MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) or
         { skip nop instruction on win64 }
         (MatchInstruction(hp2,A_NOP,[S_NO]) and
          SetAndTest(hp2,hp6) and
          GetNextInstruction(hp2,hp2) and
          MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]))
        ) and
        (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
        (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
        (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
        (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
        (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
        { Segment register will be NR_NO }
        GetNextInstruction(hp2, hp3) and
        { trick to skip label }
        ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
        (MatchInstruction(hp3,A_RET,[S_NO]) or
         (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
          SetAndTest(hp3,hp5) and
          GetNextInstruction(hp3,hp3) and
          MatchInstruction(hp3,A_RET,[S_NO])
         )
        ) and
        (taicpu(hp3).ops=0) then
        begin
          taicpu(hp1).opcode := A_JMP;
          taicpu(hp1).is_jmp := true;
          DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);

          { search for the stackalloc directive and remove it }
          hp7:=tai(p.next);
          while assigned(hp7) and (tai(hp7).typ<>ait_instruction) do
            begin
              if (hp7.typ=ait_seh_directive) and (tai_seh_directive(hp7).kind=ash_stackalloc) then
                begin
                  { sanity check }
                  if taicpu(p).oper[0]^.ref^.offset<>-tai_seh_directive(hp7).data.offset then
                    Internalerror(2024012201);
                  hp8:=tai(hp7.next);
                  RemoveInstruction(tai(hp7));
                  hp7:=hp8;
                  break;
                end
              else
                hp7:=tai(hp7.next);
            end;

          RemoveCurrentP(p, hp4);
          RemoveInstruction(hp2);
          RemoveInstruction(hp3);
          { if there is a vzeroupper instruction then move it before the jmp }
          if Assigned(hp5) then
            begin
              AsmL.Remove(hp5);
              ASmL.InsertBefore(hp5,hp1)
            end;
          { remove nop on win64 }
          if Assigned(hp6) then
            RemoveInstruction(hp6);
          Result:=true;
        end;
    end;


  function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
    {$ifdef x86_64}
    var
      hp1, hp2, hp3, hp4, hp5: tai;
    {$endif x86_64}
    begin
      Result:=false;
      {$ifdef x86_64}
      hp5:=nil;
      { replace
          push %rax
          call procname
          pop %rcx
          ret
        by
          jmp procname

        but do it only on level 4 because it destroys stack back traces.
        It depends on the fact that the sequence push rax/pop rcx is used
        for stack alignment, as rcx is volatile in all supported calling
        conventions
      }
      if (cs_opt_level4 in current_settings.optimizerswitches) and
        MatchOpType(taicpu(p),top_reg) and
        (taicpu(p).oper[0]^.reg=NR_RAX) and
        GetNextInstruction(p, hp1) and
        { Take a copy of hp1 }
        SetAndTest(hp1, hp4) and
        { trick to skip label }
        ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
        SkipSimpleInstructions(hp1) and
        MatchInstruction(hp1,A_CALL,[S_NO]) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp2),top_reg) and
        (taicpu(hp2).oper[0]^.reg=NR_RCX) and
        GetNextInstruction(hp2, hp3) and
        { trick to skip label }
        ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
        (MatchInstruction(hp3,A_RET,[S_NO]) or
         (MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
          SetAndTest(hp3,hp5) and
          GetNextInstruction(hp3,hp3) and
          MatchInstruction(hp3,A_RET,[S_NO])
         )
        ) and
        (taicpu(hp3).ops=0) then
        begin
          taicpu(hp1).opcode := A_JMP;
          taicpu(hp1).is_jmp := true;
          DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
          RemoveCurrentP(p, hp4);
          RemoveInstruction(hp2);
          RemoveInstruction(hp3);
          if Assigned(hp5) then
            begin
              AsmL.Remove(hp5);
              ASmL.InsertBefore(hp5,hp1)
            end;
          Result:=true;
        end;
      {$endif x86_64}
    end;


  function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
    var
      Value, RegName: string;
      hp1: tai;
    begin
      Result:=false;

      if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
        begin
          case taicpu(p).oper[0]^.val of
            0:
              { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
              if not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs) or
                (
                  { See if we can still convert the instruction }
                  GetNextInstructionUsingReg(p, hp1, NR_DEFAULTFLAGS) and
                  RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1)
                ) then
                begin
                  { change "mov $0,%reg" into "xor %reg,%reg" }
                  taicpu(p).opcode := A_XOR;
                  taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                  Result := True;
            {$ifdef x86_64}
                end
              else if (taicpu(p).opsize = S_Q) then
                begin
                  RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }

                  { The actual optimization }
                  setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                  taicpu(p).changeopsize(S_L);

                  DebugMsg(SPeepholeOptimization + 'movq $0,' + RegName + ' -> movl $0,' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                  Result := True;
                end;
            $1..$FFFFFFFF:
              begin
                { Code size reduction by J. Gareth "Kit" Moreton }
                { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
                case taicpu(p).opsize of
                  S_Q:
                    begin
                      RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                      Value := debug_tostr(taicpu(p).oper[0]^.val);

                      { The actual optimization }
                      setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                      taicpu(p).changeopsize(S_L);

                      DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                      Result := True;
                    end;
                  else
                    { Do nothing };
                end;
            {$endif x86_64}
              end;
            -1:
              { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
              if (cs_opt_size in current_settings.optimizerswitches) and
                (taicpu(p).opsize <> S_B) and
                (
                  not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs) or
                  (
                    { See if we can still convert the instruction }
                    GetNextInstructionUsingReg(p, hp1, NR_DEFAULTFLAGS) and
                    RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1)
                  )
                ) then
                begin
                  { change "mov $-1,%reg" into "or $-1,%reg" }
                  { NOTES:
                    - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                    - This operation creates a false dependency on the register, so only do it when optimising for size
                    - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                  }
                  taicpu(p).opcode := A_OR;
                  DebugMsg(SPeepholeOptimization + 'Mov-12Or-1',p);
                  Result := True;
                end;
            else
              { Do nothing };
          end;
        end;
    end;
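  { Hypothetical examples of the constant-load conversions above:
      movl $0,%eax   -> xorl %eax,%eax   (2 bytes instead of 5; flags free)
      movq $250,%rax -> movl $250,%eax   (shorter encoding; writing a 32-bit
                                          register implicitly zeroes the
                                          upper 32 bits on x86_64)
      movl $-1,%eax  -> orl $-1,%eax     (-Os only; flags free) }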
  { Returns true if the given logic instruction can be converted into a BTx instruction (BT not included) }
  class function TX86AsmOptimizer.IsBTXAcceptable(p : tai) : boolean;
    begin
      Result := False;
      if not (CPUX86_HAS_BTX in cpu_capabilities[current_settings.optimizecputype]) then
        Exit;

      { For sizes less than S_L, the byte size is equal or larger with BTx,
        so don't bother optimising }
      if not MatchInstruction(p, A_AND, A_OR, A_XOR, [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
        Exit;

      if (taicpu(p).oper[0]^.typ <> top_const) or
        { If the value can fit into an 8-bit signed integer, a smaller
          instruction can be encoded with AND/OR/XOR, so don't optimise if it
          falls within this range }
        (
          (taicpu(p).oper[0]^.val > -128) and
          (taicpu(p).oper[0]^.val <= 127)
        ) then
        Exit;

      { If we're optimising for size, this is acceptable }
      if (cs_opt_size in current_settings.optimizerswitches) then
        Exit(True);

      if (taicpu(p).oper[1]^.typ = top_reg) and
        (CPUX86_HINT_FAST_BTX_REG_IMM in cpu_optimization_hints[current_settings.optimizecputype]) then
        Exit(True);

      if (taicpu(p).oper[1]^.typ <> top_reg) and
        (CPUX86_HINT_FAST_BTX_MEM_IMM in cpu_optimization_hints[current_settings.optimizecputype]) then
        Exit(True);
    end;


  function TX86AsmOptimizer.PostPeepholeOptAnd(var p : tai) : boolean;
    var
      hp1: tai;
      Value: TCGInt;
    begin
      Result := False;

      if MatchOpType(taicpu(p), top_const, top_reg) then
        begin
          { Detect:
              andw   x,  %ax (0 <= x < $8000)
              ...
              movzwl %ax,%eax

            Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
          }
          if (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
            ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val) and
            GetNextInstructionUsingReg(p, hp1, NR_EAX) and
            MatchInstruction(hp1, A_MOVZX, [S_WL]) and
            MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
            MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
            begin
              DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via AndMovz2AndCwtl)', hp1);
              taicpu(hp1).opcode := A_CWDE;
              taicpu(hp1).clearop(0);
              taicpu(hp1).clearop(1);
              taicpu(hp1).ops := 0;
              { A change was made, but not with p, so don't set Result, but
                notify the compiler that a change was made }
              Include(OptsToCheck, aoc_ForceNewIteration);
              Exit; { and -> btr won't happen because an opsize of S_W won't be optimised anyway }
            end;
        end;

      { If "not x" is a power of 2 (popcnt = 1), change:
          and $x, %reg/ref

        To:
          btr lb(x), %reg/ref
      }
      if IsBTXAcceptable(p) and
        (
          { Make sure a TEST doesn't follow that plays with the register }
          not GetNextInstruction(p, hp1) or
          not MatchInstruction(hp1, A_TEST, A_CMP, [taicpu(p).opsize]) or
          not MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg)
        ) then
        begin
          {$push}{$R-}{$Q-}
          { Value is a sign-extended 32-bit integer - just correct it
            if it's represented as an unsigned value.  Also, IsBTXAcceptable
            checks to see if this operand is an immediate. }
          Value := not taicpu(p).oper[0]^.val;
          {$pop}
          {$ifdef x86_64}
          if taicpu(p).opsize = S_L then
          {$endif x86_64}
            Value := Value and $FFFFFFFF;

          if (PopCnt(QWord(Value)) = 1) then
            begin
              DebugMsg(SPeepholeOptimization + 'Changed AND (not $' + debug_hexstr(taicpu(p).oper[0]^.val) + ') to BTR $' + debug_tostr(BsrQWord(Value)) + ' to shrink instruction size (And2Btr)', p);
              taicpu(p).opcode := A_BTR;
              taicpu(p).oper[0]^.val := BsrQWord(Value); { Essentially the base 2 logarithm }
              Result := True;
              Exit;
            end;
        end;
    end;
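  { Hypothetical example of And2Btr: when BTx is considered worthwhile for
    the target (or under -Os),
      andl $0xffefffff,%eax
    becomes
      btrl $20,%eax
    since not($ffefffff) = $00100000 = 1 shl 20, and the BTR form encodes in
    fewer bytes than AND with a full 32-bit immediate. }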
  function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
    begin
      Result := False;
      if not MatchOpType(taicpu(p), top_reg, top_reg) then
        Exit;

      { Convert:
          movswl %ax,%eax  -> cwtl
          movslq %eax,%rax -> cdqe

        NOTE: Don't convert movsbw %al,%ax to cbw, because cbw and cwde
          refer to the same opcode and depend only on the assembler's
          current operand-size attribute. [Kit]
      }
      with taicpu(p) do
        case opsize of
          S_WL:
            if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
              begin
                DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
                opcode := A_CWDE;
                clearop(0);
                clearop(1);
                ops := 0;
                Result := True;
              end;
          {$ifdef x86_64}
          S_LQ:
            if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
              begin
                DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
                opcode := A_CDQE;
                clearop(0);
                clearop(1);
                ops := 0;
                Result := True;
              end;
          {$endif x86_64}
          else
            ;
        end;
    end;


  function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
    var
      hp1: tai;
    begin
      Result := False;

      { All these optimisations work on "shr const,%reg" }
      if not MatchOpType(taicpu(p), top_const, top_reg) then
        Exit;

      if HandleSHRMerge(p, True) then
        begin
          Result := True;
          Exit;
        end;

      { Detect the following (looking backwards):
          shr %cl,%reg
          shr x,  %reg

        Swap the two SHR instructions to minimise a pipeline stall.
      }
      if GetLastInstruction(p, hp1) and
        MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
        MatchOpType(taicpu(hp1), top_reg, top_reg) and
        { First operand will be %cl }
        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
        { Just to be sure }
        (getsupreg(taicpu(hp1).oper[1]^.reg) <> RS_ECX) then
        begin
          DebugMsg(SPeepholeOptimization + 'Swapped variable and constant SHR instructions to minimise pipeline stall (ShrShr2ShrShr)', hp1);
          { Moving the entries this way ensures the register tracking remains correct }
          Asml.Remove(p);
          Asml.InsertBefore(p, hp1);
          p := hp1;
          { Don't set Result to True because the current instruction is now
            "shr %cl,%reg" and there's nothing more we can do with it }
        end;
    end;


  function TX86AsmOptimizer.PostPeepholeOptADDSUB(var p : tai) : boolean;
    var
      hp1, hp2: tai;
      Opposite, SecondOpposite: TAsmOp;
      NewCond: TAsmCond;
    begin
      Result := False;

      { Change:
          add/sub 128,(dest)

        To:
          sub/add -128,(dest)

        This generally takes fewer bytes to encode because -128 can be stored
        in a signed byte, whereas +128 cannot.
      }
      if (taicpu(p).opsize <> S_B) and MatchOperand(taicpu(p).oper[0]^, 128) then
        begin
          if taicpu(p).opcode = A_ADD then
            Opposite := A_SUB
          else
            Opposite := A_ADD;

          { Be careful if the flags are in use, because the CF flag inverts
            when changing from ADD to SUB and vice versa }
          if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
            GetNextInstruction(p, hp1) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              TmpUsedRegs[R_SPECIALREGISTER].Update(tai(p.Next), True);

              hp2 := hp1;

              { Scan ahead to check if everything's safe }
              while Assigned(hp1) and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) do
                begin
                  if (hp1.typ <> ait_instruction) then
                    { Probably unsafe since the flags are still in use }
                    Exit;

                  if MatchInstruction(hp1, A_CALL, A_JMP, A_RET, []) then
                    { Stop searching at an unconditional jump }
                    Break;

                  if not
                    (
                      MatchInstruction(hp1, A_ADC, A_SBB, []) and
                      (taicpu(hp1).oper[0]^.typ = top_const) { We need to be able to invert a constant }
                    ) and
                    (taicpu(hp1).condition = C_None) and RegInInstruction(NR_DEFAULTFLAGS, hp1) then
                    { Instruction depends on FLAGS (and is not ADC or SBB); break out }
                    Exit;

                  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  TmpUsedRegs[R_SPECIALREGISTER].Update(tai(hp1.Next), True);

                  { Move to the next instruction }
                  GetNextInstruction(hp1, hp1);
                end;

              while Assigned(hp2) and (hp2 <> hp1) do
                begin
                  NewCond := C_None;

                  case taicpu(hp2).condition of
                    C_A, C_NBE:
                      NewCond := C_BE;
                    C_B, C_C, C_NAE:
                      NewCond := C_AE;
                    C_AE, C_NB, C_NC:
                      NewCond := C_B;
                    C_BE, C_NA:
                      NewCond := C_A;
                    else
                      { No change needed };
                  end;

                  if NewCond <> C_None then
                    begin
                      DebugMsg(SPeepholeOptimization + 'Condition changed from ' + cond2str[taicpu(hp2).condition] + ' to ' + cond2str[NewCond] +
                        ' to accommodate ' + debug_op2str(taicpu(p).opcode) + ' -> ' + debug_op2str(opposite) + ' above', hp2);

                      taicpu(hp2).condition := NewCond;
                    end
                  else
                    if MatchInstruction(hp2, A_ADC, A_SBB, []) then
                      begin
                        { Because of the flipping of the carry bit, to ensure
                          the operation remains equivalent, ADC becomes SBB
                          and vice versa, and the constant is not-inverted.

                          If multiple ADCs or SBBs appear in a row, each one
                          changed causes the carry bit to invert, so they all
                          need to be flipped }
                        if taicpu(hp2).opcode = A_ADC then
                          SecondOpposite := A_SBB
                        else
                          SecondOpposite := A_ADC;

                        if taicpu(hp2).oper[0]^.typ <> top_const then
                          { Should have broken out of this optimisation already }
                          InternalError(2021112901);

                        DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(hp2).opcode) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' -> ' +
                          debug_op2str(SecondOpposite) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(not taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' to accommodate inverted carry bit', hp2);

                        { Bit-invert the constant (effectively equivalent to "-1 - val") }
                        taicpu(hp2).opcode := SecondOpposite;
                        taicpu(hp2).oper[0]^.val := not taicpu(hp2).oper[0]^.val;
                      end;

                  { Move to the next instruction }
                  GetNextInstruction(hp2, hp2);
                end;

              if (hp2 <> hp1) then
                InternalError(2021111501);
            end;

          DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(p).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $128,' + debug_operstr(taicpu(p).oper[1]^) + ' changed to ' +
            debug_op2str(opposite) + debug_opsize2str(taicpu(p).opsize) + ' $-128,' + debug_operstr(taicpu(p).oper[1]^) + ' to reduce instruction size', p);

          taicpu(p).opcode := Opposite;
          taicpu(p).oper[0]^.val := -128;

          { No further optimisations can be made on this instruction, so move
            onto the next one to save time }
          p := tai(p.Next);
          UpdateUsedRegs(p);
          Result := True;
          Exit;
        end;
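      { Hypothetical example of the size saving: "addl $128,%eax" needs a
        full 32-bit immediate (5 bytes), while "subl $-128,%eax" fits the
        sign-extended 8-bit immediate form (3 bytes); any comparison
        conditions and ADC/SBB constants that depend on the inverted carry
        flag are adjusted above to keep the program equivalent. }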
  15682. { Detect:
  15683. add/sub %reg2,(dest)
  15684. add/sub x, (dest)
  15685. (dest can be a register or a reference)
  15686. Swap the instructions to minimise a pipeline stall. This reverses the
  15687. "Add swap" and "Sub swap" optimisations done in pass 1 if no new
  15688. optimisations could be made.
  15689. }
        if (taicpu(p).oper[0]^.typ = top_reg) and
          not RegInOp(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^) and
          (
            (
              (taicpu(p).oper[1]^.typ = top_reg) and
              { We can try searching further ahead if we're writing to a register }
              GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg)
            ) or
            (
              (taicpu(p).oper[1]^.typ = top_ref) and
              GetNextInstruction(p, hp1)
            )
          ) and
          MatchInstruction(hp1, A_ADD, A_SUB, [taicpu(p).opsize]) and
          (taicpu(hp1).oper[0]^.typ = top_const) and
          MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) then
          begin
            { Make doubly sure the flags aren't in use, because the order of
              the additions may affect them }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            hp2 := p;

            while not (cs_opt_level3 in current_settings.optimizerswitches) and
              GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
              UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));

            if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
              begin
                asml.remove(hp1);
                asml.InsertBefore(hp1, p);
                DebugMsg(SPeepholeOptimization + 'Add/Sub swap 2 done', hp1);
                Result := True;
              end;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
      var
        hp1: tai;
      begin
        Result:=false;

        { Final check to see if CMP/MOV pairs can be changed to MOV/CMP }
        while GetNextInstruction(p, hp1) and
          TrySwapMovCmp(p, hp1) do
          begin
            if MatchInstruction(hp1, A_MOV, []) then
              begin
                if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
                  begin
                    { A little hacky, but since CMP doesn't read the flags,
                      only modifies them, it's safe if they get scrambled by
                      MOV -> XOR }
                    ExcludeRegFromUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
                    Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                    if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                      { Used to shrink instruction size }
                      PostPeepholeOptXor(hp1);
{$endif x86_64}
                    IncludeRegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
                  end
                else
                  begin
                    Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                    if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                      { Used to shrink instruction size }
                      PostPeepholeOptXor(hp1);
{$endif x86_64}
                  end;
              end;

            { Enabling this flag is actually a null operation, but it marks
              the code as 'modified' during this pass }
            Include(OptsToCheck, aoc_ForceNewIteration);
          end;

        { change "cmp $0, %reg" to "test %reg, %reg" }
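        { e.g. "cmpl $0,%ecx" (3 bytes) becomes "testl %ecx,%ecx" (2 bytes);
          both set ZF and SF the same way and leave CF and OF clear }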
        if MatchOpType(taicpu(p),top_const,top_reg) and
          (taicpu(p).oper[0]^.val = 0) then
          begin
            taicpu(p).opcode := A_TEST;
            taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
            DebugMsg(SPeepholeOptimization + 'Cmp2Test', p);
            Result:=true;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
      var
        IsTestConstX, IsValid : Boolean;
        hp1,hp2 : tai;
      begin
        Result:=false;

        { Final check to see if TEST/MOV pairs can be changed to MOV/TEST }
        if (taicpu(p).opcode = A_TEST) then
          while GetNextInstruction(p, hp1) and
            TrySwapMovCmp(p, hp1) do
            begin
              if MatchInstruction(hp1, A_MOV, []) then
                begin
                  if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
                    begin
                      { A little hacky, but since TEST doesn't read the flags,
                        only modifies them, it's safe if they get scrambled by
                        MOV -> XOR }
                      ExcludeRegFromUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
                      Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                      if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                        { Used to shrink instruction size }
                        PostPeepholeOptXor(hp1);
{$endif x86_64}
                      IncludeRegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
                    end
                  else
                    begin
                      Result := PostPeepholeOptMov(hp1);
{$ifdef x86_64}
                      if Result and MatchInstruction(hp1, A_XOR, [S_Q]) then
                        { Used to shrink instruction size }
                        PostPeepholeOptXor(hp1);
{$endif x86_64}
                    end;
                end;

              { Enabling this flag is actually a null operation, but it marks
                the code as 'modified' during this pass }
              Include(OptsToCheck, aoc_ForceNewIteration);
            end;

        { If x is a power of 2 (popcnt = 1), change:
            or $x, %reg/ref
          To:
            bts lb(x), %reg/ref
        }
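        { e.g. "orl $0x1000,%ecx" (6 bytes) becomes "btsl $12,%ecx" (4 bytes).
          This only pays off when the constant doesn't fit in a sign-extended
          8-bit immediate, a condition IsBTXAcceptable is expected to have
          established }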
        if (taicpu(p).opcode = A_OR) and
          IsBTXAcceptable(p) and
          { IsBTXAcceptable checks to see if oper[0] is an immediate }
          (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
          (
            { Don't optimise if a test instruction follows }
            not GetNextInstruction(p, hp1) or
            not MatchInstruction(hp1, A_TEST, [taicpu(p).opsize])
          ) then
          begin
            DebugMsg(SPeepholeOptimization + 'Changed OR $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BTS $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Or2Bts)', p);
            taicpu(p).opcode := A_BTS;
            taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
            Result := True;
            Exit;
          end;

        { If x is a power of 2 (popcnt = 1), change:
            test $x, %reg/ref
            je / sete / cmove    (or jne / setne)
          To:
            bt lb(x), %reg/ref
            jnc / setnc / cmovnc (or jc / setc / cmovc)
        }
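        { e.g. "testl $0x10000,%ecx; je .L1" becomes "btl $16,%ecx; jnc .L1".
          BT copies the selected bit into the carry flag, so a clear bit
          (which TEST reports as ZF=1) now appears as CF=0; hence E/Z becomes
          NC and NE/NZ becomes C }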
        if (taicpu(p).opcode = A_TEST) and
          (CPUX86_HAS_BTX in cpu_capabilities[current_settings.optimizecputype]) and
          (taicpu(p).oper[0]^.typ = top_const) and
          (
            (cs_opt_size in current_settings.optimizerswitches) or
            (
              (taicpu(p).oper[1]^.typ = top_reg) and
              (CPUX86_HINT_FAST_BT_REG_IMM in cpu_optimization_hints[current_settings.optimizecputype])
            ) or
            (
              (taicpu(p).oper[1]^.typ <> top_reg) and
              (CPUX86_HINT_FAST_BT_MEM_IMM in cpu_optimization_hints[current_settings.optimizecputype])
            )
          ) and
          (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
          { For sizes less than S_L, the byte size is equal to or larger
            with BT, so don't bother optimising }
          (taicpu(p).opsize >= S_L) then
          begin
            IsValid := True;

            { Check the next set of instructions, watching the FLAGS register
              and the conditions used }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
            hp1 := p;
            hp2 := nil;

            while GetNextInstruction(hp1, hp1) do
              begin
                if not Assigned(hp2) then
                  { The first instruction after TEST }
                  hp2 := hp1;

                if (hp1.typ <> ait_instruction) then
                  begin
                    { If the flags are no longer in use, everything is fine }
                    if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                      IsValid := False;
                    Break;
                  end;

                case taicpu(hp1).condition of
                  C_None:
                    begin
                      if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
                        not RegLoadedWithNewValue(NR_DEFAULTFLAGS, hp1) then
                        { Something is not quite normal, so play safe and
                          don't change anything }
                        IsValid := False;
                      Break;
                    end;
                  C_E, C_Z, C_NE, C_NZ:
                    { This is fine };
                  else
                    begin
                      { Unsupported condition }
                      IsValid := False;
                      Break;
                    end;
                end;

                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
              end;

            if IsValid then
              begin
                while hp2 <> hp1 do
                  begin
                    case taicpu(hp2).condition of
                      C_Z, C_E:
                        taicpu(hp2).condition := C_NC;
                      C_NZ, C_NE:
                        taicpu(hp2).condition := C_C;
                      else
                        { Unsupported conditions should have been caught
                          before this point }
                        InternalError(2022110701);
                    end;
                    GetNextInstruction(hp2, hp2);
                  end;

                DebugMsg(SPeepholeOptimization + 'Changed TEST $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BT $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Test2Bt)', p);
                taicpu(p).opcode := A_BT;
                taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
                Result := True;
                Exit;
              end;
          end;

        { Removes the line marked with (x) from the sequence:
            and/or/xor/add/sub/... $x, %y
            test/or %y, %y  |  test $-1, %y    (x)
            j(n)z _Label
          as the first instruction already adjusts the ZF.
          The %y operand may also be a reference }
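        { Illustrative instance:
            andl $3,%eax                 andl $3,%eax
            testl %eax,%eax      --->
            jz .L1                       jz .L1
          as AND has already set ZF according to the result }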
        IsTestConstX:=(taicpu(p).opcode=A_TEST) and
          MatchOperand(taicpu(p).oper[0]^,-1);

        if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
          GetLastInstruction(p, hp1) and
          (tai(hp1).typ = ait_instruction) and
          GetNextInstruction(p,hp2) and
          MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
          case taicpu(hp1).opcode of
            A_ADD, A_SUB, A_OR, A_XOR, A_AND,
            { These two instructions set the zero flag if the result is zero }
            A_POPCNT, A_LZCNT:
              begin
                if (
                    { With POPCNT, an input of zero will set the zero flag
                      because the population count of zero is zero }
                    (taicpu(hp1).opcode = A_POPCNT) and
                    (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) and
                    (
                      OpsEqual(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^) or
                      { Faster than going through the second half of the 'or'
                        condition below }
                      OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^)
                    )
                  ) or (
                    OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) and
                    { Does not work in case of overflow for G(E)/L(E)/C_O/C_NO,
                      nor in case of carry for A(E)/B(E)/C/NC }
                    (
                      (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
                      (
                        (taicpu(hp1).opcode <> A_ADD) and
                        (taicpu(hp1).opcode <> A_SUB) and
                        (taicpu(hp1).opcode <> A_LZCNT)
                      )
                    )
                  ) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (2-op) done', hp1);
                    RemoveCurrentP(p, hp2);
                    Result:=true;
                    Exit;
                  end;
              end;
            A_SHL, A_SAL, A_SHR, A_SAR:
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
                  { SHL/SAL/SHR/SAR with a shift count of 0 do not change the
                    flags, so it's only safe to do this optimisation for
                    shifts by a nonzero constant }
                  (taicpu(hp1).oper[0]^.typ = top_const) and
                  (taicpu(hp1).oper[0]^.val <> 0) and
                  { Does not work in case of overflow for G(E)/L(E)/C_O/C_NO,
                    nor in case of carry for A(E)/B(E)/C/NC }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (shift) done', hp1);
                    RemoveCurrentP(p, hp2);
                    Result:=true;
                    Exit;
                  end;
              end;
            A_DEC, A_INC, A_NEG:
              begin
                if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
                  { Does not work in case of overflow for G(E)/L(E)/C_O/C_NO,
                    nor in case of carry for A(E)/B(E)/C/NC }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (1-op) done', hp1);
                    RemoveCurrentP(p, hp2);
                    Result:=true;
                    Exit;
                  end;
              end;
            A_ANDN, A_BZHI:
              begin
                if OpsEqual(taicpu(hp1).oper[2]^,taicpu(p).oper[1]^) and
                  { Only the zero and sign flags are consistent with what the
                    result is }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE,C_S,C_NS]) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (ANDN/BZHI) done', hp1);
                    RemoveCurrentP(p, hp2);
                    Result:=true;
                    Exit;
                  end;
              end;
            A_BEXTR:
              begin
                if OpsEqual(taicpu(hp1).oper[2]^,taicpu(p).oper[1]^) and
                  { Only the zero flag is set }
                  (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'OpTest/Or2Op (BEXTR) done', hp1);
                    RemoveCurrentP(p, hp2);
                    Result:=true;
                    Exit;
                  end;
              end;
            else
              ;
          end; { case }

        { change "test $-1,%reg" into "test %reg,%reg" }
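        { ANDing a value with $-1 (all ones) yields the value itself, so the
          flag results are identical and the immediate operand is saved }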
        if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
          taicpu(p).loadoper(0,taicpu(p).oper[1]^);

        { Change "or %reg,%reg" to "test %reg,%reg" as OR generates a false dependency }
        if MatchInstruction(p, A_OR, []) and
          { Can only match if they're both registers }
          MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) then
          begin
            DebugMsg(SPeepholeOptimization + 'or %reg,%reg -> test %reg,%reg to remove false dependency (Or2Test)', p);
            taicpu(p).opcode := A_TEST;
            { No need to set Result to True, as we've done all the optimisations we can }
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
      var
        hp1,hp3 : tai;
{$ifndef x86_64}
        hp2 : taicpu;
{$endif x86_64}
      begin
        Result:=false;
        hp3:=nil;
{$ifndef x86_64}
        { Don't do this on modern CPUs: it really hurts them because it
          breaks call/ret pairing }
        if (current_settings.optimizecputype < cpu_Pentium2) and
          not(cs_create_pic in current_settings.moduleswitches) and
          GetNextInstruction(p, hp1) and
          MatchInstruction(hp1,A_JMP,[S_NO]) and
          MatchOpType(taicpu(hp1),top_ref) and
          (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
            taicpu(hp2).fileinfo := taicpu(p).fileinfo;
            InsertLLItem(p.previous, p, hp2);
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            RemoveInstruction(hp1);
            Result:=true;
          end
        else
{$endif x86_64}
        { Replace:
            call procname
            ret
          with:
            jmp procname
          but only do this on optimisation level 4, because it destroys stack
          back traces.  Alternatively, if the subroutine is marked as "no
          return", just remove the RET }
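        { The JMP form is a tail call: the callee reuses the return address
          that is already on the stack and returns directly to our caller,
          which is also why the intermediate frame disappears from back
          traces }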
        if ((cs_opt_level4 in current_settings.optimizerswitches) or
          (po_noreturn in current_procinfo.procdef.procoptions)) and
          GetNextInstruction(p, hp1) and
          (MatchInstruction(hp1,A_RET,[S_NO]) or
            (MatchInstruction(hp1,A_VZEROUPPER,[S_NO]) and
              SetAndTest(hp1,hp3) and
              GetNextInstruction(hp1,hp1) and
              MatchInstruction(hp1,A_RET,[S_NO])
            )
          ) and
          (taicpu(hp1).ops=0) then
          begin
            if (cs_opt_level4 in current_settings.optimizerswitches) and
              { we might destroy stack alignment here if we do not do a call }
              (target_info.stackalign<=sizeof(SizeUInt)) then
              begin
                taicpu(p).opcode := A_JMP;
                taicpu(p).is_jmp := true;
                DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
              end
            else
              DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
            RemoveInstruction(hp1);
            if Assigned(hp3) then
              begin
                AsmL.Remove(hp3);
                AsmL.InsertBefore(hp3,p)
              end;
            Result:=true;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;

      function ConstInRange(const Val: TCGInt; const OpSize: TOpSize): Boolean;
        begin
          case OpSize of
            S_B, S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
              Result := (Val <= $FF) and (Val >= -128);
            S_W, S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
              Result := (Val <= $FFFF) and (Val >= -32768);
            S_L{$ifdef x86_64}, S_LQ{$endif x86_64}:
              Result := (Val <= $FFFFFFFF) and (Val >= -2147483648);
            else
              Result := True;
          end;
        end;

      var
        hp1, hp2 : tai;
        SizeChange: Boolean;
        PreMessage: string;
      begin
        Result := False;

        if (taicpu(p).oper[0]^.typ = top_reg) and
          SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
          GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) then
          begin
            { Change (using movzbl %al,%eax as an example):

                movzbl %al, %eax        movzbl %al, %eax
                cmpl   x,   %eax        testl  %eax,%eax
              To:
                cmpb   x,   %al         testb  %al, %al
                movzbl %al, %eax        movzbl %al, %eax
              (Move one back to avoid a false dependency)

              Smaller instruction and minimises pipeline stall as the CPU
              doesn't have to wait for the register to get zero-extended. [Kit]

              Also allow this if the smaller of the two registers is the one
              being checked, as this still removes the false dependency. }
            if
              (
                (
                  (taicpu(hp1).opcode = A_CMP) and MatchOpType(taicpu(hp1), top_const, top_reg) and
                  ConstInRange(taicpu(hp1).oper[0]^.val, taicpu(p).opsize)
                ) or (
                  { If MatchOperand returns True, they must both be registers }
                  (taicpu(hp1).opcode = A_TEST) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)
                )
              ) and
              (reg2opsize(taicpu(hp1).oper[1]^.reg) <= reg2opsize(taicpu(p).oper[1]^.reg)) and
              SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) then
              begin
                PreMessage := debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' -> ' + debug_op2str(taicpu(hp1).opcode);
                asml.Remove(hp1);
                asml.InsertBefore(hp1, p);

                { Convert "cmp $0,%reg" to "test %reg,%reg", and reload the
                  first operand of an existing "test %reg,%reg" with the
                  smaller register }
                if (taicpu(hp1).opcode = A_TEST) or (taicpu(hp1).oper[0]^.val = 0) then
                  begin
                    taicpu(hp1).opcode := A_TEST;
                    taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
                  end;
                taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;

                case taicpu(p).opsize of
                  S_BW, S_BL:
                    begin
                      SizeChange := taicpu(hp1).opsize <> S_B;
                      taicpu(hp1).changeopsize(S_B);
                    end;
                  S_WL:
                    begin
                      SizeChange := taicpu(hp1).opsize <> S_W;
                      taicpu(hp1).changeopsize(S_W);
                    end
                  else
                    InternalError(2020112701);
                end;

                UpdateUsedRegs(tai(p.Next));

                { Check if the register is used afterwards - if not, we can
                  remove the movzx instruction completely }
                if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
                  begin
                    { hp1 is a better position than p for debugging purposes }
                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4a', hp1);
                    RemoveCurrentp(p, hp1);
                    Result := True;
                  end;

                if SizeChange then
                  DebugMsg(SPeepholeOptimization + PreMessage +
                    debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (smaller and minimises pipeline stall - MovzxCmp2CmpMovzx)', hp1)
                else
                  DebugMsg(SPeepholeOptimization + 'MovzxCmp2CmpMovzx', hp1);

                Exit;
              end;

            { Change (using movzwl %ax,%eax as an example):
                movzwl %ax, %eax
                movb   %al, (dest)    (Register is smaller than the one read
              To:                      by movz)
                movb   %al, (dest)    (Move one back to avoid a false
                movzwl %ax, %eax       dependency)
            }
            if (taicpu(hp1).opcode = A_MOV) and
              (taicpu(hp1).oper[0]^.typ = top_reg) and
              not RegInOp(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^) and
              SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
              (reg2opsize(taicpu(hp1).oper[0]^.reg) <= reg2opsize(taicpu(p).oper[0]^.reg)) then
              begin
                DebugMsg(SPeepholeOptimization + 'MovzxMov2MovMovzx', hp1);
                hp2 := tai(hp1.Previous); { Effectively the old position of hp1 }
                asml.Remove(hp1);
                asml.InsertBefore(hp1, p);
                if taicpu(hp1).oper[1]^.typ = top_reg then
                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);

                { Check if the register is used afterwards - if not, we can
                  remove the movzx instruction completely }
                if not RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg, p, UsedRegs) then
                  begin
                    { hp1 is a better position than p for debugging purposes }
                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4b', hp1);
                    RemoveCurrentp(p, hp1);
                    Result := True;
                  end;
                Exit;
              end;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
      var
        hp1: tai;
{$ifdef x86_64}
        PreMessage, RegName: string;
{$endif x86_64}
      begin
        Result := False;

        { If x is a power of 2 (popcnt = 1), change:
            xor $x, %reg/ref
          To:
            btc lb(x), %reg/ref
        }
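        { e.g. "xorl $0x1000,%ecx" (6 bytes) becomes "btcl $12,%ecx" (4 bytes),
          as BTC complements exactly the selected bit }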
        if IsBTXAcceptable(p) and
          { IsBTXAcceptable checks to see if oper[0] is an immediate }
          (PopCnt(QWord(taicpu(p).oper[0]^.val)) = 1) and
          (
            { Don't optimise if a test instruction follows }
            not GetNextInstruction(p, hp1) or
            not MatchInstruction(hp1, A_TEST, [taicpu(p).opsize])
          ) then
          begin
            DebugMsg(SPeepholeOptimization + 'Changed XOR $' + debug_hexstr(taicpu(p).oper[0]^.val) + ' to BTC $' + debug_tostr(BsrQWord(taicpu(p).oper[0]^.val)) + ' to shrink instruction size (Xor2Btc)', p);
            taicpu(p).opcode := A_BTC;
            taicpu(p).oper[0]^.val := BsrQWord(taicpu(p).oper[0]^.val); { Essentially the base 2 logarithm }
            Result := True;
            Exit;
          end;

{$ifdef x86_64}
        { Code size reduction by J. Gareth "Kit" Moreton }
        { Change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx,
          %rbx, %rsi, %rdi, %rbp and %rsp, as this removes the REX prefix }
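        { e.g. "xorq %rax,%rax" (3 bytes) becomes "xorl %eax,%eax" (2 bytes);
          writing to a 32-bit register implicitly zeroes bits 32-63 of the
          64-bit register, so the result is unchanged }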
        if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
          Exit;

        if taicpu(p).oper[0]^.typ <> top_reg then
          { Should be impossible if both operands were equal, since one of
            XOR's operands must be a register }
          InternalError(2018011500);

        case taicpu(p).opsize of
          S_Q:
            begin
              RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
              PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';

              { The actual optimization }
              setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
              setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
              taicpu(p).changeopsize(S_L);

              RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
              DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (32-bit register recommended when zeroing 64-bit counterpart)', p);
            end;
          else
            ;
        end;
{$endif x86_64}
      end;

    function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean;
      var
        XReg: TRegister;
      begin
        Result := False;

        { Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" into
          "vpxor %xmmreg2,%xmmreg2,%xmmreg1": smaller encoding and slightly
          faster on some platforms (also works for ZMM-sized registers) }
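        { This is safe because VEX-encoded 128-bit instructions zero every
          bit of the destination register above bit 127, so an XMM-sized
          zeroing XOR clears the whole YMM/ZMM register }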
        if (taicpu(p).opsize in [S_YMM, S_ZMM]) and
          MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then
          begin
            XReg := taicpu(p).oper[0]^.reg;
            if (taicpu(p).oper[1]^.reg = XReg) then
              begin
                taicpu(p).changeopsize(S_XMM);
                setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX);
                if (cs_opt_size in current_settings.optimizerswitches) then
                  begin
                    { Change the input registers to %xmm0 to reduce size.
                      Note that there's a risk of a false dependency doing
                      this, so only optimise for size here }
                    XReg := NR_XMM0;
                    DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p);
                  end
                else
                  begin
                    setsubreg(XReg, R_SUBMMX);
                    DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p);
                  end;
                taicpu(p).oper[0]^.reg := XReg;
                taicpu(p).oper[1]^.reg := XReg;
                Result := True;
              end;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptRET(var p: tai): Boolean;
      var
        hp1, p_new: tai;
      begin
        Result := False;

        { Check for:
              ret
            .Lbl:
              ret
          and remove the first 'ret' }
        if GetNextInstruction(p, hp1) and
          { Remember where the label is }
          SetAndTest(hp1, p_new) and
          (hp1.typ in [ait_align, ait_label]) and
          SkipLabels(hp1, hp1) and
          MatchInstruction(hp1, A_RET, []) and
          { To be safe, make sure the RET instructions are identical }
          (taicpu(p).ops = taicpu(hp1).ops) and
          (
            (taicpu(p).ops = 0) or
            (
              (taicpu(p).ops = 1) and
              MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^)
            )
          ) then
          begin
            DebugMsg(SPeepholeOptimization + 'Removed superfluous RET', p);
            UpdateUsedRegs(tai(p.Next));
            RemoveCurrentP(p, p_new);
            Result := True;
            Exit;
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptRORX(var p: tai): Boolean;
      begin
        Result := False;

        { Change:                   To:
            rorx $x,%reg,%reg         ror $x,%reg
          (Smaller instruction size)
        }
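        { For the register forms, RORX is VEX-encoded (6 bytes including the
          immediate) while ROR takes 3 bytes.  The flags check is required
          because ROR writes CF/OF whereas RORX leaves the flags untouched }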
        if MatchOperand(taicpu(p).oper[1]^,taicpu(p).oper[2]^.reg) and
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
          begin
            taicpu(p).opcode:=A_ROR;
            taicpu(p).ops:=2;
            taicpu(p).clearop(2);
          end;
      end;

    function TX86AsmOptimizer.PostPeepholeOptSARXSHLXSHRX(var p: tai): Boolean;
      begin
        Result := False;

        { Change:                    To:
            shlx %ecx,%reg,%reg        shl %cl,%reg
          (Smaller instruction size)

          The same applies to SARX and SHRX (and with %rcx when 64-bit) }
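        { As with RORX above, this is only valid when the flags are dead:
          SHL/SHR/SAR update the flags for a nonzero count, while the BMI2
          forms never write them.  The count must already live in %ecx/%rcx
          so the two-operand form can use %cl }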
        if (getsupreg(taicpu(p).oper[0]^.reg)=RS_ECX) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(p).oper[2]^.reg) and
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
          begin
            case taicpu(p).opcode of
              A_SARX: taicpu(p).opcode:=A_SAR;
              A_SHLX: taicpu(p).opcode:=A_SHL;
              A_SHRX: taicpu(p).opcode:=A_SHR;
              else
                InternalError(2025090501);
            end;
            setsubreg(taicpu(p).oper[0]^.reg, R_SUBL);
            taicpu(p).ops:=2;
            taicpu(p).clearop(2);
          end;
      end;

    class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
      var
        OperIdx: Integer;
      begin
        for OperIdx := 0 to p.ops - 1 do
          if p.oper[OperIdx]^.typ = top_ref then
            optimize_ref(p.oper[OperIdx]^.ref^, False);
      end;

end.