aoptx86.pas 278 KB
  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe
  3. This unit contains the peephole optimizer.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit aoptx86;
  18. {$i fpcdefs.inc}
  19. {$define DEBUG_AOPTCPU}
  20. interface
  21. uses
  22. globtype,
  23. cpubase,
  24. aasmtai,aasmcpu,
  25. cgbase,cgutils,
  26. aopt,aoptobj;
  27. type
{ Flags that the cheap pre-optimisation pass can set so that the expensive
  peephole checks below are only attempted when a candidate pattern was seen. }
  28. TOptsToCheck = (
  29. aoc_MovAnd2Mov_3
  30. );
{ x86-specific peephole optimizer. Extends the generic TAsmOptimizer with
  pre-peephole, pass-1, pass-2 and post-peephole handlers, one per candidate
  opcode (MOV, LEA, SUB, IMUL, ...). Each handler presumably returns True when
  the instruction at p was rewritten/removed — TODO confirm against callers. }
  31. TX86AsmOptimizer = class(TAsmOptimizer)
  32. { some optimizations are very expensive to check, so the
  33. pre opt pass can be used to set some flags, depending on the found
  34. instructions if it is worth to check a certain optimization }
  35. OptsToCheck : set of TOptsToCheck;
{ Register-tracking predicates overriding/extending the base optimizer. }
  36. function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
  37. function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
  38. function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
  39. function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
  40. function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
  41. {
  42. In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
  43. the use of a register by allocs/dealloc, so it can ignore calls.
  44. In the following example, GetNextInstructionUsingReg will return the second movq,
  45. GetNextInstructionUsingRegTrackingUse won't.
  46. movq %rdi,%rax
  47. # Register rdi released
  48. # Register rdi allocated
  49. movq %rax,%rdi
  50. While in this example:
  51. movq %rdi,%rax
  52. call proc
  53. movq %rdi,%rax
  54. GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
  55. won't.
  56. }
  57. function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
  58. function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
  59. private
  60. function SkipSimpleInstructions(var hp1: tai): Boolean;
  61. protected
  62. class function IsMOVZXAcceptable: Boolean; static; inline;
  63. { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
  64. function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  65. { checks whether reading the value in reg1 depends on the value of reg2. This
  66. is very similar to SuperRegisterEquals, except it takes into account that
  67. R_SUBH and R_SUBL are independendent (e.g. reading from AL does not
  68. depend on the value in AH). }
  69. function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  70. { Replaces all references to AOldReg in a memory reference to ANewReg }
  71. class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;
  72. { Replaces all references to AOldReg in an operand to ANewReg }
  73. class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;
  74. { Replaces all references to AOldReg in an instruction to ANewReg,
  75. except where the register is being written }
  76. function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  77. { Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
  78. or writes to a global symbol }
  79. class function IsRefSafe(const ref: PReference): Boolean; static; inline;
  80. { Returns true if the given MOV instruction can be safely converted to CMOV }
  81. class function CanBeCMOV(p : tai) : boolean; static;
  82. { Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
  83. conversion was successful }
  84. function ConvertLEA(const p : taicpu): Boolean;
  85. function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
{ Emits a debug message for instruction p; only meaningful when
  DEBUG_AOPTCPU is defined (see SPeepholeOptimization below). }
  86. procedure DebugMsg(const s : string; p : tai);inline;
  87. class function IsExitCode(p : tai) : boolean; static;
  88. class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
  89. procedure RemoveLastDeallocForFuncRes(p : tai);
  90. function DoSubAddOpt(var p : tai) : Boolean;
{ Pre-peephole handlers, run before the main optimisation passes. }
  91. function PrePeepholeOptSxx(var p : tai) : boolean;
  92. function PrePeepholeOptIMUL(var p : tai) : boolean;
{ Pass-1 handlers, dispatched on the opcode of the instruction at p. }
  93. function OptPass1AND(var p : tai) : boolean;
  94. function OptPass1_V_MOVAP(var p : tai) : boolean;
  95. function OptPass1VOP(var p : tai) : boolean;
  96. function OptPass1MOV(var p : tai) : boolean;
  97. function OptPass1Movx(var p : tai) : boolean;
  98. function OptPass1MOVXX(var p : tai) : boolean;
  99. function OptPass1OP(var p : tai) : boolean;
  100. function OptPass1LEA(var p : tai) : boolean;
  101. function OptPass1Sub(var p : tai) : boolean;
  102. function OptPass1SHLSAL(var p : tai) : boolean;
  103. function OptPass1SETcc(var p : tai) : boolean;
  104. function OptPass1FSTP(var p : tai) : boolean;
  105. function OptPass1FLD(var p : tai) : boolean;
  106. function OptPass1Cmp(var p : tai) : boolean;
  107. function OptPass1PXor(var p : tai) : boolean;
  108. function OptPass1VPXor(var p: tai): boolean;
  109. function OptPass1Imul(var p : tai) : boolean;
{ Pass-2 handlers. }
  110. function OptPass2MOV(var p : tai) : boolean;
  111. function OptPass2Imul(var p : tai) : boolean;
  112. function OptPass2Jmp(var p : tai) : boolean;
  113. function OptPass2Jcc(var p : tai) : boolean;
  114. function OptPass2Lea(var p: tai): Boolean;
  115. function OptPass2SUB(var p: tai): Boolean;
{ Post-peephole handlers, run after the main passes. }
  116. function PostPeepholeOptMov(var p : tai) : Boolean;
  117. {$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
  118. function PostPeepholeOptMovzx(var p : tai) : Boolean;
  119. function PostPeepholeOptXor(var p : tai) : Boolean;
  120. {$endif}
  121. function PostPeepholeOptMOVSX(var p : tai) : boolean;
  122. function PostPeepholeOptCmp(var p : tai) : Boolean;
  123. function PostPeepholeOptTestOr(var p : tai) : Boolean;
  124. function PostPeepholeOptCall(var p : tai) : Boolean;
  125. function PostPeepholeOptLea(var p : tai) : Boolean;
  126. function PostPeepholeOptPush(var p: tai): Boolean;
  127. procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
  128. { Processor-dependent reference optimisation }
  129. class procedure OptimizeRefs(var p: taicpu); static;
  130. end;
  131. function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  132. function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  133. function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  134. function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;
  135. function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  136. function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  137. function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
  138. {$if max_operands>2}
  139. function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
  140. {$endif max_operands>2}
  141. function RefsEqual(const r1, r2: treference): boolean;
  142. function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  143. { returns true, if ref is a reference using only the registers passed as base and index
  144. and having an offset }
  145. function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  146. implementation
  147. uses
  148. cutils,verbose,
  149. systems,
  150. globals,
  151. cpuinfo,
  152. procinfo,
  153. paramgr,
  154. aasmbase,
  155. aoptbase,aoptutils,
  156. symconst,symsym,
  157. cgx86,
  158. itcpugas;
  159. {$ifdef DEBUG_AOPTCPU}
  160. const
  161. SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
  162. {$else DEBUG_AOPTCPU}
  163. { Empty strings help the optimizer to remove string concatenations that won't
  164. ever appear to the user on release builds. [Kit] }
  165. const
  166. SPeepholeOptimization = '';
  167. {$endif DEBUG_AOPTCPU}
  168. function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  169. begin
  170. result :=
  171. (instr.typ = ait_instruction) and
  172. (taicpu(instr).opcode = op) and
  173. ((opsize = []) or (taicpu(instr).opsize in opsize));
  174. end;
  175. function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  176. begin
  177. result :=
  178. (instr.typ = ait_instruction) and
  179. ((taicpu(instr).opcode = op1) or
  180. (taicpu(instr).opcode = op2)
  181. ) and
  182. ((opsize = []) or (taicpu(instr).opsize in opsize));
  183. end;
  184. function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  185. begin
  186. result :=
  187. (instr.typ = ait_instruction) and
  188. ((taicpu(instr).opcode = op1) or
  189. (taicpu(instr).opcode = op2) or
  190. (taicpu(instr).opcode = op3)
  191. ) and
  192. ((opsize = []) or (taicpu(instr).opsize in opsize));
  193. end;
  194. function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
  195. const opsize : topsizes) : boolean;
  196. var
  197. op : TAsmOp;
  198. begin
  199. result:=false;
  200. for op in ops do
  201. begin
  202. if (instr.typ = ait_instruction) and
  203. (taicpu(instr).opcode = op) and
  204. ((opsize = []) or (taicpu(instr).opsize in opsize)) then
  205. begin
  206. result:=true;
  207. exit;
  208. end;
  209. end;
  210. end;
  211. function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  212. begin
  213. result := (oper.typ = top_reg) and (oper.reg = reg);
  214. end;
  215. function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  216. begin
  217. result := (oper.typ = top_const) and (oper.val = a);
  218. end;
  219. function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
  220. begin
  221. result := oper1.typ = oper2.typ;
  222. if result then
  223. case oper1.typ of
  224. top_const:
  225. Result:=oper1.val = oper2.val;
  226. top_reg:
  227. Result:=oper1.reg = oper2.reg;
  228. top_ref:
  229. Result:=RefsEqual(oper1.ref^, oper2.ref^);
  230. else
  231. internalerror(2013102801);
  232. end
  233. end;
  234. function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
  235. begin
  236. result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);
  237. if result then
  238. case oper1.typ of
  239. top_const:
  240. Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
  241. top_reg:
  242. Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
  243. top_ref:
  244. Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
  245. else
  246. internalerror(2020052401);
  247. end
  248. end;
  249. function RefsEqual(const r1, r2: treference): boolean;
  250. begin
  251. RefsEqual :=
  252. (r1.offset = r2.offset) and
  253. (r1.segment = r2.segment) and (r1.base = r2.base) and
  254. (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
  255. (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
  256. (r1.relsymbol = r2.relsymbol) and
  257. (r1.volatility=[]) and
  258. (r2.volatility=[]);
  259. end;
  260. function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  261. begin
  262. Result:=(ref.offset=0) and
  263. (ref.scalefactor in [0,1]) and
  264. (ref.segment=NR_NO) and
  265. (ref.symbol=nil) and
  266. (ref.relsymbol=nil) and
  267. ((base=NR_INVALID) or
  268. (ref.base=base)) and
  269. ((index=NR_INVALID) or
  270. (ref.index=index)) and
  271. (ref.volatility=[]);
  272. end;
  273. function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  274. begin
  275. Result:=(ref.scalefactor in [0,1]) and
  276. (ref.segment=NR_NO) and
  277. (ref.symbol=nil) and
  278. (ref.relsymbol=nil) and
  279. ((base=NR_INVALID) or
  280. (ref.base=base)) and
  281. ((index=NR_INVALID) or
  282. (ref.index=index)) and
  283. (ref.volatility=[]);
  284. end;
  285. function InstrReadsFlags(p: tai): boolean;
  286. begin
  287. InstrReadsFlags := true;
  288. case p.typ of
  289. ait_instruction:
  290. if InsProp[taicpu(p).opcode].Ch*
  291. [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
  292. Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
  293. Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All]<>[] then
  294. exit;
  295. ait_label:
  296. exit;
  297. else
  298. ;
  299. end;
  300. InstrReadsFlags := false;
  301. end;
function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { Advances Next from Current until an instruction mentioning reg is
      found, or until the scan must stop: end of list, a non-instruction
      tai, a call/jump, or optimisation level below -O3 (which limits the
      search to the immediately following instruction). Result mirrors
      the last GetNextInstruction call; callers must still inspect Next.
      Note: the order of the until-conditions matters - the typ check
      must precede the taicpu(Next) dereference in is_calljmp. }
    Next:=Current;
    repeat
      Result:=GetNextInstruction(Next,Next);
    until not (Result) or
          not(cs_opt_level3 in current_settings.optimizerswitches) or
          (Next.typ<>ait_instruction) or
          RegInInstruction(reg,Next) or
          is_calljmp(taicpu(Next).opcode);
  end;
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { Scans forward for the next non-call instruction that uses reg,
      returning True only when one is found. The walk stops (Result=False)
      at non-call control transfers, at register allocation markers for
      reg's super-register, and at labels that cannot be skipped. }
    { Below -O3, only the directly following instruction is considered. }
    if not(cs_opt_level3 in current_settings.optimizerswitches) then
      begin
        Result:=GetNextInstruction(Current,Next);
        exit;
      end;
    Next:=tai(Current.Next);
    Result:=false;
    while assigned(Next) do
      begin
        { Barriers: jumps (but not calls), regalloc markers for the same
          super-register, and labels reachable from elsewhere. }
        if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
           ((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
           ((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
          exit
        { A non-call instruction that mentions reg is a successful match. }
        else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
          begin
            Result:=true;
            exit;
          end;
        Next:=tai(Next.Next);
      end;
  end;
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
  begin
    { Thin wrapper: an instruction "loads from" reg exactly when it reads
      reg according to RegReadByInstruction. }
    Result:=RegReadByInstruction(reg,hp);
  end;
function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
  var
    p: taicpu;
    opcount: longint;
  begin
    { Returns True when instruction hp reads (any part of) reg, taking
      implicit operands into account: the accumulator for one-operand
      IMUL/MUL, EAX/EDX for DIV/IDIV, individual condition flags for
      conditional instructions, and the per-opcode change information
      from insprop for everything else. }
    RegReadByInstruction := false;
    if hp.typ <> ait_instruction then
      exit;
    p := taicpu(hp);
    case p.opcode of
      A_CALL:
        { A called routine must be assumed to read any register. }
        regreadbyinstruction := true;
      A_IMUL:
        case p.ops of
          1:
            { One-operand IMUL implicitly reads the accumulator (but a
              byte-sized operation does not read AH). }
            regReadByInstruction := RegInOp(reg,p.oper[0]^) or
              (
               ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
               ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
              );
          2,3:
            regReadByInstruction :=
              reginop(reg,p.oper[0]^) or
              reginop(reg,p.oper[1]^);
          else
            InternalError(2019112801);
        end;
      A_MUL:
        begin
          { MUL implicitly reads the accumulator (not AH for byte size). }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
             ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
             ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
            );
        end;
      A_IDIV,A_DIV:
        begin
          { Division implicitly reads EAX, and EDX except for byte size. }
          regReadByInstruction := RegInOp(reg,p.oper[0]^) or
            (
              (getregtype(reg)=R_INTREGISTER) and
              (
                (getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
              )
            );
        end;
      else
        begin
          { LEA only computes an address; it never reads a segment register. }
          if (p.opcode=A_LEA) and is_segment_reg(reg) then
            begin
              RegReadByInstruction := false;
              exit;
            end;
          { Any register used inside a memory operand is read for the
            address calculation, regardless of operand direction. }
          for opcount := 0 to p.ops-1 do
            if (p.oper[opCount]^.typ = top_ref) and
               RegInRef(reg,p.oper[opcount]^.ref^) then
              begin
                RegReadByInstruction := true;
                exit
              end;
          { special handling for SSE MOVSD }
          if (p.opcode=A_MOVSD) and (p.ops>0) then
            begin
              if p.ops<>2 then
                internalerror(2017042702);
              { Register-to-register MOVSD only merges the low half, so
                the destination register is also read in that form. }
              regReadByInstruction := reginop(reg,p.oper[0]^) or
                (
                  (p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
                );
              exit;
            end;
          with insprop[p.opcode] do
            begin
              { Implicit integer-register reads recorded per opcode. }
              if getregtype(reg)=R_INTREGISTER then
                begin
                  case getsupreg(reg) of
                    RS_EAX:
                      if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ECX:
                      if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDX:
                      if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBX:
                      if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESP:
                      if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EBP:
                      if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_ESI:
                      if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                    RS_EDI:
                      if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                        begin
                          RegReadByInstruction := true;
                          exit
                        end;
                  end;
                end;
              { Flags register: resolve per-flag reads, including which
                flags a cc-dependent instruction actually tests. }
              if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
                begin
                  if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
                    begin
                      case p.condition of
                        C_A,C_NBE,       { CF=0 and ZF=0 }
                        C_BE,C_NA:       { CF=1 or ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
                        C_AE,C_NB,C_NC,  { CF=0 }
                        C_B,C_NAE,C_C:   { CF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
                        C_NE,C_NZ,       { ZF=0 }
                        C_E,C_Z:         { ZF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
                        C_G,C_NLE,       { ZF=0 and SF=OF }
                        C_LE,C_NG:       { ZF=1 or SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_GE,C_NL,       { SF=OF }
                        C_L,C_NGE:       { SF<>OF }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
                        C_NO,            { OF=0 }
                        C_O:             { OF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
                        C_NP,C_PO,       { PF=0 }
                        C_P,C_PE:        { PF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
                        C_NS,            { SF=0 }
                        C_S:             { SF=1 }
                          RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
                        else
                          internalerror(2017042701);
                      end;
                      if RegReadByInstruction then
                        exit;
                    end;
                  { Otherwise, match the requested flag sub-register
                    against the opcode's recorded flag reads. }
                  case getsubreg(reg) of
                    R_SUBW,R_SUBD,R_SUBQ:
                      RegReadByInstruction :=
                        [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
                         Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                         Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
                    R_SUBFLAGCARRY:
                      RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGPARITY:
                      RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGAUXILIARY:
                      RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGZERO:
                      RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGSIGN:
                      RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGOVERFLOW:
                      RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGINTERRUPT:
                      RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    R_SUBFLAGDIRECTION:
                      RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
                    else
                      internalerror(2017042601);
                  end;
                  exit;
                end;
              { Some opcodes do not really read their source when both
                operands are the same register (e.g. xor reg,reg). }
              if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
                 (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
                 (p.oper[0]^.reg=p.oper[1]^.reg) then
                exit;
              { Explicit operand reads as recorded per opcode. }
              if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
              if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
                begin
                  RegReadByInstruction := true;
                  exit
                end;
            end;
        end;
    end;
  end;
function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
  begin
    { Returns True when p1 uses reg in any way (read, write or modify),
      consulting the per-opcode change information for implicit integer
      registers, individual flags and FPU usage, before falling back to
      the generic operand scan in the inherited implementation. }
    result:=false;
    if p1.typ<>ait_instruction then
      exit;
    { Ch_All means "touches everything" - always a match. }
    if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    if (getregtype(reg)=R_INTREGISTER) and
       { change information for xmm movsd are not correct }
       ((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
      begin
        case getsupreg(reg) of
          { RS_EAX = RS_RAX on x86-64 }
          RS_EAX:
            result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ECX:
            result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDX:
            result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBX:
            result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESP:
            result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EBP:
            result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_ESI:
            { Ch_RMemEDI: string instructions implicitly use ESI as source. }
            result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          RS_EDI:
            { Ch_WMemEDI: string instructions implicitly use EDI as destination. }
            result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
      begin
        { Whole-flags access covers any individual flag. }
        if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
          exit(true);
        case getsubreg(reg) of
          R_SUBFLAGCARRY:
            Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGPARITY:
            Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGAUXILIARY:
            Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGZERO:
            Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGSIGN:
            Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGOVERFLOW:
            Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGINTERRUPT:
            Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          R_SUBFLAGDIRECTION:
            Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
          else
            ;
        end;
        if result then
          exit;
      end
    else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
      exit(true);
    { Fall back to the generic explicit-operand scan. }
    Result:=inherited RegInInstruction(Reg, p1);
  end;
function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
  begin
    { Returns True when p1 may change (any part of) reg: individual flag
      writes, implicit register writes recorded per opcode, and explicit
      write/modify operands. Calls are treated as clobbering everything. }
    Result := False;
    if p1.typ <> ait_instruction then
      exit;
    with insprop[taicpu(p1).opcode] do
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        begin
          { Resolve which flag (or the whole flags register) is asked
            about against the opcode's recorded flag writes. }
          case getsubreg(reg) of
            R_SUBW,R_SUBD,R_SUBQ:
              Result :=
                [Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
                 Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
                 Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGCARRY:
              Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGPARITY:
              Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGAUXILIARY:
              Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGZERO:
              Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGSIGN:
              Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGOVERFLOW:
              Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGINTERRUPT:
              Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            R_SUBFLAGDIRECTION:
              Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
            else
              internalerror(2017042602);
          end;
          exit;
        end;
    case taicpu(p1).opcode of
      A_CALL:
        { We could potentially set Result to False if the register in
          question is non-volatile for the subroutine's calling convention,
          but this would require detecting the calling convention in use and
          also assuming that the routine doesn't contain malformed assembly
          language, for example... so it could only be done under -O4 as it
          would be considered a side-effect. [Kit] }
        Result := True;
      A_MOVSD:
        { special handling for SSE MOVSD }
        if (taicpu(p1).ops>0) then
          begin
            if taicpu(p1).ops<>2 then
              internalerror(2017042703);
            Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
          end;
      { VMOVSS and VMOVSD has two and three operand flavours, this cannot modelled by x86ins.dat
        so fix it here (FK)
      }
      A_VMOVSS,
      A_VMOVSD:
        begin
          Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
          exit;
        end;
      A_IMUL:
        { The last operand is always the destination for every IMUL form. }
        Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
      else
        ;
    end;
    if Result then
      exit;
    with insprop[taicpu(p1).opcode] do
      begin
        { Implicit integer-register writes recorded per opcode. }
        if getregtype(reg)=R_INTREGISTER then
          begin
            case getsupreg(reg) of
              RS_EAX:
                if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ECX:
                if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDX:
                if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBX:
                if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESP:
                if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EBP:
                if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_ESI:
                if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
              RS_EDI:
                if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
                  begin
                    Result := True;
                    exit
                  end;
            end;
          end;
        { Explicit operand writes/modifies as recorded per opcode. }
        if ([CH_RWOP1,CH_WOP1,CH_MOP1]*Ch<>[]) and reginop(reg,taicpu(p1).oper[0]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP2,Ch_WOP2,Ch_MOP2]*Ch<>[]) and reginop(reg,taicpu(p1).oper[1]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP3,Ch_WOP3,Ch_MOP3]*Ch<>[]) and reginop(reg,taicpu(p1).oper[2]^) then
          begin
            Result := true;
            exit
          end;
        if ([Ch_RWOP4,Ch_WOP4,Ch_MOP4]*Ch<>[]) and reginop(reg,taicpu(p1).oper[3]^) then
          begin
            Result := true;
            exit
          end;
      end;
  end;
  766. {$ifdef DEBUG_AOPTCPU}
{ Debug build: records the peephole message s as an assembler comment
  inserted immediately before p, so it appears in the generated .s file. }
procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
  begin
    asml.insertbefore(tai_comment.Create(strpnew(s)), p);
  end;
{ Debug helper: integer to decimal string. }
function debug_tostr(i: tcgint): string; inline;
  begin
    Result := tostr(i);
  end;
{ Debug helper: register name in AT&T notation (leading '%'). }
function debug_regname(r: TRegister): string; inline;
  begin
    Result := '%' + std_regname(r);
  end;
  779. { Debug output function - creates a string representation of an operator }
  780. function debug_operstr(oper: TOper): string;
  781. begin
  782. case oper.typ of
  783. top_const:
  784. Result := '$' + debug_tostr(oper.val);
  785. top_reg:
  786. Result := debug_regname(oper.reg);
  787. top_ref:
  788. begin
  789. if oper.ref^.offset <> 0 then
  790. Result := debug_tostr(oper.ref^.offset) + '('
  791. else
  792. Result := '(';
  793. if (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO) then
  794. begin
  795. Result := Result + debug_regname(oper.ref^.base);
  796. if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
  797. Result := Result + ',' + debug_regname(oper.ref^.index);
  798. end
  799. else
  800. if (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO) then
  801. Result := Result + debug_regname(oper.ref^.index);
  802. if (oper.ref^.scalefactor > 1) then
  803. Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
  804. else
  805. Result := Result + ')';
  806. end;
  807. else
  808. Result := '[UNKNOWN]';
  809. end;
  810. end;
{ Debug helper: opcode mnemonic as a string. }
function debug_op2str(opcode: tasmop): string; inline;
  begin
    Result := std_op2str[opcode];
  end;
{ Debug helper: operand size as its GAS suffix string. }
function debug_opsize2str(opsize: topsize): string; inline;
  begin
    Result := gas_opsize2str[opsize];
  end;
  819. {$else DEBUG_AOPTCPU}
{ Release build: DebugMsg is a no-op and every debug_* helper returns an
  empty string, so the string concatenations at the call sites can be
  removed entirely by the optimizer (see SPeepholeOptimization above). }
procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
  begin
  end;

function debug_tostr(i: tcgint): string; inline;
  begin
    Result := '';
  end;

function debug_regname(r: TRegister): string; inline;
  begin
    Result := '';
  end;

function debug_operstr(oper: TOper): string; inline;
  begin
    Result := '';
  end;

function debug_op2str(opcode: tasmop): string; inline;
  begin
    Result := '';
  end;

function debug_opsize2str(opsize: topsize): string; inline;
  begin
    Result := '';
  end;
  843. {$endif DEBUG_AOPTCPU}
{ Decides whether emitting MOVZX is acceptable for the current target:
  always on x86-64; on 32/16-bit targets only when the CPU supports it
  (386+) and it is either a size optimisation or cheap on the selected
  optimisation CPU. }
class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
  begin
{$ifdef x86_64}
    { Always fine on x86-64 }
    Result := True;
{$else x86_64}
    Result :=
{$ifdef i8086}
      { MOVZX doesn't exist before the 386. }
      (current_settings.cputype >= cpu_386) and
{$endif i8086}
      (
        { Always accept if optimising for size }
        (cs_opt_size in current_settings.optimizerswitches) or
        { From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
        (current_settings.optimizecputype >= cpu_Pentium2)
      );
{$endif x86_64}
  end;
{ Returns True when a write to reg1 completely replaces the value of
  reg2, i.e. no bit of reg2's previous contents survives the write. }
function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  begin
    if not SuperRegistersEqual(reg1,reg2) then
      exit(false);
    if getregtype(reg1)<>R_INTREGISTER then
      exit(true); {because SuperRegisterEqual is true}
    case getsubreg(reg1) of
      { A write to R_SUBL doesn't change R_SUBH and if reg2 is R_SUBW or
        higher, it preserves the high bits, so the new value depends on
        reg2's previous value. In other words, it is equivalent to doing:
        reg2 := (reg2 and $ffffff00) or byte(reg1); }
      R_SUBL:
        exit(getsubreg(reg2)=R_SUBL);
      { A write to R_SUBH doesn't change R_SUBL and if reg2 is R_SUBW or
        higher, it actually does a:
        reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00); }
      R_SUBH:
        exit(getsubreg(reg2)=R_SUBH);
      { If reg2 is R_SUBD or larger, a write to R_SUBW preserves the high 16
        bits of reg2:
        reg2 := (reg2 and $ffff0000) or word(reg1); }
      R_SUBW:
        exit(getsubreg(reg2) in [R_SUBL,R_SUBH,R_SUBW]);
      { a write to R_SUBD always overwrites every other subregister,
        because it clears the high 32 bits of R_SUBQ on x86_64 }
      R_SUBD,
      R_SUBQ:
        exit(true);
      else
        internalerror(2017042801);
    end;
  end;
  894. function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  895. begin
  896. if not SuperRegistersEqual(reg1,reg2) then
  897. exit(false);
  898. if getregtype(reg1)<>R_INTREGISTER then
  899. exit(true); {because SuperRegisterEqual is true}
  900. case getsubreg(reg1) of
  901. R_SUBL:
  902. exit(getsubreg(reg2)<>R_SUBH);
  903. R_SUBH:
  904. exit(getsubreg(reg2)<>R_SUBL);
  905. R_SUBW,
  906. R_SUBD,
  907. R_SUBQ:
  908. exit(true);
  909. else
  910. internalerror(2017042802);
  911. end;
  912. end;
function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  var
    hp1 : tai;
    l : TCGInt;
  begin
    { Note: Result is initialised to false and never set to true below,
      even when instructions are rewritten or removed. }
    result:=false;
    { changes the code sequence
      shr/sar const1, x
      shl const2, x
      to
      either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
    if GetNextInstruction(p, hp1) and
       MatchInstruction(hp1,A_SHL,[]) and
       (taicpu(p).oper[0]^.typ = top_const) and
       (taicpu(hp1).oper[0]^.typ = top_const) and
       (taicpu(hp1).opsize = taicpu(p).opsize) and
       (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
       OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
      begin
        { Skipped when optimising for size: the rewrite trades one shift
          for an AND with a (potentially large) immediate. }
        if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
           not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 > const2 }
            { Keep the shift with the difference, then mask off the bits
              the shl/shr pair would have cleared. }
            taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
            taicpu(hp1).opcode := A_AND;
            l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
            case taicpu(p).opsize Of
              S_B: taicpu(hp1).loadConst(0,l Xor $ff);
              S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
              S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050703)
            end;
          end
        else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
                not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 < const2 }
            { Shift left by the difference, and mask first. }
            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050702)
            end;
          end
        else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
          begin
            { shr/sar const1, %reg
              shl const2, %reg
              with const1 = const2 }
            { The two shifts cancel except for the cleared bits: a single
              AND suffices, and the shl can be removed. }
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050701)
            end;
            RemoveInstruction(hp1);
          end;
      end;
  end;
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  var
    opsize : topsize;
    hp1 : tai;
    tmpref : treference;
    ShiftValue : Cardinal;
    BaseValue : TCGInt;
  begin
    result:=false;
    opsize:=taicpu(p).opsize;
    { changes certain "imul const, %reg"'s to lea sequences }
    if (MatchOpType(taicpu(p),top_const,top_reg) or
        MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
       (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
      if (taicpu(p).oper[0]^.val = 1) then
        if (taicpu(p).ops = 2) then
          { remove "imul $1, reg" }
          begin
            DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
            Result := RemoveCurrentP(p);
          end
        else
          { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
          begin
            hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
            p.free;
            p := hp1;
          end
      { The rewrite below clobbers the overflow behaviour of IMUL, so it
        is skipped when the next instruction branches on O/NO. }
      else if ((taicpu(p).ops <= 2) or
               (taicpu(p).oper[2]^.typ = Top_Reg)) and
              not(cs_opt_size in current_settings.optimizerswitches) and
              (not(GetNextInstruction(p, hp1)) or
               not((tai(hp1).typ = ait_instruction) and
                   ((taicpu(hp1).opcode=A_Jcc) and
                    (taicpu(hp1).condition in [C_O,C_NO])))) then
        begin
          {
            imul X, reg1, reg2 to
              lea (reg1,reg1,Y), reg2
              shl ZZ,reg2
            imul XX, reg1 to
              lea (reg1,reg1,YY), reg1
              shl ZZ,reg2
            This optimization makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
            it does not exist as a separate optimization target in FPC though.
            This optimization can be applied as long as only two bits are set in the constant and those two bits are separated by
            at most two zeros
          }
          reference_reset(tmpref,1,[]);
          if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
            begin
              { Split the constant into (2^ShiftValue)*BaseValue where
                BaseValue must be 3, 5 or 9 so LEA's scale can encode it. }
              ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
              BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
              TmpRef.base := taicpu(p).oper[1]^.reg;
              TmpRef.index := taicpu(p).oper[1]^.reg;
              if not(BaseValue in [3,5,9]) then
                Internalerror(2018110101);
              TmpRef.ScaleFactor := BaseValue-1;
              if (taicpu(p).ops = 2) then
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
              else
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
              AsmL.InsertAfter(hp1,p);
              DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
              taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
              RemoveCurrentP(p, hp1);
              { Apply the power-of-two part, if any, with a shift. }
              if ShiftValue>0 then
                AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
            end;
        end;
  end;
    { Returns True if instruction hp writes a completely new value into reg,
      i.e. the written value does not depend on reg's previous contents.
      For the flags register this is answered per-flag from the instruction
      property table; for ordinary registers a fixed list of value-loading
      instruction patterns is checked. }
    function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
    var
      p: taicpu;
    begin
      { only instructions can load a register }
      if not assigned(hp) or
        (hp.typ <> ait_instruction) then
       begin
         Result := false;
         exit;
       end;
      p := taicpu(hp);
      if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
        with insprop[p.opcode] do
          begin
            case getsubreg(reg) of
              { the whole flags register is freshly written only if every
                individual flag is freshly written }
              R_SUBW,R_SUBD,R_SUBQ:
                Result:=
                  RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
                  RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
                  RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
                  RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
                  RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
                  RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
              { per-flag queries: the flag is newly written if the opcode's
                change set contains any of the corresponding write flags }
              R_SUBFLAGCARRY:
                Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGPARITY:
                Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGAUXILIARY:
                Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGZERO:
                Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGSIGN:
                Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGOVERFLOW:
                Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGINTERRUPT:
                Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
              R_SUBFLAGDIRECTION:
                Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
              else
                begin
                  writeln(getsubreg(reg));
                  internalerror(2017050501);
                end;
            end;
            exit;
          end;
      { non-flag registers: enumerate instructions that fully overwrite their
        destination without reading the old value of reg }
      Result :=
        { plain loads: the destination register must be overwritten entirely
          and the source must not depend on reg }
        (((p.opcode = A_MOV) or
          (p.opcode = A_MOVZX) or
          (p.opcode = A_MOVSX) or
          (p.opcode = A_LEA) or
          (p.opcode = A_VMOVSS) or
          (p.opcode = A_VMOVSD) or
          (p.opcode = A_VMOVAPD) or
          (p.opcode = A_VMOVAPS) or
          (p.opcode = A_VMOVQ) or
          (p.opcode = A_MOVSS) or
          (p.opcode = A_MOVSD) or
          (p.opcode = A_MOVQ) or
          (p.opcode = A_MOVAPD) or
          (p.opcode = A_MOVAPS) or
{$ifndef x86_64}
          (p.opcode = A_LDS) or
          (p.opcode = A_LES) or
{$endif not x86_64}
          (p.opcode = A_LFS) or
          (p.opcode = A_LGS) or
          (p.opcode = A_LSS)) and
         (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
         (p.oper[1]^.typ = top_reg) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
         ((p.oper[0]^.typ = top_const) or
          ((p.oper[0]^.typ = top_reg) and
           not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
          ((p.oper[0]^.typ = top_ref) and
           not RegInRef(reg,p.oper[0]^.ref^)))) or
        { POP always loads a fresh value from the stack }
        ((p.opcode = A_POP) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg))) or
        { 3-operand IMUL writes its explicit destination }
        ((p.opcode = A_IMUL) and
         (p.ops=3) and
         (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
         (((p.oper[1]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg))) or
          ((p.oper[1]^.typ=top_ref) and not(RegInRef(reg,p.oper[1]^.ref^))))) or
        { 1-operand MUL/IMUL write the implicit (E/R)AX:(E/R)DX pair }
        ((((p.opcode = A_IMUL) or
           (p.opcode = A_MUL)) and
          (p.ops=1)) and
         (((p.oper[0]^.typ=top_reg) and not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))) or
          ((p.oper[0]^.typ=top_ref) and not(RegInRef(reg,p.oper[0]^.ref^)))) and
         (((p.opsize=S_B) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
          ((p.opsize=S_W) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
          ((p.opsize=S_L) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg))
{$ifdef x86_64}
          or ((p.opsize=S_Q) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg))
{$endif x86_64}
         )) or
        { sign-extension instructions overwrite their implicit destinations }
        ((p.opcode = A_CWD) and Reg1WriteOverwritesReg2Entirely(NR_DX,reg)) or
        ((p.opcode = A_CDQ) and Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)) or
{$ifdef x86_64}
        ((p.opcode = A_CQO) and Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)) or
{$endif x86_64}
        ((p.opcode = A_CBW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg))) or
        { far-pointer loads overwrite the respective segment register }
{$ifndef x86_64}
        ((p.opcode = A_LDS) and (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LES) and (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^))) or
{$endif not x86_64}
        ((p.opcode = A_LFS) and (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LGS) and (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        ((p.opcode = A_LSS) and (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^))) or
        { miscellaneous instructions with implicit register destinations }
{$ifndef x86_64}
        ((p.opcode = A_AAM) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
{$endif not x86_64}
        ((p.opcode = A_LAHF) and Reg1WriteOverwritesReg2Entirely(NR_AH,reg)) or
        ((p.opcode = A_LODSB) and Reg1WriteOverwritesReg2Entirely(NR_AL,reg)) or
        ((p.opcode = A_LODSW) and Reg1WriteOverwritesReg2Entirely(NR_AX,reg)) or
        ((p.opcode = A_LODSD) and Reg1WriteOverwritesReg2Entirely(NR_EAX,reg)) or
{$ifdef x86_64}
        ((p.opcode = A_LODSQ) and Reg1WriteOverwritesReg2Entirely(NR_RAX,reg)) or
{$endif x86_64}
        ((p.opcode = A_SETcc) and (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
        (((p.opcode = A_FSTSW) or
          (p.opcode = A_FNSTSW)) and
         (p.oper[0]^.typ=top_reg) and
         Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg)) or
        { "xor reg,reg" / "sub reg,reg" / "sbb reg,reg" produce a value
          independent of reg's previous contents }
        (((p.opcode = A_XOR) or (p.opcode = A_SUB) or (p.opcode = A_SBB)) and
         (p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
         (p.oper[0]^.reg=p.oper[1]^.reg) and
         Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg));
    end;
    { Returns True if p starts a recognized function-exit sequence:
        ret
        leave; ret
        lea x(%esp),%esp; ret
        mov framepointer,%esp | lea (framepointer),%esp; pop framepointer; ret
      A leading NOP (emitted by some x86-64 targets before the real exit
      code) is skipped first. }
    class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
      var
        hp2,hp3 : tai;
      begin
        { some x86-64 issue a NOP before the real exit code }
        if MatchInstruction(p,A_NOP,[]) then
          GetNextInstruction(p,p);
        result:=assigned(p) and (p.typ=ait_instruction) and
          ((taicpu(p).opcode = A_RET) or
           { "leave; ret" }
           ((taicpu(p).opcode=A_LEAVE) and
            GetNextInstruction(p,hp2) and
            MatchInstruction(hp2,A_RET,[S_NO])
           ) or
           { stack-pointer adjustment via lea, followed by ret }
           (((taicpu(p).opcode=A_LEA) and
             MatchOpType(taicpu(p),top_ref,top_reg) and
             (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
             (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
            ) and
            GetNextInstruction(p,hp2) and
            MatchInstruction(hp2,A_RET,[S_NO])
           ) or
           { restore %esp from the frame pointer (mov or lea form), pop the
             saved frame pointer, then ret }
           ((((taicpu(p).opcode=A_MOV) and
              MatchOpType(taicpu(p),top_reg,top_reg) and
              (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
              (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
             ((taicpu(p).opcode=A_LEA) and
              MatchOpType(taicpu(p),top_ref,top_reg) and
              (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
              (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
             )
            ) and
            GetNextInstruction(p,hp2) and
            MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
            MatchOpType(taicpu(hp2),top_reg) and
            (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
            GetNextInstruction(hp2,hp3) and
            MatchInstruction(hp3,A_RET,[S_NO])
           )
          );
      end;
  1229. class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  1230. begin
  1231. isFoldableArithOp := False;
  1232. case hp1.opcode of
  1233. A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
  1234. isFoldableArithOp :=
  1235. ((taicpu(hp1).oper[0]^.typ = top_const) or
  1236. ((taicpu(hp1).oper[0]^.typ = top_reg) and
  1237. (taicpu(hp1).oper[0]^.reg <> reg))) and
  1238. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1239. (taicpu(hp1).oper[1]^.reg = reg);
  1240. A_INC,A_DEC,A_NEG,A_NOT:
  1241. isFoldableArithOp :=
  1242. (taicpu(hp1).oper[0]^.typ = top_reg) and
  1243. (taicpu(hp1).oper[0]^.reg = reg);
  1244. else
  1245. ;
  1246. end;
  1247. end;
    { Removes the last "dealloc" marker for the function-result register(s)
      preceding p, so later passes do not treat the result register as dead.
      Which registers are affected depends on the function's return type. }
    procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);
      { Walks backwards from p and deletes the first ra_dealloc regalloc
        entry found for the given integer super-register. The scan stops as
        soon as an instruction actually using that register is reached. }
      procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
        var
          hp2: tai;
        begin
          hp2 := p;
          repeat
            hp2 := tai(hp2.previous);
            if assigned(hp2) and
              (hp2.typ = ait_regalloc) and
              (tai_regalloc(hp2).ratype=ra_dealloc) and
              (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
              (getsupreg(tai_regalloc(hp2).reg) = supreg) then
              begin
                RemoveInstruction(hp2);
                break;
              end;
          until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
        end;
      begin
        case current_procinfo.procdef.returndef.typ of
          { all of these return their result (or a pointer to it) in EAX }
          arraydef,recorddef,pointerdef,
          stringdef,enumdef,procdef,objectdef,errordef,
          filedef,setdef,procvardef,
          classrefdef,forwarddef:
            DoRemoveLastDeallocForFuncRes(RS_EAX);
          orddef:
            if current_procinfo.procdef.returndef.size <> 0 then
              begin
                DoRemoveLastDeallocForFuncRes(RS_EAX);
                { for int64/qword }
                if current_procinfo.procdef.returndef.size = 8 then
                  DoRemoveLastDeallocForFuncRes(RS_EDX);
              end;
          else
            ;
        end;
      end;
    { Pass-1 peephole optimizations for (v)movaps/(v)movapd:
      removes self-moves, merges chained aligned moves, folds an aligned move
      into a following scalar move or FMA when the intermediate register dies,
      and eliminates movapX/op/movapX round-trips around scalar arithmetic. }
    function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
      var
        hp1,hp2 : tai;
      begin
        result:=false;
        if MatchOpType(taicpu(p),top_reg,top_reg) then
          begin
            { vmova* reg1,reg1
              =>
              <nop> }
            if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
              begin
                RemoveCurrentP(p);
                result:=true;
                exit;
              end
            else if GetNextInstruction(p,hp1) then
              begin
                if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
                  MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                  begin
                    { vmova* reg1,reg2
                      vmova* reg2,reg3
                      dealloc reg2
                      =>
                      vmova* reg1,reg3 }
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                      not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                        taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                        RemoveInstruction(hp1);
                        result:=true;
                        exit;
                      end
                    { special case:
                      vmova* reg1,<op>
                      vmova* <op>,reg1
                      =>
                      vmova* reg1,<op> }
                    else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
                      { a volatile memory operand must not lose its read }
                      ((taicpu(p).oper[0]^.typ<>top_ref) or
                       (not(vol_read in taicpu(p).oper[0]^.ref^.volatility))
                      ) then
                      begin
                        DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                        RemoveInstruction(hp1);
                        result:=true;
                        exit;
                      end
                  end
                else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
                    MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
                   ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
                    MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
                  ) and
                  MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
                  begin
                    { vmova* reg1,reg2
                      vmovs* reg2,<op>
                      dealloc reg2
                      =>
                      vmovs* reg1,<op> }
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                        taicpu(p).opcode:=taicpu(hp1).opcode;
                        taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                        RemoveInstruction(hp1);
                        result:=true;
                        exit;
                      end
                  end;
              end;
            if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
              begin
                { vmova* reg1,reg2
                  vfma*  ...,reg2,reg2'
                  vmova* reg2',reg1
                  =>
                  vfma* ...,reg1,reg1 (when reg2 is not used afterwards) }
                if MatchInstruction(hp1,[A_VFMADDPD,
                                        A_VFMADD132PD,
                                        A_VFMADD132PS,
                                        A_VFMADD132SD,
                                        A_VFMADD132SS,
                                        A_VFMADD213PD,
                                        A_VFMADD213PS,
                                        A_VFMADD213SD,
                                        A_VFMADD213SS,
                                        A_VFMADD231PD,
                                        A_VFMADD231PS,
                                        A_VFMADD231SD,
                                        A_VFMADD231SS,
                                        A_VFMADDSUB132PD,
                                        A_VFMADDSUB132PS,
                                        A_VFMADDSUB213PD,
                                        A_VFMADDSUB213PS,
                                        A_VFMADDSUB231PD,
                                        A_VFMADDSUB231PS,
                                        A_VFMSUB132PD,
                                        A_VFMSUB132PS,
                                        A_VFMSUB132SD,
                                        A_VFMSUB132SS,
                                        A_VFMSUB213PD,
                                        A_VFMSUB213PS,
                                        A_VFMSUB213SD,
                                        A_VFMSUB213SS,
                                        A_VFMSUB231PD,
                                        A_VFMSUB231PS,
                                        A_VFMSUB231SD,
                                        A_VFMSUB231SS,
                                        A_VFMSUBADD132PD,
                                        A_VFMSUBADD132PS,
                                        A_VFMSUBADD213PD,
                                        A_VFMSUBADD213PS,
                                        A_VFMSUBADD231PD,
                                        A_VFMSUBADD231PS,
                                        A_VFNMADD132PD,
                                        A_VFNMADD132PS,
                                        A_VFNMADD132SD,
                                        A_VFNMADD132SS,
                                        A_VFNMADD213PD,
                                        A_VFNMADD213PS,
                                        A_VFNMADD213SD,
                                        A_VFNMADD213SS,
                                        A_VFNMADD231PD,
                                        A_VFNMADD231PS,
                                        A_VFNMADD231SD,
                                        A_VFNMADD231SS,
                                        A_VFNMSUB132PD,
                                        A_VFNMSUB132PS,
                                        A_VFNMSUB132SD,
                                        A_VFNMSUB132SS,
                                        A_VFNMSUB213PD,
                                        A_VFNMSUB213PS,
                                        A_VFNMSUB213SD,
                                        A_VFNMSUB213SS,
                                        A_VFNMSUB231PD,
                                        A_VFNMSUB231PS,
                                        A_VFNMSUB231SD,
                                        A_VFNMSUB231SS],[S_NO]) and
                  { we mix single and double opperations here because we assume that the compiler
                    generates vmovapd only after double operations and vmovaps only after single operations }
                  MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
                  GetNextInstruction(hp1,hp2) and
                  MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
                  MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                    if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                      begin
                        taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                        RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
                        RemoveInstruction(hp2);
                      end;
                  end
                else if (hp1.typ = ait_instruction) and
                  GetNextInstruction(hp1, hp2) and
                  MatchInstruction(hp2,taicpu(p).opcode,[]) and
                  OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
                  MatchOpType(taicpu(hp2),top_reg,top_reg) and
                  MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
                  (((taicpu(p).opcode=A_MOVAPS) and
                    ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                     (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
                   ((taicpu(p).opcode=A_MOVAPD) and
                    ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                     (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
                  ) then
                  { change
                    movapX reg,reg2
                    addsX/subsX/... reg3, reg2
                    movapX reg2,reg
                    to
                    addsX/subsX/... reg3,reg
                  }
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                    If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                          debug_op2str(taicpu(p).opcode)+' '+
                          debug_op2str(taicpu(hp1).opcode)+' '+
                          debug_op2str(taicpu(hp2).opcode)+') done',p);
                        { we cannot eliminate the first move if
                          the operations uses the same register for source and dest }
                        if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                          RemoveCurrentP(p, nil);
                        p:=hp1;
                        taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                        RemoveInstruction(hp2);
                        result:=true;
                      end;
                  end;
              end;
          end;
      end;
    { Folds the destination of a three-operand AVX operation into a following
      register-to-register vmova* when the intermediate register dies there. }
    function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        result:=false;
        { replace
            V<Op>X  %mreg1,%mreg2,%mreg3
            VMovX   %mreg3,%mreg4
            dealloc %mreg3
          by
            V<Op>X  %mreg1,%mreg2,%mreg4
          ?
        }
        if GetNextInstruction(p,hp1) and
          { we mix single and double operations here because we assume that the compiler
            generates vmovapd only after double operations and vmovaps only after single operations }
          MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
          MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
          (taicpu(hp1).oper[1]^.typ=top_reg) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            { only safe when %mreg3 is not read after the vmova* }
            if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
      end;
  1518. { Replaces all references to AOldReg in a memory reference to ANewReg }
  1519. class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  1520. var
  1521. OldSupReg: TSuperRegister;
  1522. OldSubReg, MemSubReg: TSubRegister;
  1523. begin
  1524. Result := False;
  1525. { For safety reasons, only check for exact register matches }
  1526. { Check base register }
  1527. if (ref.base = AOldReg) then
  1528. begin
  1529. ref.base := ANewReg;
  1530. Result := True;
  1531. end;
  1532. { Check index register }
  1533. if (ref.index = AOldReg) then
  1534. begin
  1535. ref.index := ANewReg;
  1536. Result := True;
  1537. end;
  1538. end;
  1539. { Replaces all references to AOldReg in an operand to ANewReg }
  1540. class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  1541. var
  1542. OldSupReg, NewSupReg: TSuperRegister;
  1543. OldSubReg, NewSubReg, MemSubReg: TSubRegister;
  1544. OldRegType: TRegisterType;
  1545. ThisOper: POper;
  1546. begin
  1547. ThisOper := p.oper[OperIdx]; { Faster to access overall }
  1548. Result := False;
  1549. if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
  1550. InternalError(2020011801);
  1551. OldSupReg := getsupreg(AOldReg);
  1552. OldSubReg := getsubreg(AOldReg);
  1553. OldRegType := getregtype(AOldReg);
  1554. NewSupReg := getsupreg(ANewReg);
  1555. NewSubReg := getsubreg(ANewReg);
  1556. if OldRegType <> getregtype(ANewReg) then
  1557. InternalError(2020011802);
  1558. if OldSubReg <> NewSubReg then
  1559. InternalError(2020011803);
  1560. case ThisOper^.typ of
  1561. top_reg:
  1562. if (
  1563. (ThisOper^.reg = AOldReg) or
  1564. (
  1565. (OldRegType = R_INTREGISTER) and
  1566. (getsupreg(ThisOper^.reg) = OldSupReg) and
  1567. (getregtype(ThisOper^.reg) = R_INTREGISTER) and
  1568. (
  1569. (getsubreg(ThisOper^.reg) <= OldSubReg)
  1570. {$ifndef x86_64}
  1571. and (
  1572. { Under i386 and i8086, ESI, EDI, EBP and ESP
  1573. don't have an 8-bit representation }
  1574. (getsubreg(ThisOper^.reg) >= R_SUBW) or
  1575. not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
  1576. )
  1577. {$endif x86_64}
  1578. )
  1579. )
  1580. ) then
  1581. begin
  1582. ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
  1583. Result := True;
  1584. end;
  1585. top_ref:
  1586. if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
  1587. Result := True;
  1588. else
  1589. ;
  1590. end;
  1591. end;
    { Replaces all references to AOldReg in an instruction to ANewReg }
    { Only operands the instruction actually reads are rewritten; the count
      operand of shift/rotate instructions is left alone because it is
      architecturally fixed to CL. Returns True if any operand changed. }
    function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
      const
        { maps operand index to the "operand n is read" flag of the opcode table }
        ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
      var
        OperIdx: Integer;
      begin
        Result := False;
        for OperIdx := 0 to p.ops - 1 do
          if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) and
            { The shift and rotate instructions can only use CL }
            not (
              (OperIdx = 0) and
              { This second condition just helps to avoid unnecessarily
                calling MatchInstruction for 10 different opcodes }
              { NOTE(review): oper[0]^.reg is read here without checking
                oper[0]^.typ = top_reg first — for a non-register operand this
                reads another field of the operand variant record; presumably
                harmless since MatchInstruction then filters by opcode, but
                confirm }
              (p.oper[0]^.reg = NR_CL) and
              MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
            ) then
            Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
      end;
  1612. class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean; inline;
  1613. begin
  1614. Result :=
  1615. (ref^.index = NR_NO) and
  1616. (
  1617. {$ifdef x86_64}
  1618. (
  1619. (ref^.base = NR_RIP) and
  1620. (ref^.refaddr in [addr_pic, addr_pic_no_got])
  1621. ) or
  1622. {$endif x86_64}
  1623. (ref^.base = NR_STACK_POINTER_REG) or
  1624. (ref^.base = current_procinfo.framepointer)
  1625. );
  1626. end;
    { Converts an LEA whose reference is just "offset(destreg)" into the
      equivalent INC/DEC (offset +/-1, when UseIncDec allows) or ADD/SUB.
      Returns True if the instruction was rewritten in place. }
    function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
      var
        l: asizeint; { the constant offset being folded into ADD/SUB/INC/DEC }
      begin
        Result := False;
        { Should have been checked previously }
        if p.opcode <> A_LEA then
          InternalError(2020072501);
        { do not mess with the stack point as adjusting it by lea is recommend, except if we optimize for size }
        if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
          not(cs_opt_size in current_settings.optimizerswitches) then
          exit;
        with p.oper[0]^.ref^ do
          begin
            { only "offset(destreg)" qualifies: base must equal the
              destination, and there must be no index or symbol }
            if (base <> p.oper[1]^.reg) or
              (index <> NR_NO) or
              assigned(symbol) then
              exit;
            l:=offset;
            if (l=1) and UseIncDec then
              begin
                p.opcode:=A_INC;
                p.loadreg(0,p.oper[1]^.reg);
                p.ops:=1;
                DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
              end
            else if (l=-1) and UseIncDec then
              begin
                p.opcode:=A_DEC;
                p.loadreg(0,p.oper[1]^.reg);
                p.ops:=1;
                DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
              end
            else
              begin
                { negative offsets become SUB of the positive value; the
                  guard excludes -2147483648, which cannot be negated within
                  a signed 32-bit range }
                if (l<0) and (l<>-2147483648) then
                  begin
                    p.opcode:=A_SUB;
                    p.loadConst(0,-l);
                    DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
                  end
                else
                  begin
                    p.opcode:=A_ADD;
                    p.loadConst(0,l);
                    DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
                  end;
              end;
          end;
        Result := True;
      end;
  1678. function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  1679. var
  1680. CurrentReg, ReplaceReg: TRegister;
  1681. SubReg: TSubRegister;
  1682. begin
  1683. Result := False;
  1684. ReplaceReg := taicpu(p_mov).oper[0]^.reg;
  1685. CurrentReg := taicpu(p_mov).oper[1]^.reg;
  1686. case hp.opcode of
  1687. A_FSTSW, A_FNSTSW,
  1688. A_IN, A_INS, A_OUT, A_OUTS,
  1689. A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
  1690. { These routines have explicit operands, but they are restricted in
  1691. what they can be (e.g. IN and OUT can only read from AL, AX or
  1692. EAX. }
  1693. Exit;
  1694. A_IMUL:
  1695. begin
  1696. { The 1-operand version writes to implicit registers
  1697. The 2-operand version reads from the first operator, and reads
  1698. from and writes to the second (equivalent to Ch_ROp1, ChRWOp2).
  1699. the 3-operand version reads from a register that it doesn't write to
  1700. }
  1701. case hp.ops of
  1702. 1:
  1703. if (
  1704. (
  1705. (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
  1706. ) or
  1707. not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
  1708. ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
  1709. begin
  1710. Result := True;
  1711. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
  1712. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1713. end;
  1714. 2:
  1715. { Only modify the first parameter }
  1716. if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
  1717. begin
  1718. Result := True;
  1719. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
  1720. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1721. end;
  1722. 3:
  1723. { Only modify the second parameter }
  1724. if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
  1725. begin
  1726. Result := True;
  1727. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
  1728. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1729. end;
  1730. else
  1731. InternalError(2020012901);
  1732. end;
  1733. end;
  1734. else
  1735. if (hp.ops > 0) and
  1736. ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
  1737. begin
  1738. Result := True;
  1739. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
  1740. AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
  1741. end;
  1742. end;
  1743. end;
  1744. function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
  1745. var
  1746. hp1, hp2, hp3: tai;
  1747. procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
  1748. begin
  1749. if taicpu(hp1).opcode = signed_movop then
  1750. begin
  1751. if taicpu(p).oper[0]^.val > max_value shr 1 then
  1752. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
  1753. end
  1754. else
  1755. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
  1756. end;
  1757. var
  1758. GetNextInstruction_p, TempRegUsed: Boolean;
  1759. PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
  1760. NewSize: topsize;
  1761. CurrentReg: TRegister;
  1762. begin
  1763. Result:=false;
  1764. GetNextInstruction_p:=GetNextInstruction(p, hp1);
  1765. { remove mov reg1,reg1? }
  1766. if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^)
  1767. then
  1768. begin
  1769. DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
  1770. { take care of the register (de)allocs following p }
  1771. RemoveCurrentP(p, hp1);
  1772. Result:=true;
  1773. exit;
  1774. end;
  1775. { All the next optimisations require a next instruction }
  1776. if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
  1777. Exit;
  1778. { Look for:
  1779. mov %reg1,%reg2
  1780. ??? %reg2,r/m
  1781. Change to:
  1782. mov %reg1,%reg2
  1783. ??? %reg1,r/m
  1784. }
  1785. if MatchOpType(taicpu(p), top_reg, top_reg) then
  1786. begin
  1787. CurrentReg := taicpu(p).oper[1]^.reg;
  1788. if RegReadByInstruction(CurrentReg, hp1) and
  1789. DeepMOVOpt(taicpu(p), taicpu(hp1)) then
  1790. begin
  1791. TransferUsedRegs(TmpUsedRegs);
  1792. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  1793. if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
  1794. { Just in case something didn't get modified (e.g. an
  1795. implicit register) }
  1796. not RegReadByInstruction(CurrentReg, hp1) then
  1797. begin
  1798. { We can remove the original MOV }
  1799. DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
  1800. RemoveCurrentp(p, hp1);
  1801. { TmpUsedRegs contains the results of "UpdateUsedRegs(tai(p.Next))" already,
  1802. so just restore it to UsedRegs instead of calculating it again }
  1803. RestoreUsedRegs(TmpUsedRegs);
  1804. Result := True;
  1805. Exit;
  1806. end;
  1807. { If we know a MOV instruction has become a null operation, we might as well
  1808. get rid of it now to save time. }
  1809. if (taicpu(hp1).opcode = A_MOV) and
  1810. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1811. SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
  1812. { Just being a register is enough to confirm it's a null operation }
  1813. (taicpu(hp1).oper[0]^.typ = top_reg) then
  1814. begin
  1815. Result := True;
  1816. { Speed-up to reduce a pipeline stall... if we had something like...
  1817. movl %eax,%edx
  1818. movw %dx,%ax
  1819. ... the second instruction would change to movw %ax,%ax, but
  1820. given that it is now %ax that's active rather than %eax,
  1821. penalties might occur due to a partial register write, so instead,
  1822. change it to a MOVZX instruction when optimising for speed.
  1823. }
  1824. if not (cs_opt_size in current_settings.optimizerswitches) and
  1825. IsMOVZXAcceptable and
  1826. (taicpu(hp1).opsize < taicpu(p).opsize)
  1827. {$ifdef x86_64}
  1828. { operations already implicitly set the upper 64 bits to zero }
  1829. and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
  1830. {$endif x86_64}
  1831. then
  1832. begin
  1833. CurrentReg := taicpu(hp1).oper[1]^.reg;
  1834. DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
  1835. case taicpu(p).opsize of
  1836. S_W:
  1837. if taicpu(hp1).opsize = S_B then
  1838. taicpu(hp1).opsize := S_BL
  1839. else
  1840. InternalError(2020012911);
  1841. S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
  1842. case taicpu(hp1).opsize of
  1843. S_B:
  1844. taicpu(hp1).opsize := S_BL;
  1845. S_W:
  1846. taicpu(hp1).opsize := S_WL;
  1847. else
  1848. InternalError(2020012912);
  1849. end;
  1850. else
  1851. InternalError(2020012910);
  1852. end;
  1853. taicpu(hp1).opcode := A_MOVZX;
  1854. taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
  1855. end
  1856. else
  1857. begin
  1858. GetNextInstruction_p := GetNextInstruction(hp1, hp2);
  1859. DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
  1860. RemoveInstruction(hp1);
  1861. { The instruction after what was hp1 is now the immediate next instruction,
  1862. so we can continue to make optimisations if it's present }
  1863. if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
  1864. Exit;
  1865. hp1 := hp2;
  1866. end;
  1867. end;
  1868. end;
  1869. end;
  1870. { Depending on the DeepMOVOpt above, it may turn out that hp1 completely
  1871. overwrites the original destination register. e.g.
  1872. movl ###,%reg2d
  1873. movslq ###,%reg2q (### doesn't have to be the same as the first one)
  1874. In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
  1875. }
  1876. if (taicpu(p).oper[1]^.typ = top_reg) and
  1877. MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
  1878. (taicpu(hp1).oper[1]^.typ = top_reg) and
  1879. Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
  1880. begin
  1881. if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
  1882. begin
  1883. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  1884. case taicpu(p).oper[0]^.typ of
  1885. top_const:
  1886. { We have something like:
  1887. movb $x, %regb
  1888. movzbl %regb,%regd
  1889. Change to:
  1890. movl $x, %regd
  1891. }
  1892. begin
  1893. case taicpu(hp1).opsize of
  1894. S_BW:
  1895. begin
  1896. convert_mov_value(A_MOVSX, $FF);
  1897. setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
  1898. taicpu(p).opsize := S_W;
  1899. end;
  1900. S_BL:
  1901. begin
  1902. convert_mov_value(A_MOVSX, $FF);
  1903. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1904. taicpu(p).opsize := S_L;
  1905. end;
  1906. S_WL:
  1907. begin
  1908. convert_mov_value(A_MOVSX, $FFFF);
  1909. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  1910. taicpu(p).opsize := S_L;
  1911. end;
  1912. {$ifdef x86_64}
  1913. S_BQ:
  1914. begin
  1915. convert_mov_value(A_MOVSX, $FF);
  1916. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1917. taicpu(p).opsize := S_Q;
  1918. end;
  1919. S_WQ:
  1920. begin
  1921. convert_mov_value(A_MOVSX, $FFFF);
  1922. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1923. taicpu(p).opsize := S_Q;
  1924. end;
  1925. S_LQ:
  1926. begin
  1927. convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
  1928. setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
  1929. taicpu(p).opsize := S_Q;
  1930. end;
  1931. {$endif x86_64}
  1932. else
  1933. { If hp1 was a MOV instruction, it should have been
  1934. optimised already }
  1935. InternalError(2020021001);
  1936. end;
  1937. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
  1938. RemoveInstruction(hp1);
  1939. Result := True;
  1940. Exit;
  1941. end;
  1942. top_ref:
  1943. { We have something like:
  1944. movb mem, %regb
  1945. movzbl %regb,%regd
  1946. Change to:
  1947. movzbl mem, %regd
  1948. }
  1949. if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
  1950. begin
  1951. DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
  1952. taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
  1953. RemoveCurrentP(p, hp1);
  1954. Result:=True;
  1955. Exit;
  1956. end;
  1957. else
  1958. if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
  1959. { Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
  1960. Exit;
  1961. end;
  1962. end
1963. { The RegInOp check makes sure that "movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
  1964. and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
  1965. optimised }
  1966. else
  1967. begin
  1968. DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
  1969. RemoveCurrentP(p, hp1);
  1970. Result := True;
  1971. Exit;
  1972. end;
  1973. end;
  1974. if (taicpu(hp1).opcode = A_AND) and
  1975. (taicpu(p).oper[1]^.typ = top_reg) and
  1976. MatchOpType(taicpu(hp1),top_const,top_reg) then
  1977. begin
  1978. if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
  1979. begin
  1980. case taicpu(p).opsize of
  1981. S_L:
  1982. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  1983. begin
  1984. { Optimize out:
  1985. mov x, %reg
  1986. and ffffffffh, %reg
  1987. }
  1988. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
  1989. RemoveInstruction(hp1);
  1990. Result:=true;
  1991. exit;
  1992. end;
  1993. S_Q: { TODO: Confirm if this is even possible }
  1994. if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
  1995. begin
  1996. { Optimize out:
  1997. mov x, %reg
  1998. and ffffffffffffffffh, %reg
  1999. }
  2000. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
  2001. RemoveInstruction(hp1);
  2002. Result:=true;
  2003. exit;
  2004. end;
  2005. else
  2006. ;
  2007. end;
  2008. if ((taicpu(p).oper[0]^.typ=top_reg) or
  2009. ((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
  2010. GetNextInstruction(hp1,hp2) and
  2011. MatchInstruction(hp2,A_TEST,[taicpu(p).opsize]) and
  2012. MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) and
  2013. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) and
  2014. GetNextInstruction(hp2,hp3) and
  2015. MatchInstruction(hp3,A_Jcc,A_Setcc,[S_NO]) and
  2016. (taicpu(hp3).condition in [C_E,C_NE]) then
  2017. begin
  2018. TransferUsedRegs(TmpUsedRegs);
  2019. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2020. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  2021. if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
  2022. begin
  2023. DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
  2024. taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
  2025. taicpu(hp1).opcode:=A_TEST;
  2026. RemoveInstruction(hp2);
  2027. RemoveCurrentP(p, hp1);
  2028. Result:=true;
  2029. exit;
  2030. end;
  2031. end;
  2032. end
  2033. else if IsMOVZXAcceptable and
  2034. (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
  2035. (taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
  2036. (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  2037. then
  2038. begin
  2039. InputVal := debug_operstr(taicpu(p).oper[0]^);
  2040. MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
  2041. case taicpu(p).opsize of
  2042. S_B:
  2043. if (taicpu(hp1).oper[0]^.val = $ff) then
  2044. begin
  2045. { Convert:
  2046. movb x, %regl movb x, %regl
  2047. andw ffh, %regw andl ffh, %regd
  2048. To:
  2049. movzbw x, %regd movzbl x, %regd
  2050. (Identical registers, just different sizes)
  2051. }
  2052. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
  2053. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
  2054. case taicpu(hp1).opsize of
  2055. S_W: NewSize := S_BW;
  2056. S_L: NewSize := S_BL;
  2057. {$ifdef x86_64}
  2058. S_Q: NewSize := S_BQ;
  2059. {$endif x86_64}
  2060. else
  2061. InternalError(2018011510);
  2062. end;
  2063. end
  2064. else
  2065. NewSize := S_NO;
  2066. S_W:
  2067. if (taicpu(hp1).oper[0]^.val = $ffff) then
  2068. begin
  2069. { Convert:
  2070. movw x, %regw
  2071. andl ffffh, %regd
  2072. To:
  2073. movzwl x, %regd
  2074. (Identical registers, just different sizes)
  2075. }
  2076. RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
  2077. RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
  2078. case taicpu(hp1).opsize of
  2079. S_L: NewSize := S_WL;
  2080. {$ifdef x86_64}
  2081. S_Q: NewSize := S_WQ;
  2082. {$endif x86_64}
  2083. else
  2084. InternalError(2018011511);
  2085. end;
  2086. end
  2087. else
  2088. NewSize := S_NO;
  2089. else
  2090. NewSize := S_NO;
  2091. end;
  2092. if NewSize <> S_NO then
  2093. begin
  2094. PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
  2095. { The actual optimization }
  2096. taicpu(p).opcode := A_MOVZX;
  2097. taicpu(p).changeopsize(NewSize);
  2098. taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
  2099. { Safeguard if "and" is followed by a conditional command }
  2100. TransferUsedRegs(TmpUsedRegs);
  2101. UpdateUsedRegs(TmpUsedRegs,tai(p.next));
  2102. if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  2103. begin
  2104. { At this point, the "and" command is effectively equivalent to
  2105. "test %reg,%reg". This will be handled separately by the
  2106. Peephole Optimizer. [Kit] }
  2107. DebugMsg(SPeepholeOptimization + PreMessage +
  2108. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2109. end
  2110. else
  2111. begin
  2112. DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
  2113. ' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
  2114. RemoveInstruction(hp1);
  2115. end;
  2116. Result := True;
  2117. Exit;
  2118. end;
  2119. end;
  2120. end;
  2121. { Next instruction is also a MOV ? }
  2122. if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
  2123. begin
  2124. if (taicpu(p).oper[1]^.typ = top_reg) and
  2125. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
  2126. begin
  2127. CurrentReg := taicpu(p).oper[1]^.reg;
  2128. TransferUsedRegs(TmpUsedRegs);
  2129. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  2130. { we have
  2131. mov x, %treg
  2132. mov %treg, y
  2133. }
  2134. if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
  2135. if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
  2136. { we've got
  2137. mov x, %treg
  2138. mov %treg, y
  2139. with %treg is not used after }
  2140. case taicpu(p).oper[0]^.typ Of
  2141. { top_reg is covered by DeepMOVOpt }
  2142. top_const:
  2143. begin
  2144. { change
  2145. mov const, %treg
  2146. mov %treg, y
  2147. to
  2148. mov const, y
  2149. }
  2150. if (taicpu(hp1).oper[1]^.typ=top_reg) or
  2151. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2152. begin
  2153. if taicpu(hp1).oper[1]^.typ=top_reg then
  2154. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2155. taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
  2156. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
  2157. RemoveInstruction(hp1);
  2158. Result:=true;
  2159. Exit;
  2160. end;
  2161. end;
  2162. top_ref:
  2163. if (taicpu(hp1).oper[1]^.typ = top_reg) then
  2164. begin
  2165. { change
  2166. mov mem, %treg
  2167. mov %treg, %reg
  2168. to
2169. mov mem, %reg
  2170. }
  2171. taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
  2172. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
  2173. RemoveInstruction(hp1);
  2174. Result:=true;
  2175. Exit;
  2176. end;
  2177. else
  2178. ;
  2179. end
  2180. else
  2181. { %treg is used afterwards, but all eventualities
  2182. other than the first MOV instruction being a constant
  2183. are covered by DeepMOVOpt, so only check for that }
  2184. if (taicpu(p).oper[0]^.typ = top_const) and
  2185. (
  2186. { For MOV operations, a size saving is only made if the register/const is byte-sized }
  2187. not (cs_opt_size in current_settings.optimizerswitches) or
  2188. (taicpu(hp1).opsize = S_B)
  2189. ) and
  2190. (
  2191. (taicpu(hp1).oper[1]^.typ = top_reg) or
  2192. ((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
  2193. ) then
  2194. begin
  2195. DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
  2196. taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
  2197. end;
  2198. end;
  2199. if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
  2200. (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
  2201. { mov reg1, mem1 or mov mem1, reg1
  2202. mov mem2, reg2 mov reg2, mem2}
  2203. begin
  2204. if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
  2205. { mov reg1, mem1 or mov mem1, reg1
  2206. mov mem2, reg1 mov reg2, mem1}
  2207. begin
  2208. if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2209. { Removes the second statement from
  2210. mov reg1, mem1/reg2
  2211. mov mem1/reg2, reg1 }
  2212. begin
  2213. if taicpu(p).oper[0]^.typ=top_reg then
  2214. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2215. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
  2216. RemoveInstruction(hp1);
  2217. Result:=true;
  2218. exit;
  2219. end
  2220. else
  2221. begin
  2222. TransferUsedRegs(TmpUsedRegs);
  2223. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2224. if (taicpu(p).oper[1]^.typ = top_ref) and
  2225. { mov reg1, mem1
  2226. mov mem2, reg1 }
  2227. (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
  2228. GetNextInstruction(hp1, hp2) and
  2229. MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
  2230. OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
  2231. OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
  2232. not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
  2233. { change to
  2234. mov reg1, mem1 mov reg1, mem1
  2235. mov mem2, reg1 cmp reg1, mem2
  2236. cmp mem1, reg1
  2237. }
  2238. begin
  2239. RemoveInstruction(hp2);
  2240. taicpu(hp1).opcode := A_CMP;
  2241. taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
  2242. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2243. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2244. DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
  2245. end;
  2246. end;
  2247. end
  2248. else if (taicpu(p).oper[1]^.typ=top_ref) and
  2249. OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
  2250. begin
  2251. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
  2252. taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
  2253. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
  2254. end
  2255. else
  2256. begin
  2257. TransferUsedRegs(TmpUsedRegs);
  2258. if GetNextInstruction(hp1, hp2) and
  2259. MatchOpType(taicpu(p),top_ref,top_reg) and
  2260. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2261. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2262. MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
  2263. MatchOpType(taicpu(hp2),top_ref,top_reg) and
  2264. RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2265. if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
  2266. not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
  2267. { mov mem1, %reg1
  2268. mov %reg1, mem2
  2269. mov mem2, reg2
  2270. to:
  2271. mov mem1, reg2
  2272. mov reg2, mem2}
  2273. begin
  2274. AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
  2275. DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
  2276. taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
  2277. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  2278. RemoveInstruction(hp2);
  2279. end
  2280. {$ifdef i386}
  2281. { this is enabled for i386 only, as the rules to create the reg sets below
  2282. are too complicated for x86-64, so this makes this code too error prone
  2283. on x86-64
  2284. }
  2285. else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
  2286. not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
  2287. not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
  2288. { mov mem1, reg1 mov mem1, reg1
  2289. mov reg1, mem2 mov reg1, mem2
  2290. mov mem2, reg2 mov mem2, reg1
  2291. to: to:
  2292. mov mem1, reg1 mov mem1, reg1
  2293. mov mem1, reg2 mov reg1, mem2
  2294. mov reg1, mem2
  2295. or (if mem1 depends on reg1
  2296. and/or if mem2 depends on reg2)
  2297. to:
  2298. mov mem1, reg1
  2299. mov reg1, mem2
  2300. mov reg1, reg2
  2301. }
  2302. begin
  2303. taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
  2304. taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
  2305. taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
  2306. taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
  2307. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2308. if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
  2309. (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2310. AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
  2311. if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
  2312. (getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
  2313. AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
  2314. end
  2315. else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
  2316. begin
  2317. taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
  2318. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
  2319. end
  2320. else
  2321. begin
  2322. RemoveInstruction(hp2);
  2323. end
  2324. {$endif i386}
  2325. ;
  2326. end;
  2327. end
  2328. { movl [mem1],reg1
  2329. movl [mem1],reg2
  2330. to
  2331. movl [mem1],reg1
  2332. movl reg1,reg2
  2333. }
  2334. else if MatchOpType(taicpu(p),top_ref,top_reg) and
  2335. MatchOpType(taicpu(hp1),top_ref,top_reg) and
  2336. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2337. RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
  2338. (taicpu(p).oper[0]^.ref^.volatility=[]) and
  2339. (taicpu(hp1).oper[0]^.ref^.volatility=[]) and
  2340. not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
  2341. not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
  2342. begin
  2343. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
  2344. taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
  2345. end;
  2346. { movl const1,[mem1]
  2347. movl [mem1],reg1
  2348. to
  2349. movl const1,reg1
  2350. movl reg1,[mem1]
  2351. }
  2352. if MatchOpType(Taicpu(p),top_const,top_ref) and
  2353. MatchOpType(Taicpu(hp1),top_ref,top_reg) and
  2354. (taicpu(p).opsize = taicpu(hp1).opsize) and
  2355. RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
  2356. not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
  2357. begin
  2358. AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
  2359. taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
  2360. taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
  2361. taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
  2362. taicpu(hp1).fileinfo := taicpu(p).fileinfo;
  2363. DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
  2364. Result:=true;
  2365. exit;
  2366. end;
  2367. { mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
  2368. end;
  2369. { search further than the next instruction for a mov }
  2370. if
  2371. { check as much as possible before the expensive GetNextInstructionUsingReg call }
  2372. (taicpu(p).oper[1]^.typ = top_reg) and
  2373. (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
  2374. not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) and
  2375. { we work with hp2 here, so hp1 can be still used later on when
  2376. checking for GetNextInstruction_p }
  2377. { GetNextInstructionUsingReg only searches one instruction ahead unless -O3 is specified }
  2378. GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[1]^.reg) and
  2379. (hp2.typ=ait_instruction) then
  2380. begin
  2381. case taicpu(hp2).opcode of
  2382. A_MOV:
  2383. if MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^.reg) and
  2384. ((taicpu(p).oper[0]^.typ=top_const) or
  2385. ((taicpu(p).oper[0]^.typ=top_reg) and
  2386. not(RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp2))
  2387. )
  2388. ) then
  2389. begin
  2390. { we have
  2391. mov x, %treg
  2392. mov %treg, y
  2393. }
  2394. TransferUsedRegs(TmpUsedRegs);
  2395. TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next));
  2396. { We don't need to call UpdateUsedRegs for every instruction between
  2397. p and hp2 because the register we're concerned about will not
  2398. become deallocated (otherwise GetNextInstructionUsingReg would
  2399. have stopped at an earlier instruction). [Kit] }
  2400. TempRegUsed :=
  2401. RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or
  2402. RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1);
  2403. case taicpu(p).oper[0]^.typ Of
  2404. top_reg:
  2405. begin
  2406. { change
  2407. mov %reg, %treg
  2408. mov %treg, y
  2409. to
  2410. mov %reg, y
  2411. }
  2412. CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
  2413. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2414. if taicpu(hp2).oper[1]^.reg = CurrentReg then
  2415. begin
  2416. { %reg = y - remove hp2 completely (doing it here instead of relying on
  2417. the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
  2418. if TempRegUsed then
  2419. begin
  2420. DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2);
  2421. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2422. RemoveInstruction(hp2);
  2423. end
  2424. else
  2425. begin
  2426. RemoveInstruction(hp2);
  2427. { We can remove the original MOV too }
  2428. DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
  2429. RemoveCurrentP(p, hp1);
  2430. Result:=true;
  2431. Exit;
  2432. end;
  2433. end
  2434. else
  2435. begin
  2436. AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
  2437. taicpu(hp2).loadReg(0, CurrentReg);
  2438. if TempRegUsed then
  2439. begin
  2440. { Don't remove the first instruction if the temporary register is in use }
  2441. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2);
  2442. { No need to set Result to True. If there's another instruction later on
  2443. that can be optimised, it will be detected when the main Pass 1 loop
  2444. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2445. end
  2446. else
  2447. begin
  2448. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
  2449. RemoveCurrentP(p, hp1);
  2450. Result:=true;
  2451. Exit;
  2452. end;
  2453. end;
  2454. end;
  2455. top_const:
  2456. if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
  2457. begin
  2458. { change
  2459. mov const, %treg
  2460. mov %treg, y
  2461. to
  2462. mov const, y
  2463. }
  2464. if (taicpu(hp2).oper[1]^.typ=top_reg) or
  2465. ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
  2466. begin
  2467. RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
  2468. taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
  2469. if TempRegUsed then
  2470. begin
  2471. { Don't remove the first instruction if the temporary register is in use }
  2472. DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
  2473. { No need to set Result to True. If there's another instruction later on
  2474. that can be optimised, it will be detected when the main Pass 1 loop
  2475. reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
  2476. end
  2477. else
  2478. begin
  2479. DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
  2480. RemoveCurrentP(p, hp1);
  2481. Result:=true;
  2482. Exit;
  2483. end;
  2484. end;
  2485. end;
  2486. else
  2487. Internalerror(2019103001);
  2488. end;
  2489. end;
  2490. A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
  2491. if MatchOpType(taicpu(hp2), top_reg, top_reg) and
  2492. MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
  2493. SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
  2494. begin
  2495. {
  2496. Change from:
  2497. mov ###, %reg
  2498. ...
  2499. movs/z %reg,%reg (Same register, just different sizes)
  2500. To:
  2501. movs/z ###, %reg (Longer version)
  2502. ...
  2503. (remove)
  2504. }
  2505. DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
  2506. taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
  2507. { Keep the first instruction as mov if ### is a constant }
  2508. if taicpu(p).oper[0]^.typ = top_const then
  2509. taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
  2510. else
  2511. begin
  2512. taicpu(p).opcode := taicpu(hp2).opcode;
  2513. taicpu(p).opsize := taicpu(hp2).opsize;
  2514. end;
  2515. DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
  2516. AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
  2517. RemoveInstruction(hp2);
  2518. Result := True;
  2519. Exit;
  2520. end;
  2521. else
  2522. ;
  2523. end;
  2524. end;
  2525. if (aoc_MovAnd2Mov_3 in OptsToCheck) and
  2526. (taicpu(p).oper[1]^.typ = top_reg) and
  2527. (taicpu(p).opsize = S_L) and
  2528. GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
  2529. (taicpu(hp2).opcode = A_AND) and
  2530. (MatchOpType(taicpu(hp2),top_const,top_reg) or
  2531. (MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2532. MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
  2533. ) then
  2534. begin
  2535. if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
  2536. begin
  2537. if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
  2538. ((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
  2539. begin
  2540. { Optimize out:
  2541. mov x, %reg
  2542. and ffffffffh, %reg
  2543. }
  2544. DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
  2545. RemoveInstruction(hp2);
  2546. Result:=true;
  2547. exit;
  2548. end;
  2549. end;
  2550. end;
  2551. { leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
  2552. x >= RetOffset) as it doesn't do anything (it writes either to a
  2553. parameter or to the temporary storage room for the function
  2554. result)
  2555. }
  2556. if IsExitCode(hp1) and
  2557. (taicpu(p).oper[1]^.typ = top_ref) and
  2558. (taicpu(p).oper[1]^.ref^.index = NR_NO) and
  2559. (
  2560. (
  2561. (taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
  2562. not (
  2563. assigned(current_procinfo.procdef.funcretsym) and
  2564. (taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
  2565. )
  2566. ) or
  2567. { Also discard writes to the stack that are below the base pointer,
  2568. as this is temporary storage rather than a function result on the
  2569. stack, say. }
  2570. (
  2571. (taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
  2572. (taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
  2573. )
  2574. ) then
  2575. begin
  2576. RemoveCurrentp(p, hp1);
  2577. DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
  2578. RemoveLastDeallocForFuncRes(p);
  2579. Result:=true;
  2580. exit;
  2581. end;
  2582. if MatchOpType(taicpu(p),top_reg,top_ref) and
  2583. MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) and
  2584. (taicpu(hp1).oper[1]^.typ = top_ref) and
  2585. RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
  2586. begin
  2587. { change
  2588. mov reg1, mem1
  2589. test/cmp x, mem1
  2590. to
  2591. mov reg1, mem1
  2592. test/cmp x, reg1
  2593. }
  2594. taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
  2595. DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
  2596. AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
  2597. exit;
  2598. end;
  2599. if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
  2600. { If the flags register is in use, don't change the instruction to an
  2601. ADD otherwise this will scramble the flags. [Kit] }
  2602. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
  2603. begin
  2604. if MatchOpType(Taicpu(p),top_ref,top_reg) and
  2605. ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
  2606. (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
  2607. ) or
  2608. (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
  2609. (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
  2610. )
  2611. ) then
  2612. { mov reg1,ref
  2613. lea reg2,[reg1,reg2]
  2614. to
  2615. add reg2,ref}
  2616. begin
  2617. TransferUsedRegs(TmpUsedRegs);
  2618. { reg1 may not be used afterwards }
  2619. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
  2620. begin
  2621. Taicpu(hp1).opcode:=A_ADD;
  2622. Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
  2623. DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
  2624. RemoveCurrentp(p, hp1);
  2625. result:=true;
  2626. exit;
  2627. end;
  2628. end;
  2629. { If the LEA instruction can be converted into an arithmetic instruction,
  2630. it may be possible to then fold it in the next optimisation, otherwise
  2631. there's nothing more that can be optimised here. }
  2632. if not ConvertLEA(taicpu(hp1)) then
  2633. Exit;
  2634. end;
  2635. if (taicpu(p).oper[1]^.typ = top_reg) and
  2636. (hp1.typ = ait_instruction) and
  2637. GetNextInstruction(hp1, hp2) and
  2638. MatchInstruction(hp2,A_MOV,[]) and
  2639. (SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
  2640. (
  2641. IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
  2642. {$ifdef x86_64}
  2643. or
  2644. (
  2645. (taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
  2646. IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
  2647. )
  2648. {$endif x86_64}
  2649. ) then
  2650. begin
  2651. if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
  2652. (taicpu(hp2).oper[0]^.typ=top_reg) then
  2653. { change movsX/movzX reg/ref, reg2
  2654. add/sub/or/... reg3/$const, reg2
  2655. mov reg2 reg/ref
  2656. dealloc reg2
  2657. to
  2658. add/sub/or/... reg3/$const, reg/ref }
  2659. begin
  2660. TransferUsedRegs(TmpUsedRegs);
  2661. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2662. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2663. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2664. begin
  2665. { by example:
  2666. movswl %si,%eax movswl %si,%eax p
  2667. decl %eax addl %edx,%eax hp1
  2668. movw %ax,%si movw %ax,%si hp2
  2669. ->
  2670. movswl %si,%eax movswl %si,%eax p
  2671. decw %eax addw %edx,%eax hp1
  2672. movw %ax,%si movw %ax,%si hp2
  2673. }
  2674. DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
  2675. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2676. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2677. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2678. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2679. {
  2680. ->
  2681. movswl %si,%eax movswl %si,%eax p
  2682. decw %si addw %dx,%si hp1
  2683. movw %ax,%si movw %ax,%si hp2
  2684. }
  2685. case taicpu(hp1).ops of
  2686. 1:
  2687. begin
  2688. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2689. if taicpu(hp1).oper[0]^.typ=top_reg then
  2690. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2691. end;
  2692. 2:
  2693. begin
  2694. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2695. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2696. (taicpu(hp1).opcode<>A_SHL) and
  2697. (taicpu(hp1).opcode<>A_SHR) and
  2698. (taicpu(hp1).opcode<>A_SAR) then
  2699. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2700. end;
  2701. else
  2702. internalerror(2008042701);
  2703. end;
  2704. {
  2705. ->
  2706. decw %si addw %dx,%si p
  2707. }
  2708. RemoveInstruction(hp2);
  2709. RemoveCurrentP(p, hp1);
  2710. Result:=True;
  2711. Exit;
  2712. end;
  2713. end;
  2714. if MatchOpType(taicpu(hp2),top_reg,top_reg) and
  2715. not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
  2716. ((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
  2717. { opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
  2718. ((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
  2719. )
  2720. {$ifdef i386}
  2721. { byte registers of esi, edi, ebp, esp are not available on i386 }
  2722. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2723. and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
  2724. {$endif i386}
  2725. then
  2726. { change movsX/movzX reg/ref, reg2
  2727. add/sub/or/... regX/$const, reg2
  2728. mov reg2, reg3
  2729. dealloc reg2
  2730. to
  2731. movsX/movzX reg/ref, reg3
  2732. add/sub/or/... reg3/$const, reg3
  2733. }
  2734. begin
  2735. TransferUsedRegs(TmpUsedRegs);
  2736. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2737. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  2738. If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
  2739. begin
  2740. { by example:
  2741. movswl %si,%eax movswl %si,%eax p
  2742. decl %eax addl %edx,%eax hp1
  2743. movw %ax,%si movw %ax,%si hp2
  2744. ->
  2745. movswl %si,%eax movswl %si,%eax p
  2746. decw %eax addw %edx,%eax hp1
  2747. movw %ax,%si movw %ax,%si hp2
  2748. }
  2749. DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
  2750. debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
  2751. debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
  2752. debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
  2753. { limit size of constants as well to avoid assembler errors, but
  2754. check opsize to avoid overflow when left shifting the 1 }
  2755. if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
  2756. taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
  2757. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  2758. taicpu(p).changeopsize(taicpu(hp2).opsize);
  2759. if taicpu(p).oper[0]^.typ=top_reg then
  2760. setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2761. taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
  2762. AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
  2763. {
  2764. ->
  2765. movswl %si,%eax movswl %si,%eax p
  2766. decw %si addw %dx,%si hp1
  2767. movw %ax,%si movw %ax,%si hp2
  2768. }
  2769. case taicpu(hp1).ops of
  2770. 1:
  2771. begin
  2772. taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
  2773. if taicpu(hp1).oper[0]^.typ=top_reg then
  2774. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2775. end;
  2776. 2:
  2777. begin
  2778. taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
  2779. if (taicpu(hp1).oper[0]^.typ=top_reg) and
  2780. (taicpu(hp1).opcode<>A_SHL) and
  2781. (taicpu(hp1).opcode<>A_SHR) and
  2782. (taicpu(hp1).opcode<>A_SAR) then
  2783. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  2784. end;
  2785. else
  2786. internalerror(2018111801);
  2787. end;
  2788. {
  2789. ->
  2790. decw %si addw %dx,%si p
  2791. }
  2792. RemoveInstruction(hp2);
  2793. end;
  2794. end;
  2795. end;
  2796. if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
  2797. GetNextInstruction(hp1, hp2) and
  2798. MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
  2799. MatchOperand(Taicpu(p).oper[0]^,0) and
  2800. (Taicpu(p).oper[1]^.typ = top_reg) and
  2801. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
  2802. MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
  2803. { mov reg1,0
  2804. bts reg1,operand1 --> mov reg1,operand2
  2805. or reg1,operand2 bts reg1,operand1}
  2806. begin
  2807. Taicpu(hp2).opcode:=A_MOV;
  2808. asml.remove(hp1);
  2809. insertllitem(hp2,hp2.next,hp1);
  2810. RemoveCurrentp(p, hp1);
  2811. Result:=true;
  2812. exit;
  2813. end;
  2814. end;
    { Removes a redundant register<->memory round trip:
        movXX reg1, mem1/reg2
        movXX mem1/reg2, reg1
      The second instruction is always deleted; the first is also deleted
      when its destination register is provably dead after hp1.
      Returns True when any instruction was removed. }
    function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
      var
        hp1 : tai;
      begin
        Result:=false;
        { only the two-operand forms of movXX are handled }
        if taicpu(p).ops <> 2 then
          exit;
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
          (taicpu(hp1).ops = 2) then
          begin
            { the operand kinds must be mirrored (reg->mem then mem->reg,
              or mem->reg then reg->mem) for this to be a round trip }
            if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
               (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
                { movXX reg1, mem1     or     movXX mem1, reg1
                  movXX mem2, reg2            movXX reg2, mem2}
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
                    { movXX reg1, mem1     or     movXX mem1, reg1
                      movXX mem2, reg1            movXX reg2, mem1}
                  begin
                    if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                      begin
                        { Removes the second statement from
                            movXX reg1, mem1/reg2
                            movXX mem1/reg2, reg1
                        }
                        { keep reg1 allocated across the span we collapse }
                        if taicpu(p).oper[0]^.typ=top_reg then
                          AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                        { Removes the second statement from
                            movXX mem1/reg1, reg2
                            movXX reg2, mem1/reg1
                        }
                        if (taicpu(p).oper[1]^.typ=top_reg) and
                          not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
                          begin
                            { intermediate register is dead afterwards:
                              both instructions can go }
                            DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
                            RemoveInstruction(hp1);
                            RemoveCurrentp(p); { p will now be equal to the instruction that follows what was hp1 }
                          end
                        else
                          begin
                            { only the redundant copy-back is removable }
                            DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
                            RemoveInstruction(hp1);
                          end;
                        Result:=true;
                        exit;
                      end
                  end;
              end;
          end;
      end;
  2866. function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
  2867. var
  2868. hp1 : tai;
  2869. begin
  2870. result:=false;
  2871. { replace
  2872. <Op>X %mreg1,%mreg2 // Op in [ADD,MUL]
  2873. MovX %mreg2,%mreg1
  2874. dealloc %mreg2
  2875. by
  2876. <Op>X %mreg2,%mreg1
  2877. ?
  2878. }
  2879. if GetNextInstruction(p,hp1) and
  2880. { we mix single and double opperations here because we assume that the compiler
  2881. generates vmovapd only after double operations and vmovaps only after single operations }
  2882. MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
  2883. MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
  2884. MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
  2885. (taicpu(p).oper[0]^.typ=top_reg) then
  2886. begin
  2887. TransferUsedRegs(TmpUsedRegs);
  2888. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  2889. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  2890. begin
  2891. taicpu(p).loadoper(0,taicpu(hp1).oper[0]^);
  2892. taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
  2893. DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
  2894. RemoveInstruction(hp1);
  2895. result:=true;
  2896. end;
  2897. end;
  2898. end;
    { Pass-1 optimisations for LEA:
        - strips meaningless segment prefixes,
        - lea (%reg1),%reg2            -> mov %reg1,%reg2 (or nop),
        - lea off(%reg),%reg           -> add/inc/dec via ConvertLEA,
        - lea;mov reg2,reg3            -> lea directly into reg3,
        - lea off1(rX),r1; lea off2(r1),r1 -> one combined lea,
        - folds the lea's address computation into a following
          instruction's memory operand when legal.
      Returns True when p was replaced or removed. }
    function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
      var
        hp1, hp2, hp3: tai;
        l : ASizeInt;
        ref: Integer;
        saveref: treference;
      begin
        Result:=false;
        { removes seg register prefixes from LEA operations, as they
          don't do anything}
        taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
        { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
        if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
           (taicpu(p).oper[0]^.ref^.index = NR_NO) and
           { do not mess with leas acessing the stack pointer }
           (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
           (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
          begin
            if (taicpu(p).oper[0]^.ref^.offset = 0) then
              begin
                if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
                  begin
                    { plain register copy }
                    hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
                      taicpu(p).oper[1]^.reg);
                    InsertLLItem(p.previous,p.next, hp1);
                    DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
                    p.free;
                    p:=hp1;
                  end
                else
                  begin
                    { lea (%reg),%reg is a no-op }
                    DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
                    RemoveCurrentP(p);
                  end;
                Result:=true;
                exit;
              end
            else if (
                      { continue to use lea to adjust the stack pointer,
                        it is the recommended way, but only if not optimizing for size }
                      (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
                      (cs_opt_size in current_settings.optimizerswitches)
                    ) and
                    { If the flags register is in use, don't change the instruction
                      to an ADD otherwise this will scramble the flags. [Kit] }
                    not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
                    ConvertLEA(taicpu(p)) then
              begin
                Result:=true;
                exit;
              end;
          end;
        { lea <ref>,reg2; mov reg2,reg3 -> lea <ref>,reg3 when reg2 dies }
        if GetNextInstruction(p,hp1) and
          MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
          MatchOpType(Taicpu(hp1),top_reg,top_reg) and
          (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
                RemoveInstruction(hp1);
                result:=true;
              end;
          end;
        { changes
            lea offset1(regX), reg1
            lea offset2(reg1), reg1
          to
            lea offset1+offset2(regX), reg1 }
        { for now, we do not mess with the stack pointer, thought it might be usefull to remove
          unneeded lea sequences on the stack pointer, it needs to be tested in detail }
        if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
          GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
          MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
          { the first lea must have no symbol/segment info so the references can merge }
          (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
          (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
          (taicpu(p).oper[0]^.ref^.symbol=nil) and
          { one of three compatible shapes: reg1 is the second lea's base,
            its index, or its base with the first lea contributing only an index }
          (((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
            (taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) and
            (taicpu(p).oper[0]^.ref^.index=NR_NO) and
            (taicpu(p).oper[0]^.ref^.index=taicpu(hp1).oper[0]^.ref^.index) and
            (taicpu(p).oper[0]^.ref^.scalefactor=taicpu(hp1).oper[0]^.ref^.scalefactor)
           ) or
           ((taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg) and
            (taicpu(p).oper[0]^.ref^.index=NR_NO)
           ) or
           ((taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg) and
            (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) and
            (taicpu(p).oper[0]^.ref^.base=NR_NO) and
            not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1)))
          ) and
          { the registers we substitute must not change between the two leas }
          not(RegUsedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1)) and
          (taicpu(p).oper[0]^.ref^.relsymbol=taicpu(hp1).oper[0]^.ref^.relsymbol) and
          (taicpu(p).oper[0]^.ref^.segment=taicpu(hp1).oper[0]^.ref^.segment) and
          (taicpu(p).oper[0]^.ref^.symbol=taicpu(hp1).oper[0]^.ref^.symbol) then
          begin
            DebugMsg(SPeepholeOptimization + 'LeaLea2Lea done',p);
            if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
              begin
                { reg1 served as index: the first lea's offset is scaled }
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                { if the register is used as index and base, we have to increase for base as well
                  and adapt base }
                if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                  begin
                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                  end;
              end
            else
              begin
                { reg1 served as base: offset is added unscaled }
                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
              end;
            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
              begin
                { carry over the first lea's scaled index; the second lea's
                  index becomes the base (it had scalefactor 0/1) }
                taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
              end;
            RemoveCurrentP(p);
            result:=true;
            exit;
          end;
        { changes
            lea <ref1>, reg1
            <op> ...,<ref. with reg1>,...
          to
            <op> ...,<ref1>,... }
        if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
           (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
           GetNextInstruction(p,hp1) and
           (hp1.typ=ait_instruction) and
           not(MatchInstruction(hp1,A_LEA,[])) then
          begin
            { find a reference which uses reg1 }
            if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
              ref:=0
            else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
              ref:=1
            else
              ref:=-1;
            if (ref<>-1) and
              { reg1 must be either the base or the index }
              ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
              begin
                { reg1 can be removed from the reference }
                saveref:=taicpu(hp1).oper[ref]^.ref^;
                if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
                else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
                else
                  Internalerror(2019111201);
                { check if the can insert all data of the lea into the second instruction }
                if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                  ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
                  ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
                  ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
                  ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
                  ((taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) or (taicpu(hp1).oper[ref]^.ref^.scalefactor in [0,1])) and
                  (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
                  { the combined displacement must fit a signed 32 bit immediate,
                    and RIP-relative addressing tolerates no other components }
                  and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
                  and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                       ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                      )
{$endif x86_64}
                  then
                  begin
                    { reg1 might not used by the second instruction after it is remove from the reference }
                    if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                      begin
                        TransferUsedRegs(TmpUsedRegs);
                        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                        { reg1 is not updated so it might not be used afterwards }
                        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                            if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                            if taicpu(p).oper[0]^.ref^.symbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                            if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                            if not(taicpu(p).oper[0]^.ref^.scalefactor in [0,1]) then
                              taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                            inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                            RemoveCurrentP(p, hp1);
                            result:=true;
                            exit;
                          end
                      end;
                  end;
                { recover }
                taicpu(hp1).oper[ref]^.ref^:=saveref;
              end;
          end;
      end;
    { Merges a preceding dec/sub/add on the same register (and same opsize)
      into the current "sub $const,%reg" instruction p:
          dec %reg;  sub $c,%reg  ->  sub $(c+1),%reg
          sub $a,%reg; sub $c,%reg -> sub $(a+c),%reg
          add $a,%reg; sub $c,%reg -> sub $(c-a),%reg  (dropped if zero)
      Returns True only when the sub collapsed to zero and p itself was
      removed (so the caller must restart from the new p). }
    function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
      var
        hp1 : tai;
      begin
        DoSubAddOpt := False;
        if GetLastInstruction(p, hp1) and
           (hp1.typ = ait_instruction) and
           (taicpu(hp1).opsize = taicpu(p).opsize) then
          case taicpu(hp1).opcode Of
            A_DEC:
              if (taicpu(hp1).oper[0]^.typ = top_reg) and
                 MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                begin
                  { dec %reg is equivalent to sub $1,%reg }
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
                  RemoveInstruction(hp1);
                end;
            A_SUB:
              if MatchOpType(taicpu(hp1),top_const,top_reg) and
                 MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                begin
                  { fold the two subtracted constants }
                  taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
                  RemoveInstruction(hp1);
                end;
            A_ADD:
              begin
                if MatchOpType(taicpu(hp1),top_const,top_reg) and
                   MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
                  begin
                    { add cancels against the sub }
                    taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
                    RemoveInstruction(hp1);
                    if (taicpu(p).oper[0]^.val = 0) then
                      begin
                        { net change is zero: drop p as well and step back so
                          the caller re-examines from the previous instruction }
                        hp1 := tai(p.next);
                        RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
                        if not GetLastInstruction(hp1, p) then
                          p := hp1;
                        DoSubAddOpt := True;
                      end
                  end;
              end;
            else
              ;
          end;
      end;
  3149. function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  3150. {$ifdef i386}
  3151. var
  3152. hp1 : tai;
  3153. {$endif i386}
  3154. begin
  3155. Result:=false;
  3156. { * change "subl $2, %esp; pushw x" to "pushl x"}
  3157. { * change "sub/add const1, reg" or "dec reg" followed by
  3158. "sub const2, reg" to one "sub ..., reg" }
  3159. if MatchOpType(taicpu(p),top_const,top_reg) then
  3160. begin
  3161. {$ifdef i386}
  3162. if (taicpu(p).oper[0]^.val = 2) and
  3163. (taicpu(p).oper[1]^.reg = NR_ESP) and
  3164. { Don't do the sub/push optimization if the sub }
  3165. { comes from setting up the stack frame (JM) }
  3166. (not(GetLastInstruction(p,hp1)) or
  3167. not(MatchInstruction(hp1,A_MOV,[S_L]) and
  3168. MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
  3169. MatchOperand(taicpu(hp1).oper[0]^,NR_EBP))) then
  3170. begin
  3171. hp1 := tai(p.next);
  3172. while Assigned(hp1) and
  3173. (tai(hp1).typ in [ait_instruction]+SkipInstr) and
  3174. not RegReadByInstruction(NR_ESP,hp1) and
  3175. not RegModifiedByInstruction(NR_ESP,hp1) do
  3176. hp1 := tai(hp1.next);
  3177. if Assigned(hp1) and
  3178. MatchInstruction(hp1,A_PUSH,[S_W]) then
  3179. begin
  3180. taicpu(hp1).changeopsize(S_L);
  3181. if taicpu(hp1).oper[0]^.typ=top_reg then
  3182. setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
  3183. hp1 := tai(p.next);
  3184. RemoveCurrentp(p, hp1);
  3185. Result:=true;
  3186. exit;
  3187. end;
  3188. end;
  3189. {$endif i386}
  3190. if DoSubAddOpt(p) then
  3191. Result:=true;
  3192. end;
  3193. end;
  3194. function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
  3195. var
  3196. TmpBool1,TmpBool2 : Boolean;
  3197. tmpref : treference;
  3198. hp1,hp2: tai;
  3199. mask: tcgint;
  3200. begin
  3201. Result:=false;
  3202. { All these optimisations work on "shl/sal const,%reg" }
  3203. if not MatchOpType(taicpu(p),top_const,top_reg) then
  3204. Exit;
  3205. if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
  3206. (taicpu(p).oper[0]^.val <= 3) then
  3207. { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
  3208. begin
  3209. { should we check the next instruction? }
  3210. TmpBool1 := True;
  3211. { have we found an add/sub which could be
  3212. integrated in the lea? }
  3213. TmpBool2 := False;
  3214. reference_reset(tmpref,2,[]);
  3215. TmpRef.index := taicpu(p).oper[1]^.reg;
  3216. TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
  3217. while TmpBool1 and
  3218. GetNextInstruction(p, hp1) and
  3219. (tai(hp1).typ = ait_instruction) and
  3220. ((((taicpu(hp1).opcode = A_ADD) or
  3221. (taicpu(hp1).opcode = A_SUB)) and
  3222. (taicpu(hp1).oper[1]^.typ = Top_Reg) and
  3223. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
  3224. (((taicpu(hp1).opcode = A_INC) or
  3225. (taicpu(hp1).opcode = A_DEC)) and
  3226. (taicpu(hp1).oper[0]^.typ = Top_Reg) and
  3227. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
  3228. ((taicpu(hp1).opcode = A_LEA) and
  3229. (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
  3230. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
  3231. (not GetNextInstruction(hp1,hp2) or
  3232. not instrReadsFlags(hp2)) Do
  3233. begin
  3234. TmpBool1 := False;
  3235. if taicpu(hp1).opcode=A_LEA then
  3236. begin
  3237. if (TmpRef.base = NR_NO) and
  3238. (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
  3239. (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
  3240. (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
  3241. ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
  3242. (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
  3243. begin
  3244. TmpBool1 := True;
  3245. TmpBool2 := True;
  3246. inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
  3247. if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
  3248. tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
  3249. TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
  3250. RemoveInstruction(hp1);
  3251. end
  3252. end
  3253. else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
  3254. begin
  3255. TmpBool1 := True;
  3256. TmpBool2 := True;
  3257. case taicpu(hp1).opcode of
  3258. A_ADD:
  3259. inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
  3260. A_SUB:
  3261. dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
  3262. else
  3263. internalerror(2019050536);
  3264. end;
  3265. RemoveInstruction(hp1);
  3266. end
  3267. else
  3268. if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
  3269. (((taicpu(hp1).opcode = A_ADD) and
  3270. (TmpRef.base = NR_NO)) or
  3271. (taicpu(hp1).opcode = A_INC) or
  3272. (taicpu(hp1).opcode = A_DEC)) then
  3273. begin
  3274. TmpBool1 := True;
  3275. TmpBool2 := True;
  3276. case taicpu(hp1).opcode of
  3277. A_ADD:
  3278. TmpRef.base := taicpu(hp1).oper[0]^.reg;
  3279. A_INC:
  3280. inc(TmpRef.offset);
  3281. A_DEC:
  3282. dec(TmpRef.offset);
  3283. else
  3284. internalerror(2019050535);
  3285. end;
  3286. RemoveInstruction(hp1);
  3287. end;
  3288. end;
  3289. if TmpBool2
  3290. {$ifndef x86_64}
  3291. or
  3292. ((current_settings.optimizecputype < cpu_Pentium2) and
  3293. (taicpu(p).oper[0]^.val <= 3) and
  3294. not(cs_opt_size in current_settings.optimizerswitches))
  3295. {$endif x86_64}
  3296. then
  3297. begin
  3298. if not(TmpBool2) and
  3299. (taicpu(p).oper[0]^.val=1) then
  3300. begin
  3301. hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
  3302. taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
  3303. end
  3304. else
  3305. hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
  3306. taicpu(p).oper[1]^.reg);
  3307. DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
  3308. InsertLLItem(p.previous, p.next, hp1);
  3309. p.free;
  3310. p := hp1;
  3311. end;
  3312. end
  3313. {$ifndef x86_64}
  3314. else if (current_settings.optimizecputype < cpu_Pentium2) then
  3315. begin
  3316. { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
  3317. but faster on a 486, and Tairable in both U and V pipes on the Pentium
  3318. (unlike shl, which is only Tairable in the U pipe) }
  3319. if taicpu(p).oper[0]^.val=1 then
  3320. begin
  3321. hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
  3322. taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
  3323. InsertLLItem(p.previous, p.next, hp1);
  3324. p.free;
  3325. p := hp1;
  3326. end
  3327. { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
  3328. "shl $3, %reg" to "lea (,%reg,8), %reg }
  3329. else if (taicpu(p).opsize = S_L) and
  3330. (taicpu(p).oper[0]^.val<= 3) then
  3331. begin
  3332. reference_reset(tmpref,2,[]);
  3333. TmpRef.index := taicpu(p).oper[1]^.reg;
  3334. TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
  3335. hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
  3336. InsertLLItem(p.previous, p.next, hp1);
  3337. p.free;
  3338. p := hp1;
  3339. end;
  3340. end
  3341. {$endif x86_64}
  3342. else if
  3343. GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
  3344. (
  3345. (
  3346. MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
  3347. SetAndTest(hp1, hp2)
  3348. {$ifdef x86_64}
  3349. ) or
  3350. (
  3351. MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
  3352. GetNextInstruction(hp1, hp2) and
  3353. MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
  3354. MatchOpType(taicpu(hp2), top_reg, top_reg) and
  3355. (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
  3356. {$endif x86_64}
  3357. )
  3358. ) and
  3359. (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
  3360. begin
  3361. { Change:
  3362. shl x, %reg1
  3363. mov -(1<<x), %reg2
  3364. and %reg2, %reg1
  3365. Or:
  3366. shl x, %reg1
  3367. and -(1<<x), %reg1
  3368. To just:
  3369. shl x, %reg1
  3370. Since the and operation only zeroes bits that are already zero from the shl operation
  3371. }
  3372. case taicpu(p).oper[0]^.val of
  3373. 8:
  3374. mask:=$FFFFFFFFFFFFFF00;
  3375. 16:
  3376. mask:=$FFFFFFFFFFFF0000;
  3377. 32:
  3378. mask:=$FFFFFFFF00000000;
  3379. 63:
  3380. { Constant pre-calculated to prevent overflow errors with Int64 }
  3381. mask:=$8000000000000000;
  3382. else
  3383. begin
  3384. if taicpu(p).oper[0]^.val >= 64 then
  3385. { Shouldn't happen realistically, since the register
  3386. is guaranteed to be set to zero at this point }
  3387. mask := 0
  3388. else
  3389. mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
  3390. end;
  3391. end;
  3392. if taicpu(hp1).oper[0]^.val = mask then
  3393. begin
  3394. { Everything checks out, perform the optimisation, as long as
  3395. the FLAGS register isn't being used}
  3396. TransferUsedRegs(TmpUsedRegs);
  3397. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  3398. {$ifdef x86_64}
  3399. if (hp1 <> hp2) then
  3400. begin
  3401. { "shl/mov/and" version }
  3402. UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
  3403. { Don't do the optimisation if the FLAGS register is in use }
  3404. if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
  3405. begin
  3406. DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
  3407. { Don't remove the 'mov' instruction if its register is used elsewhere }
  3408. if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
  3409. begin
  3410. RemoveInstruction(hp1);
  3411. Result := True;
  3412. end;
  3413. { Only set Result to True if the 'mov' instruction was removed }
  3414. RemoveInstruction(hp2);
  3415. end;
  3416. end
  3417. else
  3418. {$endif x86_64}
  3419. begin
  3420. { "shl/and" version }
  3421. { Don't do the optimisation if the FLAGS register is in use }
  3422. if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
  3423. begin
  3424. DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
  3425. RemoveInstruction(hp1);
  3426. Result := True;
  3427. end;
  3428. end;
  3429. Exit;
  3430. end
  3431. else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
  3432. begin
  3433. { Even if the mask doesn't allow for its removal, we might be
  3434. able to optimise the mask for the "shl/and" version, which
  3435. may permit other peephole optimisations }
  3436. {$ifdef DEBUG_AOPTCPU}
  3437. mask := taicpu(hp1).oper[0]^.val and mask;
  3438. if taicpu(hp1).oper[0]^.val <> mask then
  3439. begin
  3440. DebugMsg(
  3441. SPeepholeOptimization +
  3442. 'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
  3443. ' to $' + debug_tostr(mask) +
  3444. 'based on previous instruction (ShlAnd2ShlAnd)', hp1);
  3445. taicpu(hp1).oper[0]^.val := mask;
  3446. end;
  3447. {$else DEBUG_AOPTCPU}
  3448. { If debugging is off, just set the operand even if it's the same }
  3449. taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
  3450. {$endif DEBUG_AOPTCPU}
  3451. end;
  3452. end;
  3453. end;
    { Collapses a set(C)/test-or-cmp/Jcc sequence into a direct conditional
      jump on the original condition, removing the test/cmp and, when the
      byte register is dead afterwards, the set(C) as well.
      Returns True only when p itself was removed. }
    function TX86AsmOptimizer.OptPass1SETcc(var p: tai): boolean;
      var
        hp1,hp2,next: tai; SetC, JumpC: TAsmCond; Unconditional: Boolean;
      begin
        Result:=false;
        { require: setcc %reg, then either "test %reg,%reg" or "cmp $0,%reg"
          on the same byte register, then a conditional jump }
        if MatchOpType(taicpu(p),top_reg) and
           GetNextInstruction(p, hp1) and
           ((MatchInstruction(hp1, A_TEST, [S_B]) and
             MatchOpType(taicpu(hp1),top_reg,top_reg) and
             (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg)) or
            (MatchInstruction(hp1, A_CMP, [S_B]) and
             MatchOpType(taicpu(hp1),top_const,top_reg) and
             (taicpu(hp1).oper[0]^.val=0))
           ) and
           (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
           GetNextInstruction(hp1, hp2) and
           MatchInstruction(hp2, A_Jcc, []) then
          { Change from:             To:
            set(C) %reg              j(~C) label
            test   %reg,%reg/cmp $0,%reg
            je     label
            set(C) %reg              j(C)  label
            test   %reg,%reg/cmp $0,%reg
            jne    label
          }
          begin
            next := tai(p.Next);
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, next);
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
            JumpC := taicpu(hp2).condition;
            Unconditional := False;
            { je branches when the set register is zero, i.e. when the set
              condition did NOT hold -> invert; jne keeps the condition }
            if conditions_equal(JumpC, C_E) then
              SetC := inverse_cond(taicpu(p).condition)
            else if conditions_equal(JumpC, C_NE) then
              SetC := taicpu(p).condition
            else
              { We've got something weird here (and inefficent) }
              begin
                DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                SetC := C_NONE;
                { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                if condition_in(C_AE, JumpC) then
                  Unconditional := True
                else
                  { Not sure what to do with this jump - drop out }
                  Exit;
              end;
            RemoveInstruction(hp1);
            if Unconditional then
              MakeUnconditional(taicpu(hp2))
            else
              begin
                if SetC = C_NONE then
                  InternalError(2018061401);
                taicpu(hp2).SetCondition(SetC);
              end;
            { the set instruction can only go if its register dies here }
            if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then
              begin
                RemoveCurrentp(p, hp2);
                Result := True;
              end;
            DebugMsg(SPeepholeOptimization + 'SETcc/TESTCmp/Jcc -> Jcc',p);
          end;
      end;
function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
  { Peephole pass-1 handler for FSTP/FISTP: looks for the pattern
      fstp  mem ; fld  mem     (or  fistp mem ; fild mem)
    where both instructions use the same memory reference and the same
    operand size, and either removes the pair (dead store/reload of a
    local right before exit code) or merges it into a single non-popping
    FST/FIST.
    Returns true if a "continue" should be done after this optimization. }
  var
    hp1, hp2: tai;
  begin
    Result := false;
    if MatchOpType(taicpu(p),top_ref) and
      GetNextInstruction(p, hp1) and
      (hp1.typ = ait_instruction) and
      { accept the float pair FSTP/FLD or the integer pair FISTP/FILD }
      (((taicpu(hp1).opcode = A_FLD) and
      (taicpu(p).opcode = A_FSTP)) or
      ((taicpu(p).opcode = A_FISTP) and
      (taicpu(hp1).opcode = A_FILD))) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).opsize = taicpu(p).opsize) and
      RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
      begin
        { replacing fstp f;fld f by fst f is only valid for extended because of rounding or if fastmath is on }
        if ((taicpu(p).opsize=S_FX) or (cs_opt_fastmath in current_settings.optimizerswitches)) and
          GetNextInstruction(hp1, hp2) and
          (hp2.typ = ait_instruction) and
          IsExitCode(hp2) and
          { the store must target a frame-pointer based local ... }
          (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
          { ... that does not lie at/below the function result location ... }
          not(assigned(current_procinfo.procdef.funcretsym) and
          (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
          { ... and must have no index register }
          (taicpu(p).oper[0]^.ref^.index = NR_NO) then
          begin
            { the store/reload of a local immediately before the exit code
              is dead: remove both instructions }
            RemoveInstruction(hp1);
            RemoveCurrentP(p, hp2);
            RemoveLastDeallocForFuncRes(p);
            Result := true;
          end
        else
          { we can do this only in fast math mode as fstp is rounding ...
            ... still disabled as it breaks the compiler and/or rtl }
          if ({ (cs_opt_fastmath in current_settings.optimizerswitches) or }
            { ... or if another fstp equal to the first one follows }
            (GetNextInstruction(hp1,hp2) and
            (hp2.typ = ait_instruction) and
            (taicpu(p).opcode=taicpu(hp2).opcode) and
            (taicpu(p).opsize=taicpu(hp2).opsize))
            ) and
            { fst can't store an extended/comp value }
            (taicpu(p).opsize <> S_FX) and
            (taicpu(p).opsize <> S_IQ) then
            begin
              { merge the pair into a single non-popping store, leaving
                the value on the FPU stack }
              if (taicpu(p).opcode = A_FSTP) then
                taicpu(p).opcode := A_FST
              else
                taicpu(p).opcode := A_FIST;
              DebugMsg(SPeepholeOptimization + 'FstpFld2Fst',p);
              RemoveInstruction(hp1);
            end;
      end;
  end;
function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
  { Peephole pass-1 handler for FLD: merges an FLD (from a register or
    from memory) with a following popping FxxxP instruction into a single
    non-popping instruction, taking care to reverse the non-commutative
    operations (sub/div) because the operand roles swap. }
  var
    hp1, hp2: tai;
  begin
    result:=false;
    if MatchOpType(taicpu(p),top_reg) and
      GetNextInstruction(p, hp1) and
      (hp1.typ = Ait_Instruction) and
      MatchOpType(taicpu(hp1),top_reg,top_reg) and
      (taicpu(hp1).oper[0]^.reg = NR_ST) and
      (taicpu(hp1).oper[1]^.reg = NR_ST1) then
      { change to
          fld reg               fxxx reg,st
          fxxxp st, st1 (hp1)
        Remark: non commutative operations must be reversed!
      }
      begin
        case taicpu(hp1).opcode Of
          A_FMULP,A_FADDP,
          A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
            begin
              { SUB/DIV become their R (reversed) counterparts and vice
                versa because the pushed value changes which operand is
                the minuend/dividend }
              case taicpu(hp1).opcode Of
                A_FADDP: taicpu(hp1).opcode := A_FADD;
                A_FMULP: taicpu(hp1).opcode := A_FMUL;
                A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
                A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
                A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
                A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
                else
                  internalerror(2019050534);
              end;
              { operate directly on the source register instead of the
                freshly pushed copy }
              taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
              taicpu(hp1).oper[1]^.reg := NR_ST;
              RemoveCurrentP(p, hp1);
              Result:=true;
              exit;
            end;
          else
            ;
        end;
      end
    else
      if MatchOpType(taicpu(p),top_ref) and
        GetNextInstruction(p, hp2) and
        (hp2.typ = Ait_Instruction) and
        MatchOpType(taicpu(hp2),top_reg,top_reg) and
        (taicpu(p).opsize in [S_FS, S_FL]) and
        (taicpu(hp2).oper[0]^.reg = NR_ST) and
        (taicpu(hp2).oper[1]^.reg = NR_ST1) then
        { the value loaded by p may already be on the stack if the
          previous instruction loaded/stored the very same reference }
        if GetLastInstruction(p, hp1) and
          MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
          MatchOpType(taicpu(hp1),top_ref) and
          RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
          if ((taicpu(hp2).opcode = A_FMULP) or
            (taicpu(hp2).opcode = A_FADDP)) then
          { change to
              fld/fst   mem1  (hp1)      fld/fst   mem1
              fld       mem1  (p)        fadd/
              faddp/                     fmul st, st
              fmulp  st, st1 (hp2) }
            begin
              RemoveCurrentP(p, hp1);
              if (taicpu(hp2).opcode = A_FADDP) then
                taicpu(hp2).opcode := A_FADD
              else
                taicpu(hp2).opcode := A_FMUL;
              taicpu(hp2).oper[1]^.reg := NR_ST;
            end
          else
          { change to
              fld/fst  mem1 (hp1)   fld/fst  mem1
              fld      mem1 (p)     fld      st}
            begin
              taicpu(p).changeopsize(S_FL);
              taicpu(p).loadreg(0,NR_ST);
            end
        else
          begin
            { no matching preceding load/store: fold the memory operand
              straight into the arithmetic instruction }
            case taicpu(hp2).opcode Of
              A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
              { change to
                  fld/fst mem1 (hp1)      fld/fst mem1
                  fld     mem2 (p)        fxxx    mem2
                  fxxxp   st, st1 (hp2) }
                begin
                  { again, reverse the non-commutative operations }
                  case taicpu(hp2).opcode Of
                    A_FADDP: taicpu(p).opcode := A_FADD;
                    A_FMULP: taicpu(p).opcode := A_FMUL;
                    A_FSUBP: taicpu(p).opcode := A_FSUBR;
                    A_FSUBRP: taicpu(p).opcode := A_FSUB;
                    A_FDIVP: taicpu(p).opcode := A_FDIVR;
                    A_FDIVRP: taicpu(p).opcode := A_FDIV;
                    else
                      internalerror(2019050533);
                  end;
                  RemoveInstruction(hp2);
                end
              else
                ;
            end
          end
  end;
function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
  { Peephole pass-1 handler for CMP with a constant first operand:
      - "cmp $0,%reg"  -> "test %reg,%reg", rewriting the conditions of
        every Jcc/SETcc that immediately follows (unsigned conditions
        become always/never/E/NE against zero);
      - "cmp $1,r/m; jl" -> "cmp $0,r/m; jle" (then the reg case falls
        into the TEST conversion as well);
      - "cmp $(sign bit),%reg; je/jne" -> "neg %reg; jo/jno" when the
        register is not used afterwards. }
  var
    v: TCGInt;
    hp1, hp2: tai;
  begin
    Result:=false;
    if taicpu(p).oper[0]^.typ = top_const then
      begin
        { Though GetNextInstruction can be factored out, it is an expensive
          call, so delay calling it until we have first checked cheaper
          conditions that are independent of it. }
        if (taicpu(p).oper[0]^.val = 0) and
          (taicpu(p).oper[1]^.typ = top_reg) and
          GetNextInstruction(p, hp1) and
          MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
          begin
            hp2 := p;
            { When dealing with "cmp $0,%reg", only ZF and SF contain
              anything meaningful once it's converted to "test %reg,%reg";
              additionally, some jumps will always (or never) branch, so
              evaluate every jump immediately following the
              comparison, optimising the conditions if possible.
              Similarly with SETcc... those that are always set to 0 or 1
              are changed to MOV instructions }
            while GetNextInstruction(hp2, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[]) do
              begin
                case taicpu(hp1).condition of
                  C_B, C_C, C_NAE, C_O:
                    { For B/NAE:
                        Will never branch since an unsigned integer can never be below zero
                      For C/O:
                        Result cannot overflow because 0 is being subtracted
                    }
                    begin
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                          { drop the reference count of the now-unreached label }
                          TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                          RemoveInstruction(hp1);
                          { Since hp1 was deleted, hp2 must not be updated }
                          Continue;
                        end
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                          { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).ops := 2;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 0);
                        end;
                    end;
                  C_BE, C_NA:
                    begin
                      { Will only branch if equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                      taicpu(hp1).condition := C_E;
                    end;
                  C_A, C_NBE:
                    begin
                      { Will only branch if not equal to zero }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                      taicpu(hp1).condition := C_NE;
                    end;
                  C_AE, C_NB, C_NC, C_NO:
                    begin
                      { Will always branch }
                      DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                      if taicpu(hp1).opcode = A_Jcc then
                        begin
                          MakeUnconditional(taicpu(hp1));
                          { Any jumps/set that follow will now be dead code }
                          RemoveDeadCodeAfterJump(taicpu(hp1));
                          Break;
                        end
                      else
                        begin
                          { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                          taicpu(hp1).opcode := A_MOV;
                          taicpu(hp1).ops := 2;
                          taicpu(hp1).condition := C_None;
                          taicpu(hp1).opsize := S_B;
                          taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                          taicpu(hp1).loadconst(0, 1);
                        end;
                    end;
                  C_None:
                    InternalError(2020012201);
                  C_P, C_PE, C_NP, C_PO:
                    { We can't handle parity checks and they should never be generated
                      after a general-purpose CMP (it's used in some floating-point
                      comparisons that don't use CMP) }
                    InternalError(2020012202);
                  else
                    { Zero/Equality, Sign, their complements and all of the
                      signed comparisons do not need to be converted };
                end;
                hp2 := hp1;
              end;
            { Convert the instruction to a TEST }
            taicpu(p).opcode := A_TEST;
            taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
            Result := True;
            Exit;
          end
        else if (taicpu(p).oper[0]^.val = 1) and
          GetNextInstruction(p, hp1) and
          MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
          (taicpu(hp1).condition in [C_L, C_NGE]) then
          begin
            { Convert;       To:
                cmp $1,r/m     cmp $0,r/m
                jl  @lbl       jle @lbl
            }
            DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
            taicpu(p).oper[0]^.val := 0;
            taicpu(hp1).condition := C_LE;
            { If the instruction is now "cmp $0,%reg", convert it to a
              TEST (and effectively do the work of the "cmp $0,%reg" in
              the block above)
              If it's a reference, we can get away with not setting
              Result to True because we haven't evaluated the jump
              in this pass yet.
            }
            if (taicpu(p).oper[1]^.typ = top_reg) then
              begin
                taicpu(p).opcode := A_TEST;
                taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
            Exit;
          end
        else if (taicpu(p).oper[1]^.typ = top_reg) then
          begin
            { cmp register,$8000                neg register
              je target                 -->     jo target

              .... only if register is deallocated before jump.}
            { v receives the sign-bit constant for the operand size }
            case Taicpu(p).opsize of
              S_B: v:=$80;
              S_W: v:=$8000;
              S_L: v:=qword($80000000);
              { S_Q will never happen: cmp with 64 bit constants is not possible }
              S_Q:
                Exit;
              else
                internalerror(2013112905);
            end;
            if (taicpu(p).oper[0]^.val=v) and
              GetNextInstruction(p, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
              (Taicpu(hp1).condition in [C_E,C_NE]) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs,tai(p.next));
                { NEG clobbers the register, so it must be dead here }
                if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                    Taicpu(p).opcode:=A_NEG;
                    Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                    Taicpu(p).clearop(1);
                    Taicpu(p).ops:=1;
                    { negating the minimum signed value is the only case
                      that sets OF, which matches equality with v }
                    if Taicpu(hp1).condition=C_E then
                      Taicpu(hp1).condition:=C_O
                    else
                      Taicpu(hp1).condition:=C_NO;
                    Result:=true;
                    exit;
                  end;
              end;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
  { Peephole pass-1 handler for PXOR: removes a redundant second
    self-XOR of the same register, and retargets a self-XOR whose
    result is only copied away by a MOVAPD/MOVAPS. }
  var
    hp1: tai;
  begin
    {
      remove the second (v)pxor from

        pxor reg,reg
        ...
        pxor reg,reg
    }
    Result:=false;
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      MatchOpType(taicpu(p),top_reg,top_reg) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
        RemoveInstruction(hp1);
        Result:=true;
        Exit;
      end
    {
      replace
        pxor reg1,reg1
        movapd/s reg1,reg2
        dealloc reg1

      by
        pxor reg2,reg2
    }
    else if GetNextInstruction(p,hp1) and
      { we mix single and double operations here because we assume that the compiler
        generates vmovapd only after double operations and vmovaps only after single operations }
      MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
      MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
      (taicpu(p).oper[0]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        { only valid if reg1 really dies at the move }
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
          begin
            { zero reg2 directly and drop the move }
            taicpu(p).loadoper(0,taicpu(hp1).oper[1]^);
            taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'PXorMovapd2PXor done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;
function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
  { Peephole pass-1 handler for the three-operand VPXOR: removes a
    redundant second self-XOR of the same register; everything else is
    delegated to the generic VOP handler. }
  var
    hp1: tai;
  begin
    {
      remove the second (v)pxor from

        (v)pxor reg,reg
        ...
        (v)pxor reg,reg
    }
    Result:=false;
    if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
      MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'VPXorVPXor2PXor done',hp1);
        RemoveInstruction(hp1);
        Result:=true;
        Exit;
      end
    else
      { fall back to the common vector-operation optimisations }
      Result:=OptPass1VOP(p);
  end;
function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean;
  { Peephole pass-1 handler for the three-operand IMUL: folds a
    following MOV of the product into the IMUL's destination operand
    when the intermediate register is not used afterwards. }
  var
    hp1 : tai;
  begin
    result:=false;
    { replace
        IMul   const,%mreg1,%mreg2
        Mov    %reg2,%mreg3
        dealloc  %mreg3

        by
        Imul   const,%mreg1,%mreg23
    }
    if (taicpu(p).ops=3) and
      GetNextInstruction(p,hp1) and
      MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        { the intermediate destination (the MOV's source) must die at
          the MOV for the fold to be safe }
        if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
          begin
            taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'ImulMov2Imul done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;
  3957. function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;
  3958. function IsXCHGAcceptable: Boolean; inline;
  3959. begin
  3960. { Always accept if optimising for size }
  3961. Result := (cs_opt_size in current_settings.optimizerswitches) or
  3962. (
  3963. {$ifdef x86_64}
  3964. { XCHG takes 3 cycles on AMD Athlon64 }
  3965. (current_settings.optimizecputype >= cpu_core_i)
  3966. {$else x86_64}
  3967. { From the Pentium M onwards, XCHG only has a latency of 2 rather
  3968. than 3, so it becomes a saving compared to three MOVs with two of
  3969. them able to execute simultaneously. [Kit] }
  3970. (current_settings.optimizecputype >= cpu_PentiumM)
  3971. {$endif x86_64}
  3972. );
  3973. end;
  3974. var
  3975. NewRef: TReference;
  3976. hp1,hp2,hp3: tai;
  3977. {$ifndef x86_64}
  3978. hp4: tai;
  3979. OperIdx: Integer;
  3980. {$endif x86_64}
  3981. begin
  3982. Result:=false;
  3983. if not GetNextInstruction(p, hp1) then
  3984. Exit;
  3985. if MatchInstruction(hp1, A_JMP, [S_NO]) then
  3986. begin
  3987. { Sometimes the MOVs that OptPass2JMP produces can be improved
  3988. further, but we can't just put this jump optimisation in pass 1
  3989. because it tends to perform worse when conditional jumps are
  3990. nearby (e.g. when converting CMOV instructions). [Kit] }
  3991. if OptPass2JMP(hp1) then
  3992. { call OptPass1MOV once to potentially merge any MOVs that were created }
  3993. Result := OptPass1MOV(p)
  3994. { OptPass2MOV will now exit but will be called again if OptPass1MOV
  3995. returned True and the instruction is still a MOV, thus checking
  3996. the optimisations below }
  3997. { If OptPass2JMP returned False, no optimisations were done to
  3998. the jump and there are no further optimisations that can be done
  3999. to the MOV instruction on this pass }
  4000. end
  4001. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  4002. (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
  4003. MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
  4004. MatchOpType(taicpu(hp1),top_const,top_reg) and
  4005. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
  4006. { be lazy, checking separately for sub would be slightly better }
  4007. (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
  4008. begin
  4009. { Change:
  4010. movl/q %reg1,%reg2 movl/q %reg1,%reg2
  4011. addl/q $x,%reg2 subl/q $x,%reg2
  4012. To:
  4013. leal/q x(%reg1),%reg2 leal/q -x(%reg1),%reg2
  4014. }
  4015. TransferUsedRegs(TmpUsedRegs);
  4016. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  4017. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  4018. if not GetNextInstruction(hp1, hp2) or
  4019. (
  4020. { The FLAGS register isn't always tracked properly, so do not
  4021. perform this optimisation if a conditional statement follows }
  4022. not RegReadByInstruction(NR_DEFAULTFLAGS, hp2) and
  4023. not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)
  4024. ) then
  4025. begin
  4026. reference_reset(NewRef, 1, []);
  4027. NewRef.base := taicpu(p).oper[0]^.reg;
  4028. NewRef.scalefactor := 1;
  4029. if taicpu(hp1).opcode = A_ADD then
  4030. begin
  4031. DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
  4032. NewRef.offset := taicpu(hp1).oper[0]^.val;
  4033. end
  4034. else
  4035. begin
  4036. DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
  4037. NewRef.offset := -taicpu(hp1).oper[0]^.val;
  4038. end;
  4039. taicpu(p).opcode := A_LEA;
  4040. taicpu(p).loadref(0, NewRef);
  4041. RemoveInstruction(hp1);
  4042. Result := True;
  4043. Exit;
  4044. end;
  4045. end
  4046. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  4047. {$ifdef x86_64}
  4048. MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
  4049. {$else x86_64}
  4050. MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
  4051. {$endif x86_64}
  4052. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  4053. (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
  4054. { mov reg1, reg2 mov reg1, reg2
  4055. movzx/sx reg2, reg3 to movzx/sx reg1, reg3}
  4056. begin
  4057. taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
  4058. DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);
  4059. { Don't remove the MOV command without first checking that reg2 isn't used afterwards,
  4060. or unless supreg(reg3) = supreg(reg2)). [Kit] }
  4061. TransferUsedRegs(TmpUsedRegs);
  4062. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  4063. if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
  4064. not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
  4065. then
  4066. begin
  4067. RemoveCurrentP(p, hp1);
  4068. Result:=true;
  4069. end;
  4070. exit;
  4071. end
  4072. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  4073. IsXCHGAcceptable and
  4074. { XCHG doesn't support 8-byte registers }
  4075. (taicpu(p).opsize <> S_B) and
  4076. MatchInstruction(hp1, A_MOV, []) and
  4077. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  4078. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
  4079. GetNextInstruction(hp1, hp2) and
  4080. MatchInstruction(hp2, A_MOV, []) and
  4081. { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
  4082. MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
  4083. MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
  4084. begin
  4085. { mov %reg1,%reg2
  4086. mov %reg3,%reg1 -> xchg %reg3,%reg1
  4087. mov %reg2,%reg3
  4088. (%reg2 not used afterwards)
  4089. Note that xchg takes 3 cycles to execute, and generally mov's take
  4090. only one cycle apiece, but the first two mov's can be executed in
  4091. parallel, only taking 2 cycles overall. Older processors should
  4092. therefore only optimise for size. [Kit]
  4093. }
  4094. TransferUsedRegs(TmpUsedRegs);
  4095. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  4096. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  4097. if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
  4098. begin
  4099. DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);
  4100. AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
  4101. taicpu(hp1).opcode := A_XCHG;
  4102. RemoveCurrentP(p, hp1);
  4103. RemoveInstruction(hp2);
  4104. Result := True;
  4105. Exit;
  4106. end;
  4107. end
  4108. else if MatchOpType(taicpu(p),top_reg,top_reg) and
  4109. MatchInstruction(hp1, A_SAR, []) then
  4110. begin
  4111. if MatchOperand(taicpu(hp1).oper[0]^, 31) then
  4112. begin
  4113. { the use of %edx also covers the opsize being S_L }
  4114. if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
  4115. begin
  4116. { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
  4117. if (taicpu(p).oper[0]^.reg = NR_EAX) and
  4118. (taicpu(p).oper[1]^.reg = NR_EDX) then
  4119. begin
  4120. { Change:
  4121. movl %eax,%edx
  4122. sarl $31,%edx
  4123. To:
  4124. cltd
  4125. }
  4126. DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
  4127. RemoveInstruction(hp1);
  4128. taicpu(p).opcode := A_CDQ;
  4129. taicpu(p).opsize := S_NO;
  4130. taicpu(p).clearop(1);
  4131. taicpu(p).clearop(0);
  4132. taicpu(p).ops:=0;
  4133. Result := True;
  4134. end
  4135. else if (cs_opt_size in current_settings.optimizerswitches) and
  4136. (taicpu(p).oper[0]^.reg = NR_EDX) and
  4137. (taicpu(p).oper[1]^.reg = NR_EAX) then
  4138. begin
  4139. { Change:
  4140. movl %edx,%eax
  4141. sarl $31,%edx
  4142. To:
  4143. movl %edx,%eax
  4144. cltd
  4145. Note that this creates a dependency between the two instructions,
  4146. so only perform if optimising for size.
  4147. }
  4148. DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
  4149. taicpu(hp1).opcode := A_CDQ;
  4150. taicpu(hp1).opsize := S_NO;
  4151. taicpu(hp1).clearop(1);
  4152. taicpu(hp1).clearop(0);
  4153. taicpu(hp1).ops:=0;
  4154. end;
  4155. {$ifndef x86_64}
  4156. end
  4157. { Don't bother if CMOV is supported, because a more optimal
  4158. sequence would have been generated for the Abs() intrinsic }
  4159. else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
  4160. { the use of %eax also covers the opsize being S_L }
  4161. MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
  4162. (taicpu(p).oper[0]^.reg = NR_EAX) and
  4163. (taicpu(p).oper[1]^.reg = NR_EDX) and
  4164. GetNextInstruction(hp1, hp2) and
  4165. MatchInstruction(hp2, A_XOR, [S_L]) and
  4166. MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
  4167. MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
  4168. GetNextInstruction(hp2, hp3) and
  4169. MatchInstruction(hp3, A_SUB, [S_L]) and
  4170. MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
  4171. MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
  4172. begin
  4173. { Change:
  4174. movl %eax,%edx
  4175. sarl $31,%eax
  4176. xorl %eax,%edx
  4177. subl %eax,%edx
  4178. (Instruction that uses %edx)
  4179. (%eax deallocated)
  4180. (%edx deallocated)
  4181. To:
  4182. cltd
  4183. xorl %edx,%eax <-- Note the registers have swapped
  4184. subl %edx,%eax
  4185. (Instruction that uses %eax) <-- %eax rather than %edx
  4186. }
  4187. TransferUsedRegs(TmpUsedRegs);
  4188. UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  4189. UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
  4190. UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
  4191. if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
  4192. begin
  4193. if GetNextInstruction(hp3, hp4) and
  4194. not RegModifiedByInstruction(NR_EDX, hp4) and
  4195. not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
  4196. begin
  4197. DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);
  4198. taicpu(p).opcode := A_CDQ;
  4199. taicpu(p).clearop(1);
  4200. taicpu(p).clearop(0);
  4201. taicpu(p).ops:=0;
  4202. RemoveInstruction(hp1);
  4203. taicpu(hp2).loadreg(0, NR_EDX);
  4204. taicpu(hp2).loadreg(1, NR_EAX);
  4205. taicpu(hp3).loadreg(0, NR_EDX);
  4206. taicpu(hp3).loadreg(1, NR_EAX);
  4207. AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);
  4208. { Convert references in the following instruction (hp4) from %edx to %eax }
  4209. for OperIdx := 0 to taicpu(hp4).ops - 1 do
  4210. with taicpu(hp4).oper[OperIdx]^ do
  4211. case typ of
  4212. top_reg:
  4213. if getsupreg(reg) = RS_EDX then
  4214. reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  4215. top_ref:
  4216. begin
  4217. if getsupreg(reg) = RS_EDX then
  4218. ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  4219. if getsupreg(reg) = RS_EDX then
  4220. ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
  4221. end;
  4222. else
  4223. ;
  4224. end;
  4225. end;
  4226. end;
  4227. {$else x86_64}
  4228. end;
  4229. end
  4230. else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
  4231. { the use of %rdx also covers the opsize being S_Q }
  4232. MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
  4233. begin
  4234. { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
  4235. if (taicpu(p).oper[0]^.reg = NR_RAX) and
  4236. (taicpu(p).oper[1]^.reg = NR_RDX) then
  4237. begin
  4238. { Change:
  4239. movq %rax,%rdx
  4240. sarq $63,%rdx
  4241. To:
  4242. cqto
  4243. }
  4244. DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
  4245. RemoveInstruction(hp1);
  4246. taicpu(p).opcode := A_CQO;
  4247. taicpu(p).opsize := S_NO;
  4248. taicpu(p).clearop(1);
  4249. taicpu(p).clearop(0);
  4250. taicpu(p).ops:=0;
  4251. Result := True;
  4252. end
  4253. else if (cs_opt_size in current_settings.optimizerswitches) and
  4254. (taicpu(p).oper[0]^.reg = NR_RDX) and
  4255. (taicpu(p).oper[1]^.reg = NR_RAX) then
  4256. begin
  4257. { Change:
  4258. movq %rdx,%rax
  4259. sarq $63,%rdx
  4260. To:
  4261. movq %rdx,%rax
  4262. cqto
  4263. Note that this creates a dependency between the two instructions,
  4264. so only perform if optimising for size.
  4265. }
  4266. DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
  4267. taicpu(hp1).opcode := A_CQO;
  4268. taicpu(hp1).opsize := S_NO;
  4269. taicpu(hp1).clearop(1);
  4270. taicpu(hp1).clearop(0);
  4271. taicpu(hp1).ops:=0;
  4272. {$endif x86_64}
  4273. end;
  4274. end;
  4275. end
  4276. else if MatchInstruction(hp1, A_MOV, []) and
  4277. (taicpu(hp1).oper[1]^.typ = top_reg) then
  4278. { Though "GetNextInstruction" could be factored out, along with
  4279. the instructions that depend on hp2, it is an expensive call that
  4280. should be delayed for as long as possible, hence we do cheaper
  4281. checks first that are likely to be False. [Kit] }
  4282. begin
  4283. if MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
  4284. (
  4285. (
  4286. (taicpu(hp1).oper[1]^.reg = NR_EAX) and
  4287. (
  4288. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  4289. MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
  4290. )
  4291. ) or
  4292. (
  4293. (taicpu(hp1).oper[1]^.reg = NR_EDX) and
  4294. (
  4295. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  4296. MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
  4297. )
  4298. )
  4299. ) and
  4300. GetNextInstruction(hp1, hp2) and
  4301. MatchInstruction(hp2, A_SAR, []) and
  4302. MatchOperand(taicpu(hp2).oper[0]^, 31) then
  4303. begin
  4304. if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
  4305. begin
  4306. { Change:
  4307. movl r/m,%edx movl r/m,%eax movl r/m,%edx movl r/m,%eax
  4308. movl %edx,%eax or movl %eax,%edx or movl r/m,%eax or movl r/m,%edx
  4309. sarl $31,%edx sarl $31,%edx sarl $31,%edx sarl $31,%edx
  4310. To:
  4311. movl r/m,%eax <- Note the change in register
  4312. cltd
  4313. }
  4314. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
  4315. AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
  4316. taicpu(p).loadreg(1, NR_EAX);
  4317. taicpu(hp1).opcode := A_CDQ;
  4318. taicpu(hp1).clearop(1);
  4319. taicpu(hp1).clearop(0);
  4320. taicpu(hp1).ops:=0;
  4321. RemoveInstruction(hp2);
  4322. (*
  4323. {$ifdef x86_64}
  4324. end
  4325. else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
  4326. { This code sequence does not get generated - however it might become useful
  4327. if and when 128-bit signed integer types make an appearance, so the code
  4328. is kept here for when it is eventually needed. [Kit] }
  4329. (
  4330. (
  4331. (taicpu(hp1).oper[1]^.reg = NR_RAX) and
  4332. (
  4333. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  4334. MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
  4335. )
  4336. ) or
  4337. (
  4338. (taicpu(hp1).oper[1]^.reg = NR_RDX) and
  4339. (
  4340. MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
  4341. MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
  4342. )
  4343. )
  4344. ) and
  4345. GetNextInstruction(hp1, hp2) and
  4346. MatchInstruction(hp2, A_SAR, [S_Q]) and
  4347. MatchOperand(taicpu(hp2).oper[0]^, 63) and
  4348. MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
  4349. begin
  4350. { Change:
  4351. movq r/m,%rdx movq r/m,%rax movq r/m,%rdx movq r/m,%rax
  4352. movq %rdx,%rax or movq %rax,%rdx or movq r/m,%rax or movq r/m,%rdx
  4353. sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx sarq $63,%rdx
  4354. To:
  4355. movq r/m,%rax <- Note the change in register
  4356. cqto
  4357. }
  4358. DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
  4359. AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
  4360. taicpu(p).loadreg(1, NR_RAX);
  4361. taicpu(hp1).opcode := A_CQO;
  4362. taicpu(hp1).clearop(1);
  4363. taicpu(hp1).clearop(0);
  4364. taicpu(hp1).ops:=0;
  4365. RemoveInstruction(hp2);
  4366. {$endif x86_64}
  4367. *)
  4368. end;
  4369. end;
  4370. {$ifdef x86_64}
  4371. end
  4372. else if (taicpu(p).opsize = S_L) and
  4373. (taicpu(p).oper[1]^.typ = top_reg) and
  4374. (
  4375. MatchInstruction(hp1, A_MOV,[]) and
  4376. (taicpu(hp1).opsize = S_L) and
  4377. (taicpu(hp1).oper[1]^.typ = top_reg)
  4378. ) and (
  4379. GetNextInstruction(hp1, hp2) and
  4380. (tai(hp2).typ=ait_instruction) and
  4381. (taicpu(hp2).opsize = S_Q) and
  4382. (
  4383. (
  4384. MatchInstruction(hp2, A_ADD,[]) and
  4385. (taicpu(hp2).opsize = S_Q) and
  4386. (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  4387. (
  4388. (
  4389. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
  4390. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  4391. ) or (
  4392. (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  4393. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  4394. )
  4395. )
  4396. ) or (
  4397. MatchInstruction(hp2, A_LEA,[]) and
  4398. (taicpu(hp2).oper[0]^.ref^.offset = 0) and
  4399. (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
  4400. (
  4401. (
  4402. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
  4403. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
  4404. ) or (
  4405. (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
  4406. (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
  4407. )
  4408. ) and (
  4409. (
  4410. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
  4411. ) or (
  4412. (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
  4413. )
  4414. )
  4415. )
  4416. )
  4417. ) and (
  4418. GetNextInstruction(hp2, hp3) and
  4419. MatchInstruction(hp3, A_SHR,[]) and
  4420. (taicpu(hp3).opsize = S_Q) and
  4421. (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
  4422. (taicpu(hp3).oper[0]^.val = 1) and
  4423. (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
  4424. ) then
  4425. begin
  4426. { Change movl x, reg1d movl x, reg1d
  4427. movl y, reg2d movl y, reg2d
  4428. addq reg2q,reg1q or leaq (reg1q,reg2q),reg1q
  4429. shrq $1, reg1q shrq $1, reg1q
  4430. ( reg1d and reg2d can be switched around in the first two instructions )
  4431. To movl x, reg1d
  4432. addl y, reg1d
  4433. rcrl $1, reg1d
  4434. This corresponds to the common expression (x + y) shr 1, where
  4435. x and y are Cardinals (replacing "shr 1" with "div 2" produces
  4436. smaller code, but won't account for x + y causing an overflow). [Kit]
  4437. }
  4438. if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
  4439. { Change first MOV command to have the same register as the final output }
  4440. taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
  4441. else
  4442. taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
  4443. { Change second MOV command to an ADD command. This is easier than
  4444. converting the existing command because it means we don't have to
  4445. touch 'y', which might be a complicated reference, and also the
  4446. fact that the third command might either be ADD or LEA. [Kit] }
  4447. taicpu(hp1).opcode := A_ADD;
  4448. { Delete old ADD/LEA instruction }
  4449. RemoveInstruction(hp2);
  4450. { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
  4451. taicpu(hp3).opcode := A_RCR;
  4452. taicpu(hp3).changeopsize(S_L);
  4453. setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
  4454. {$endif x86_64}
  4455. end;
  4456. end;
{ Folds a register-to-register MOV into a following IMUL by using IMUL's
  three-operand form:
      mov  %reg1,%reg2
      imul y,%reg2       ->   imul y,%reg1,%reg2
  The fold is only done when %reg2 is either dead after the IMUL or is
  already both source and explicit destination (3-operand form with
  oper[1] = oper[2]).  Returns True if the MOV was removed. }
function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    { IMUL must have a constant (or full-address reference) multiplier and a
      register destination; in the 3-operand case the middle and last
      operands must already be the same register for the rewrite to keep the
      same result register }
    if (taicpu(p).ops >= 2) and
       ((taicpu(p).oper[0]^.typ = top_const) or
        ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full))) and
       (taicpu(p).oper[1]^.typ = top_reg) and
       ((taicpu(p).ops = 2) or
        ((taicpu(p).oper[2]^.typ = top_reg) and
         (taicpu(p).oper[2]^.reg = taicpu(p).oper[1]^.reg))) and
       { the immediately preceding instruction must be "mov %reg1,%reg2"
         writing exactly the register IMUL operates on }
       GetLastInstruction(p,hp1) and
       MatchInstruction(hp1,A_MOV,[]) and
       MatchOpType(taicpu(hp1),top_reg,top_reg) and
       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
          ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
          { change
              mov reg1,reg2
              imul y,reg2 to imul y,reg1,reg2 }
          begin
            taicpu(p).ops := 3;
            { NOTE: the old oper[1] must be copied into oper[2] BEFORE
              oper[1] is overwritten with the MOV's source register }
            taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
            taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
            DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;
  4490. procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
  4491. var
  4492. ThisLabel: TAsmLabel;
  4493. begin
  4494. ThisLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
  4495. ThisLabel.decrefs;
  4496. taicpu(p).opcode := A_RET;
  4497. taicpu(p).is_jmp := false;
  4498. taicpu(p).ops := taicpu(ret_p).ops;
  4499. case taicpu(ret_p).ops of
  4500. 0:
  4501. taicpu(p).clearop(0);
  4502. 1:
  4503. taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val);
  4504. else
  4505. internalerror(2016041301);
  4506. end;
  4507. { If the original label is now dead, it might turn out that the label
  4508. immediately follows p. As a result, everything beyond it, which will
  4509. be just some final register configuration and a RET instruction, is
  4510. now dead code. [Kit] }
  4511. { NOTE: This is much faster than introducing a OptPass2RET routine and
  4512. running RemoveDeadCodeAfterJump for each RET instruction, because
  4513. this optimisation rarely happens and most RETs appear at the end of
  4514. routines where there is nothing that can be stripped. [Kit] }
  4515. if not ThisLabel.is_used then
  4516. RemoveDeadCodeAfterJump(p);
  4517. end;
{ Optimises an unconditional JMP to a plain label whose target is a RET
  (or a MOV immediately followed by a RET) by replacing the jump with the
  target code itself:
      jmp .L1 ... .L1: ret           ->  ret
      jmp .L1 ... .L1: mov ##,## ret ->  mov ##,## / ret
  Returns True if the jump was converted. }
function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
  var
    hp1, hp2, hp3: tai;
    OperIdx: Integer;
  begin
    result:=false;
    { only a direct jump to a plain label (no base/index register) can be
      followed to its target }
    if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
       (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
      begin
        hp1:=getlabelwithsym(tasmlabel(taicpu(p).oper[0]^.ref^.symbol));
        if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
          begin
            case taicpu(hp1).opcode of
              A_RET:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         ret
                  into
                         ret
                }
                begin
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end;
              A_MOV:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         mov ##, ##
                         ret
                  into
                         mov ##, ##
                         ret
                }
                { This optimisation tends to increase code size if the pass 1 MOV optimisations aren't
                  re-run, so only do this particular optimisation if optimising for speed or when
                  optimisations are very in-depth. [Kit] }
                if (current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size] then
                  begin
                    GetNextInstruction(hp1, hp2);
                    if not Assigned(hp2) then
                      Exit;
                    if (hp2.typ in [ait_label, ait_align]) then
                      SkipLabels(hp2,hp2);
                    { only proceed if the MOV is directly followed by a RET }
                    if Assigned(hp2) and MatchInstruction(hp2, A_RET, [S_NO]) then
                      begin
                        { Duplicate the MOV instruction }
                        hp3:=tai(hp1.getcopy);
                        asml.InsertBefore(hp3, p);
                        { Make sure the compiler knows about any final registers written here }
                        for OperIdx := 0 to 1 do
                          with taicpu(hp3).oper[OperIdx]^ do
                            begin
                              case typ of
                                top_ref:
                                  begin
                                    { RIP is not a general-purpose register and must not be allocated }
                                    if (ref^.base <> NR_NO) {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);
                                    if (ref^.index <> NR_NO) {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} then
                                      AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                                  end;
                                top_reg:
                                  AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                                else
                                  ;
                              end;
                            end;
                        { Now change the jump into a RET instruction }
                        ConvertJumpToRET(p, hp2);
                        result:=true;
                      end;
                  end;
              else
                ;
            end;
          end;
      end;
  end;
  4601. class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
  4602. begin
  4603. CanBeCMOV:=assigned(p) and
  4604. MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
  4605. { we can't use cmov ref,reg because
  4606. ref could be nil and cmov still throws an exception
  4607. if ref=nil but the mov isn't done (FK)
  4608. or ((taicpu(p).oper[0]^.typ = top_ref) and
  4609. (taicpu(p).oper[0]^.ref^.refaddr = addr_no))
  4610. }
  4611. (taicpu(p).oper[1]^.typ = top_reg) and
  4612. (
  4613. (taicpu(p).oper[0]^.typ = top_reg) or
  4614. { allow references, but only pure symbols or got rel. addressing with RIP as based,
  4615. it is not expected that this can cause a seg. violation }
  4616. (
  4617. (taicpu(p).oper[0]^.typ = top_ref) and
  4618. IsRefSafe(taicpu(p).oper[0]^.ref)
  4619. )
  4620. );
  4621. end;
{ Pass-2 optimisations for a conditional jump at p.  Three independent
  transformations are attempted:
    1. A carry-conditional jump around a single INC/DEC/ADD 1/SUB 1 is
       folded into a branchless ADC/SBB with 0 (prefixed with CMC when the
       jump is taken on carry set), or - when no carry trick applies and
       size is not being optimised for - into SETcc plus ADD/SUB of the
       flag value in a scratch volatile register.
    2. "jcc L1; jmp L2; ... L1: ret" becomes "j(!cc) L2; ret".
    3. On CPUs with CMOV, a conditional jump over one or more convertible
       MOVs (optionally with a second MOV block and an unconditional jump,
       i.e. a full if/else diamond) is rewritten into CMOVcc instructions
       and the now-dead jumps/labels are removed.
  Returns True when p was replaced/removed and the caller must re-examine
  the instruction stream from the new p. }
function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  var
    hp1,hp2,hp3,hp4,hpmov2: tai;
    carryadd_opcode : TAsmOp;
    l : Longint;
    condition : TAsmCond;
    symbol: TAsmSymbol;
    reg: tsuperregister;
    regavailable: Boolean;
  begin
    result:=false;
    symbol:=nil;
    if GetNextInstruction(p,hp1) then
      begin
        symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        { pattern: exactly one instruction between the jump and its own
          target label (an align before the label is tolerated) }
        if (hp1.typ=ait_instruction) and
           GetNextInstruction(hp1,hp2) and
           ((hp2.typ=ait_label) or
            { trick to skip align }
            ((hp2.typ=ait_align) and GetNextInstruction(hp2,hp2) and (hp2.typ=ait_label))
           ) and
           (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
          { jb @@1                    cmc
            inc/dec operand    -->    adc/sbb operand,0
            @@1:
            ... and ...
            jnb @@1
            inc/dec operand    -->    adc/sbb operand,0
            @@1: }
          begin
            carryadd_opcode:=A_NONE;
            if Taicpu(p).condition in [C_NAE,C_B,C_C] then
              begin
                { jump taken when carry is SET, so the carry must be
                  inverted (CMC) before ADC/SBB can consume it }
                if (Taicpu(hp1).opcode=A_INC) or
                   ((Taicpu(hp1).opcode=A_ADD) and
                    MatchOptype(Taicpu(hp1),top_const,top_reg) and
                    (Taicpu(hp1).oper[0]^.val=1)
                   ) then
                  carryadd_opcode:=A_ADC;
                if (Taicpu(hp1).opcode=A_DEC) or
                   ((Taicpu(hp1).opcode=A_SUB) and
                    MatchOptype(Taicpu(hp1),top_const,top_reg) and
                    (Taicpu(hp1).oper[0]^.val=1)
                   ) then
                  carryadd_opcode:=A_SBB;
                if carryadd_opcode<>A_NONE then
                  begin
                    { turn the jump itself into CMC }
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=0;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_CMC;
                    Taicpu(p).condition:=C_NONE;
                    DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                    { rewrite hp1 as "adc/sbb $0,operand"; for INC/DEC the
                      single operand moves to position 1 first }
                    Taicpu(hp1).ops:=2;
                    if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                    else
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                    Taicpu(hp1).loadconst(0,0);
                    Taicpu(hp1).opcode:=carryadd_opcode;
                    result:=true;
                    exit;
                  end;
              end
            else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
              begin
                { jump taken when carry is CLEAR: the carry already has the
                  right polarity, so the jump can simply be removed }
                if (Taicpu(hp1).opcode=A_INC) or
                   ((Taicpu(hp1).opcode=A_ADD) and
                    MatchOptype(Taicpu(hp1),top_const,top_reg) and
                    (Taicpu(hp1).oper[0]^.val=1)
                   ) then
                  carryadd_opcode:=A_ADC;
                if (Taicpu(hp1).opcode=A_DEC) or
                   ((Taicpu(hp1).opcode=A_SUB) and
                    MatchOptype(Taicpu(hp1),top_const,top_reg) and
                    (Taicpu(hp1).oper[0]^.val=1)
                   ) then
                  carryadd_opcode:=A_SBB;
                if carryadd_opcode<>A_NONE then
                  begin
                    Taicpu(hp1).ops:=2;
                    DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                    if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                    else
                      Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                    Taicpu(hp1).loadconst(0,0);
                    Taicpu(hp1).opcode:=carryadd_opcode;
                    RemoveCurrentP(p, hp1);
                    result:=true;
                    exit;
                  end;
              end
            {
              jcc @@1                            setcc tmpreg
              inc/dec/add/sub operand    ->      (movzx tmpreg)
              @@1:                               add/sub tmpreg,operand

              While this increases code size slightly, it makes the code much faster if the
              jump is unpredictable
            }
            else if not(cs_opt_size in current_settings.optimizerswitches) and
               ((((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
                 (Taicpu(hp1).oper[0]^.typ=top_const) and
                 (Taicpu(hp1).oper[1]^.typ=top_reg) and
                 (Taicpu(hp1).oper[0]^.val=1)) or
                ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
               ) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                { search for an available register which is volatile }
                regavailable:=false;
                for reg in tcpuregisterset do
                  begin
                    if (reg in paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption)) and
                       not(reg in TmpUsedRegs[R_INTREGISTER].GetUsedRegs) and
                       not(RegInInstruction(newreg(R_INTREGISTER,reg,R_SUBL),hp1))
{$ifdef i386}
                       { SETcc needs a byte-addressable register on i386 }
                       and (reg in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
{$endif i386}
                    then
                      begin
                        regavailable:=true;
                        break;
                      end;
                  end;
                if regavailable then
                  begin
                    { turn the jump into "setcc tmpreg" with the inverted
                      condition (the flag is 1 exactly when the jump would
                      NOT have been taken, i.e. when the add/sub happens) }
                    Taicpu(p).clearop(0);
                    Taicpu(p).ops:=1;
                    Taicpu(p).is_jmp:=false;
                    Taicpu(p).opcode:=A_SETcc;
                    DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                    Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                    Taicpu(p).loadreg(0,newreg(R_INTREGISTER,reg,R_SUBL));
                    { zero-extend the byte flag when the target operand is
                      wider than a byte }
                    if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                      begin
                        case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                          R_SUBW:
                            hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,newreg(R_INTREGISTER,reg,R_SUBL),
                              newreg(R_INTREGISTER,reg,R_SUBW));
                          R_SUBD,
                          R_SUBQ:
                            { writing the 32-bit subregister implicitly
                              zeroes the upper half, so S_BL suffices for
                              the 64-bit case too }
                            hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,newreg(R_INTREGISTER,reg,R_SUBL),
                              newreg(R_INTREGISTER,reg,R_SUBD));
                          else
                            Internalerror(2020030601);
                        end;
                        taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                        asml.InsertAfter(hp2,p);
                      end;
                    { INC/DEC become two-operand ADD/SUB of tmpreg }
                    if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                      begin
                        Taicpu(hp1).ops:=2;
                        Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                      end;
                    Taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)));
                    AllocRegBetween(newreg(R_INTREGISTER,reg,getsubreg(Taicpu(hp1).oper[1]^.reg)),p,hp1,UsedRegs);
                  end;
              end;
          end;
        { Detect the following:
              jmp<cond>     @Lbl1
              jmp           @Lbl2
              ...
          @Lbl1:
              ret

          Change to:
              jmp<inv_cond> @Lbl2
              ret
        }
        if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2:=getlabelwithsym(TAsmLabel(symbol));
            if Assigned(hp2) and SkipLabels(hp2,hp2) and
               MatchInstruction(hp2,A_RET,[S_NO]) then
              begin
                taicpu(p).condition := inverse_cond(taicpu(p).condition);
                { Change label address to that of the unconditional jump }
                taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
                TAsmLabel(symbol).DecRefs;
                { turn the unconditional jump into a copy of the RET }
                taicpu(hp1).opcode := A_RET;
                taicpu(hp1).is_jmp := false;
                taicpu(hp1).ops := taicpu(hp2).ops;
                DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                case taicpu(hp2).ops of
                  0:
                    taicpu(hp1).clearop(0);
                  1:
                    taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                  else
                    internalerror(2016041302);
                end;
              end;
          end;
      end;
{$ifndef i8086}
    if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
      begin
        { check for
               jCC   xxx
               <several movs>
            xxx:
        }
        l:=0;
        GetNextInstruction(p, hp1);
        { count the convertible MOVs following the jump }
        while assigned(hp1) and
          CanBeCMOV(hp1) and
          { stop on labels }
          not(hp1.typ=ait_label) do
          begin
            inc(l);
            GetNextInstruction(hp1,hp1);
          end;
        if assigned(hp1) then
          begin
            if FindLabel(tasmlabel(symbol),hp1) then
              begin
                { the MOV run ends exactly at the jump's target: simple
                  one-sided CMOV conversion (limit of 4 keeps the cost of
                  always-executed CMOVs bounded) }
                if (l<=4) and (l>0) then
                  begin
                    condition:=inverse_cond(taicpu(p).condition);
                    GetNextInstruction(p,hp1);
                    repeat
                      if not Assigned(hp1) then
                        InternalError(2018062900);
                      taicpu(hp1).opcode:=A_CMOVcc;
                      taicpu(hp1).condition:=condition;
                      UpdateUsedRegs(hp1);
                      GetNextInstruction(hp1,hp1);
                    until not(CanBeCMOV(hp1));
                    { Remember what hp1 is in case there's multiple aligns to get rid of }
                    hp2 := hp1;
                    { advance hp2 to the target label, skipping aligns and
                      other non-instruction entries }
                    repeat
                      if not Assigned(hp2) then
                        InternalError(2018062910);
                      case hp2.typ of
                        ait_label:
                          { What we expected - break out of the loop (it won't be a dead label at the top of
                            a cluster because that was optimised at an earlier stage) }
                          Break;
                        ait_align:
                          { Go to the next entry until a label is found (may be multiple aligns before it) }
                          begin
                            hp2 := tai(hp2.Next);
                            Continue;
                          end;
                        else
                          begin
                            { Might be a comment or temporary allocation entry }
                            if not (hp2.typ in SkipInstr) then
                              InternalError(2018062911);
                            hp2 := tai(hp2.Next);
                            Continue;
                          end;
                      end;
                    until False;
                    { Now we can safely decrement the reference count }
                    tasmlabel(symbol).decrefs;
                    DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
                    { Remove the original jump }
                    RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
                    GetNextInstruction(hp2, p); { Instruction after the label }
                    { Remove the label if this is its final reference }
                    if (tasmlabel(symbol).getrefs=0) then
                      StripLabelFast(hp1);
                    if Assigned(p) then
                      begin
                        UpdateUsedRegs(p);
                        result:=true;
                      end;
                    exit;
                  end;
              end
            else
              begin
                { check further for
                       jCC   xxx
                       <several movs 1>
                       jmp   yyy
                  xxx:
                       <several movs 2>
                  yyy:
                }
                { hp2 points to jmp yyy }
                hp2:=hp1;
                { skip hp1 to xxx (or an align right before it) }
                GetNextInstruction(hp1, hp1);
                if assigned(hp2) and
                  assigned(hp1) and
                  (l<=3) and
                  (hp2.typ=ait_instruction) and
                  (taicpu(hp2).is_jmp) and
                  (taicpu(hp2).condition=C_None) and
                  { real label and jump, no further references to the
                    label are allowed }
                  (tasmlabel(symbol).getrefs=1) and
                  FindLabel(tasmlabel(symbol),hp1) then
                  begin
                    l:=0;
                    { skip hp1 to <several moves 2> }
                    if (hp1.typ = ait_align) then
                      GetNextInstruction(hp1, hp1);
                    GetNextInstruction(hp1, hpmov2);
                    hp1 := hpmov2;
                    { count the convertible MOVs in the else-branch }
                    while assigned(hp1) and
                      CanBeCMOV(hp1) do
                      begin
                        inc(l);
                        GetNextInstruction(hp1, hp1);
                      end;
                    { hp1 points to yyy (or an align right before it) }
                    hp3 := hp1;
                    if assigned(hp1) and
                      FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
                      begin
                        { first MOV block gets the inverted condition ... }
                        condition:=inverse_cond(taicpu(p).condition);
                        GetNextInstruction(p,hp1);
                        repeat
                          taicpu(hp1).opcode:=A_CMOVcc;
                          taicpu(hp1).condition:=condition;
                          UpdateUsedRegs(hp1);
                          GetNextInstruction(hp1,hp1);
                        until not(assigned(hp1)) or
                          not(CanBeCMOV(hp1));
                        { ... and the second block the original condition }
                        condition:=inverse_cond(condition);
                        hp1 := hpmov2;
                        { hp1 is now at <several movs 2> }
                        while Assigned(hp1) and CanBeCMOV(hp1) do
                          begin
                            taicpu(hp1).opcode:=A_CMOVcc;
                            taicpu(hp1).condition:=condition;
                            UpdateUsedRegs(hp1);
                            GetNextInstruction(hp1,hp1);
                          end;
                        hp1 := p;
                        { Get first instruction after label }
                        GetNextInstruction(hp3, p);
                        if assigned(p) and (hp3.typ = ait_align) then
                          GetNextInstruction(p, p);
                        { Don't dereference yet, as doing so will cause
                          GetNextInstruction to skip the label and
                          optional align marker. [Kit] }
                        GetNextInstruction(hp2, hp4);
                        DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
                        { remove jCC }
                        RemoveInstruction(hp1);
                        { Now we can safely decrement it }
                        tasmlabel(symbol).decrefs;
                        { Remove label xxx (it will have a ref of zero due to the initial check }
                        StripLabelFast(hp4);
                        { remove jmp }
                        symbol := taicpu(hp2).oper[0]^.ref^.symbol;
                        RemoveInstruction(hp2);
                        { As before, now we can safely decrement it }
                        tasmlabel(symbol).decrefs;
                        { Remove label yyy (and the optional alignment) if its reference falls to zero }
                        if tasmlabel(symbol).getrefs = 0 then
                          StripLabelFast(hp3);
                        if Assigned(p) then
                          begin
                            UpdateUsedRegs(p);
                            result:=true;
                          end;
                        exit;
                      end;
                  end;
              end;
          end;
      end;
{$endif i8086}
  end;
  4993. function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
  4994. var
  4995. hp1,hp2: tai;
  4996. reg_and_hp1_is_instr: Boolean;
  4997. begin
  4998. result:=false;
  4999. reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
  5000. GetNextInstruction(p,hp1) and
  5001. (hp1.typ = ait_instruction);
  5002. if reg_and_hp1_is_instr and
  5003. (
  5004. (taicpu(hp1).opcode <> A_LEA) or
  5005. { If the LEA instruction can be converted into an arithmetic instruction,
  5006. it may be possible to then fold it. }
  5007. (
  5008. { If the flags register is in use, don't change the instruction
  5009. to an ADD otherwise this will scramble the flags. [Kit] }
  5010. not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
  5011. ConvertLEA(taicpu(hp1))
  5012. )
  5013. ) and
  5014. IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
  5015. GetNextInstruction(hp1,hp2) and
  5016. MatchInstruction(hp2,A_MOV,[]) and
  5017. (taicpu(hp2).oper[0]^.typ = top_reg) and
  5018. OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
  5019. ((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B) or
  5020. (taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W)) and
  5021. {$ifdef i386}
  5022. { not all registers have byte size sub registers on i386 }
  5023. ((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
  5024. {$endif i386}
  5025. (((taicpu(hp1).ops=2) and
  5026. (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
  5027. ((taicpu(hp1).ops=1) and
  5028. (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
  5029. not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
  5030. begin
  5031. { change movsX/movzX reg/ref, reg2
  5032. add/sub/or/... reg3/$const, reg2
  5033. mov reg2 reg/ref
  5034. to add/sub/or/... reg3/$const, reg/ref }
  5035. { by example:
  5036. movswl %si,%eax movswl %si,%eax p
  5037. decl %eax addl %edx,%eax hp1
  5038. movw %ax,%si movw %ax,%si hp2
  5039. ->
  5040. movswl %si,%eax movswl %si,%eax p
  5041. decw %eax addw %edx,%eax hp1
  5042. movw %ax,%si movw %ax,%si hp2
  5043. }
  5044. taicpu(hp1).changeopsize(taicpu(hp2).opsize);
  5045. {
  5046. ->
  5047. movswl %si,%eax movswl %si,%eax p
  5048. decw %si addw %dx,%si hp1
  5049. movw %ax,%si movw %ax,%si hp2
  5050. }
  5051. case taicpu(hp1).ops of
  5052. 1:
  5053. taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
  5054. 2:
  5055. begin
  5056. taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
  5057. if (taicpu(hp1).oper[0]^.typ = top_reg) then
  5058. setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
  5059. end;
  5060. else
  5061. internalerror(2008042701);
  5062. end;
  5063. {
  5064. ->
  5065. decw %si addw %dx,%si p
  5066. }
  5067. DebugMsg(SPeepholeOptimization + 'var3',p);
  5068. RemoveCurrentP(p, hp1);
  5069. RemoveInstruction(hp2);
  5070. end
  5071. else if reg_and_hp1_is_instr and
  5072. (taicpu(hp1).opcode = A_MOV) and
  5073. MatchOpType(taicpu(hp1),top_reg,top_reg) and
  5074. (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
  5075. {$ifdef x86_64}
  5076. { check for implicit extension to 64 bit }
  5077. or
  5078. ((taicpu(p).opsize in [S_BL,S_WL]) and
  5079. (taicpu(hp1).opsize=S_Q) and
  5080. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
  5081. )
  5082. {$endif x86_64}
  5083. )
  5084. then
  5085. begin
  5086. { change
  5087. movx %reg1,%reg2
  5088. mov %reg2,%reg3
  5089. dealloc %reg2
  5090. into
  5091. movx %reg,%reg3
  5092. }
  5093. TransferUsedRegs(TmpUsedRegs);
  5094. UpdateUsedRegs(TmpUsedRegs, tai(p.next));
  5095. if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
  5096. begin
  5097. DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
  5098. {$ifdef x86_64}
  5099. if (taicpu(p).opsize in [S_BL,S_WL]) and
  5100. (taicpu(hp1).opsize=S_Q) then
  5101. taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
  5102. else
  5103. {$endif x86_64}
  5104. taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
  5105. RemoveInstruction(hp1);
  5106. end;
  5107. end
  5108. else if reg_and_hp1_is_instr and
  5109. (taicpu(p).oper[0]^.typ = top_reg) and
  5110. (
  5111. (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
  5112. ) and
  5113. (taicpu(hp1).oper[0]^.typ = top_const) and
  5114. SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
  5115. MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
  5116. { Minimum shift value allowed is the bit difference between the sizes }
  5117. (taicpu(hp1).oper[0]^.val >=
  5118. { Multiply by 8 because tcgsize2size returns bytes, not bits }
  5119. 8 * (
  5120. tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
  5121. tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
  5122. )
  5123. ) then
  5124. begin
  5125. { For:
  5126. movsx/movzx %reg1,%reg1 (same register, just different sizes)
  5127. shl/sal ##, %reg1
  5128. Remove the movsx/movzx instruction if the shift overwrites the
  5129. extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax
  5130. }
  5131. DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
  5132. RemoveCurrentP(p, hp1);
  5133. Result := True;
  5134. Exit;
  5135. end
  5136. else if taicpu(p).opcode=A_MOVZX then
  5137. begin
  5138. { removes superfluous And's after movzx's }
  5139. if reg_and_hp1_is_instr and
  5140. (taicpu(hp1).opcode = A_AND) and
  5141. MatchOpType(taicpu(hp1),top_const,top_reg) and
  5142. ((taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)
  5143. {$ifdef x86_64}
  5144. { check for implicit extension to 64 bit }
  5145. or
  5146. ((taicpu(p).opsize in [S_BL,S_WL]) and
  5147. (taicpu(hp1).opsize=S_Q) and
  5148. SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg)
  5149. )
  5150. {$endif x86_64}
  5151. )
  5152. then
  5153. begin
  5154. case taicpu(p).opsize Of
  5155. S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
  5156. if (taicpu(hp1).oper[0]^.val = $ff) then
  5157. begin
  5158. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz1',p);
  5159. RemoveInstruction(hp1);
  5160. Result:=true;
  5161. exit;
  5162. end;
  5163. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  5164. if (taicpu(hp1).oper[0]^.val = $ffff) then
  5165. begin
  5166. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz2',p);
  5167. RemoveInstruction(hp1);
  5168. Result:=true;
  5169. exit;
  5170. end;
  5171. {$ifdef x86_64}
  5172. S_LQ:
  5173. if (taicpu(hp1).oper[0]^.val = $ffffffff) then
  5174. begin
  5175. DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz3',p);
  5176. RemoveInstruction(hp1);
  5177. Result:=true;
  5178. exit;
  5179. end;
  5180. {$endif x86_64}
  5181. else
  5182. ;
  5183. end;
  5184. { we cannot get rid of the and, but can we get rid of the movz ?}
  5185. if SuperRegistersEqual(taicpu(p).oper[0]^.reg,taicpu(p).oper[1]^.reg) then
  5186. begin
  5187. case taicpu(p).opsize Of
  5188. S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
  5189. if (taicpu(hp1).oper[0]^.val and $ff)=taicpu(hp1).oper[0]^.val then
  5190. begin
  5191. DebugMsg(SPeepholeOptimization + 'MovzAnd2And1',p);
  5192. RemoveCurrentP(p,hp1);
  5193. Result:=true;
  5194. exit;
  5195. end;
  5196. S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
  5197. if (taicpu(hp1).oper[0]^.val and $ffff)=taicpu(hp1).oper[0]^.val then
  5198. begin
  5199. DebugMsg(SPeepholeOptimization + 'MovzAnd2And2',p);
  5200. RemoveCurrentP(p,hp1);
  5201. Result:=true;
  5202. exit;
  5203. end;
  5204. {$ifdef x86_64}
  5205. S_LQ:
  5206. if (taicpu(hp1).oper[0]^.val and $ffffffff)=taicpu(hp1).oper[0]^.val then
  5207. begin
  5208. DebugMsg(SPeepholeOptimization + 'MovzAnd2And3',p);
  5209. RemoveCurrentP(p,hp1);
  5210. Result:=true;
  5211. exit;
  5212. end;
  5213. {$endif x86_64}
  5214. else
  5215. ;
  5216. end;
  5217. end;
  5218. end;
  5219. { changes some movzx constructs to faster synonyms (all examples
  5220. are given with eax/ax, but are also valid for other registers)}
  5221. if MatchOpType(taicpu(p),top_reg,top_reg) then
  5222. begin
  5223. case taicpu(p).opsize of
  5224. { Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
  5225. (the machine code is equivalent to movzbl %al,%eax), but the
  5226. code generator still generates that assembler instruction and
  5227. it is silently converted. This should probably be checked.
  5228. [Kit] }
  5229. S_BW:
  5230. begin
  5231. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
  5232. (
  5233. not IsMOVZXAcceptable
  5234. { and $0xff,%ax has a smaller encoding but risks a partial write penalty }
  5235. or (
  5236. (cs_opt_size in current_settings.optimizerswitches) and
  5237. (taicpu(p).oper[1]^.reg = NR_AX)
  5238. )
  5239. ) then
  5240. {Change "movzbw %al, %ax" to "andw $0x0ffh, %ax"}
  5241. begin
  5242. DebugMsg(SPeepholeOptimization + 'var7',p);
  5243. taicpu(p).opcode := A_AND;
  5244. taicpu(p).changeopsize(S_W);
  5245. taicpu(p).loadConst(0,$ff);
  5246. Result := True;
  5247. end
  5248. else if not IsMOVZXAcceptable and
  5249. GetNextInstruction(p, hp1) and
  5250. (tai(hp1).typ = ait_instruction) and
  5251. (taicpu(hp1).opcode = A_AND) and
  5252. MatchOpType(taicpu(hp1),top_const,top_reg) and
  5253. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  5254. { Change "movzbw %reg1, %reg2; andw $const, %reg2"
  5255. to "movw %reg1, reg2; andw $(const1 and $ff), %reg2"}
  5256. begin
  5257. DebugMsg(SPeepholeOptimization + 'var8',p);
  5258. taicpu(p).opcode := A_MOV;
  5259. taicpu(p).changeopsize(S_W);
  5260. setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
  5261. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  5262. Result := True;
  5263. end;
  5264. end;
  5265. {$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax }
  5266. S_BL:
  5267. begin
  5268. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
  5269. (
  5270. not IsMOVZXAcceptable
  5271. { and $0xff,%eax has a smaller encoding but risks a partial write penalty }
  5272. or (
  5273. (cs_opt_size in current_settings.optimizerswitches) and
  5274. (taicpu(p).oper[1]^.reg = NR_EAX)
  5275. )
  5276. ) then
  5277. { Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
  5278. begin
  5279. DebugMsg(SPeepholeOptimization + 'var9',p);
  5280. taicpu(p).opcode := A_AND;
  5281. taicpu(p).changeopsize(S_L);
  5282. taicpu(p).loadConst(0,$ff);
  5283. Result := True;
  5284. end
  5285. else if not IsMOVZXAcceptable and
  5286. GetNextInstruction(p, hp1) and
  5287. (tai(hp1).typ = ait_instruction) and
  5288. (taicpu(hp1).opcode = A_AND) and
  5289. MatchOpType(taicpu(hp1),top_const,top_reg) and
  5290. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  5291. { Change "movzbl %reg1, %reg2; andl $const, %reg2"
  5292. to "movl %reg1, reg2; andl $(const1 and $ff), %reg2"}
  5293. begin
  5294. DebugMsg(SPeepholeOptimization + 'var10',p);
  5295. taicpu(p).opcode := A_MOV;
  5296. taicpu(p).changeopsize(S_L);
  5297. { do not use R_SUBWHOLE
  5298. as movl %rdx,%eax
  5299. is invalid in assembler PM }
  5300. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  5301. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  5302. Result := True;
  5303. end;
  5304. end;
  5305. {$endif i8086}
  5306. S_WL:
  5307. if not IsMOVZXAcceptable then
  5308. begin
  5309. if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
  5310. { Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
  5311. begin
  5312. DebugMsg(SPeepholeOptimization + 'var11',p);
  5313. taicpu(p).opcode := A_AND;
  5314. taicpu(p).changeopsize(S_L);
  5315. taicpu(p).loadConst(0,$ffff);
  5316. Result := True;
  5317. end
  5318. else if GetNextInstruction(p, hp1) and
  5319. (tai(hp1).typ = ait_instruction) and
  5320. (taicpu(hp1).opcode = A_AND) and
  5321. (taicpu(hp1).oper[0]^.typ = top_const) and
  5322. (taicpu(hp1).oper[1]^.typ = top_reg) and
  5323. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  5324. { Change "movzwl %reg1, %reg2; andl $const, %reg2"
  5325. to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2"}
  5326. begin
  5327. DebugMsg(SPeepholeOptimization + 'var12',p);
  5328. taicpu(p).opcode := A_MOV;
  5329. taicpu(p).changeopsize(S_L);
  5330. { do not use R_SUBWHOLE
  5331. as movl %rdx,%eax
  5332. is invalid in assembler PM }
  5333. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  5334. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
  5335. Result := True;
  5336. end;
  5337. end;
  5338. else
  5339. InternalError(2017050705);
  5340. end;
  5341. end
  5342. else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
  5343. begin
  5344. if GetNextInstruction(p, hp1) and
  5345. (tai(hp1).typ = ait_instruction) and
  5346. (taicpu(hp1).opcode = A_AND) and
  5347. MatchOpType(taicpu(hp1),top_const,top_reg) and
  5348. (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
  5349. begin
  5350. //taicpu(p).opcode := A_MOV;
  5351. case taicpu(p).opsize Of
  5352. S_BL:
  5353. begin
  5354. DebugMsg(SPeepholeOptimization + 'var13',p);
  5355. taicpu(hp1).changeopsize(S_L);
  5356. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  5357. end;
  5358. S_WL:
  5359. begin
  5360. DebugMsg(SPeepholeOptimization + 'var14',p);
  5361. taicpu(hp1).changeopsize(S_L);
  5362. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
  5363. end;
  5364. S_BW:
  5365. begin
  5366. DebugMsg(SPeepholeOptimization + 'var15',p);
  5367. taicpu(hp1).changeopsize(S_W);
  5368. taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
  5369. end;
  5370. else
  5371. Internalerror(2017050704)
  5372. end;
  5373. Result := True;
  5374. end;
  5375. end;
  5376. end;
  5377. end;
{ Pass-1 peephole optimiser for AND.  Folds AND/AND pairs, removes
  zero/sign extensions made redundant by the mask, drops an AND whose
  masked bits are entirely shifted away by a following SHL, turns
  AND+Jcc into TEST+Jcc when the register dies, and simplifies "lone"
  ANDs whose mask covers the whole register.  Returns True when p was
  changed or removed, so the optimiser revisits the site. }
function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
  var
    hp1 : tai;
    MaskLength : Cardinal;
  begin
    Result:=false;

    if GetNextInstruction(p, hp1) then
      begin
        if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_AND,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
          { the second register must contain the first one, so compare their subreg types }
          (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
          { the combined constant must stay below $80000000 in magnitude }
          (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
          { change
              and const1, reg
              and const2, reg
            to
              and (const1 and const2), reg }
          begin
            taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
            DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
            RemoveCurrentP(p, hp1);
            Result:=true;
            exit;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_MOVZX,[]) and
          MatchOpType(taicpu(hp1),top_reg,top_reg) and
          SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
          { the MOVZX must extend the register into itself }
          (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
          { the operand sizes of the AND and the MOVZX must line up }
          (((taicpu(p).opsize=S_W) and
            (taicpu(hp1).opsize=S_BW)) or
           ((taicpu(p).opsize=S_L) and
            (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}]))
{$ifdef x86_64}
           or
           ((taicpu(p).opsize=S_Q) and
            (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL]))
{$endif x86_64}
          ) then
          begin
            { if the AND mask already clears every bit above the MOVZX
              source width, the zero-extension is a no-op and can go }
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
            then
              begin
                { Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
                  32-bit register to a 64-bit register, or even a version called MOVZXD, so
                  code that tests for the presence of AND 0xffffffff followed by MOVZX is
                  wasted, and is indicative of a compiler bug if it were triggered. [Kit]

                  NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
                }
                DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
                RemoveInstruction(hp1);
                Exit;
              end;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_SHL,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
          (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
          begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
            { get length of potential and mask }
            { NOTE(review): for a mask like $ff this evaluates to 56 rather
              than 8, so the "(1 shl MaskLength)-1" test below can only ever
              match $FFFFFFFF.  BsrQWord(val)+1 looks like the intended
              value — confirm against topsize2memsize's unit (bits?) and the
              BsrQWord(0) corner case before changing it. }
            MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;

            { really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
            if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
              { unmasked part shifted out? }
              ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
              begin
                DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
                RemoveCurrentP(p, hp1);
                Result:=true;
                exit;
              end;
          end
        else if MatchOpType(taicpu(p),top_const,top_reg) and
          MatchInstruction(hp1,A_MOVSX{$ifdef x86_64},A_MOVSXD{$endif x86_64},[]) and
          (taicpu(hp1).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
          { the MOVSX must extend the register into itself }
          (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
          (((taicpu(p).opsize=S_W) and
            (taicpu(hp1).opsize=S_BW)) or
           ((taicpu(p).opsize=S_L) and
            (taicpu(hp1).opsize in [S_WL,S_BL]))
{$ifdef x86_64}
           or
           ((taicpu(p).opsize=S_Q) and
            (taicpu(hp1).opsize in [S_BQ,S_WQ,S_LQ]))
{$endif x86_64}
          ) then
          begin
            { the sign-extension is redundant when the AND mask forces the
              source's sign bit (and everything above it) to zero }
            if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7f)=taicpu(p).oper[0]^.val)
               ) or
               (((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
                ((taicpu(p).oper[0]^.val and $7fff)=taicpu(p).oper[0]^.val))
{$ifdef x86_64}
               or
               (((taicpu(hp1).opsize)=S_LQ) and
                ((taicpu(p).oper[0]^.val and $7fffffff)=taicpu(p).oper[0]^.val)
               )
{$endif x86_64}
            then
              begin
                DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
                RemoveInstruction(hp1);
                Exit;
              end;
          end
        else if (taicpu(p).oper[1]^.typ = top_reg) and
          (hp1.typ = ait_instruction) and
          (taicpu(hp1).is_jmp) and
          (taicpu(hp1).opcode<>A_JMP) and
          not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
          begin
            { change
                and x, reg
                jxx
              to
                test x, reg
                jxx
              if reg is deallocated before the
              jump, but only if it's a conditional jump (PFV) }
            taicpu(p).opcode := A_TEST;
            Exit;
          end;
      end;

    { Lone AND tests }
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        {
          - Convert and $0xFF,reg to and reg,reg if reg is 8-bit
          - Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
          - Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
        }
        if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
          ((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
          ((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
          begin
            taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
            if taicpu(p).opsize = S_L then
              begin
                { flag the follow-up MovAnd2Mov check for a later pass }
                Include(OptsToCheck,aoc_MovAnd2Mov_3);
                Result := True;
              end;
          end;
      end;
  end;
{ Pass-2: turns a LEA that only sums two registers into an ADD on the
  destination register.  Returns True if p was rewritten. }
function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  begin
    Result:=false;
    { ADD clobbers the flags while LEA does not, so both rewrites
      require the flags to be dead at this point }
    if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
      MatchReference(taicpu(p).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
      (taicpu(p).oper[0]^.ref^.index<>NR_NO) then
      { lea (%regX,%regIdx),%regX -> add %regIdx,%regX }
      begin
        { operand 0 (the reference) is replaced last, so that both of its
          register fields are still readable when they are fetched }
        taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.base);
        taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.index);
        taicpu(p).opcode:=A_ADD;
        DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
        result:=true;
      end
    else if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) and
      MatchReference(taicpu(p).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) and
      (taicpu(p).oper[0]^.ref^.base<>NR_NO) then
      { lea (%regBase,%regX),%regX -> add %regBase,%regX }
      begin
        taicpu(p).loadreg(1,taicpu(p).oper[0]^.ref^.index);
        taicpu(p).loadreg(0,taicpu(p).oper[0]^.ref^.base);
        taicpu(p).opcode:=A_ADD;
        DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
        result:=true;
      end;
  end;
function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
  var
    hp1: tai; NewRef: TReference;
  begin
    { Change:
        subl/q $x,%reg1
        movl/q %reg1,%reg2
      To:
        leal/q $-x(%reg1),%reg2
        subl/q $x,%reg1

      Breaks the dependency chain and potentially permits the removal of
      a CMP instruction if one follows.
    }
    Result := False;
    { not worthwhile when optimising for size }
    if not (cs_opt_size in current_settings.optimizerswitches) and
      (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
      MatchOpType(taicpu(p),top_const,top_reg) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      { the MOV must copy the register that the SUB just modified }
      MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
      begin
        { Change the MOV instruction to a LEA instruction, and update the
          first operand }
        reference_reset(NewRef, 1, []);
        NewRef.base := taicpu(p).oper[1]^.reg;
        NewRef.scalefactor := 1;
        { the LEA runs before the SUB, so -x(%reg1) on the unmodified
          %reg1 yields the same value the MOV would have copied }
        NewRef.offset := -taicpu(p).oper[0]^.val;
        taicpu(hp1).opcode := A_LEA;
        taicpu(hp1).loadref(0, NewRef);
        { Move what is now the LEA instruction to before the SUB instruction }
        Asml.Remove(hp1);
        Asml.InsertBefore(hp1, p);
        { keep the destination register marked as allocated over the new span }
        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
        DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
        Result := True;
      end;
  end;
  5601. function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
  5602. begin
  5603. { we can skip all instructions not messing with the stack pointer }
  5604. while assigned(hp1) and {MatchInstruction(hp1,[A_LEA,A_MOV,A_MOVQ,A_MOVSQ,A_MOVSX,A_MOVSXD,A_MOVZX,
  5605. A_AND,A_OR,A_XOR,A_ADD,A_SHR,A_SHL,A_IMUL,A_SETcc,A_SAR,A_SUB,A_TEST,A_CMOVcc,
  5606. A_MOVSS,A_MOVSD,A_MOVAPS,A_MOVUPD,A_MOVAPD,A_MOVUPS,
  5607. A_VMOVSS,A_VMOVSD,A_VMOVAPS,A_VMOVUPD,A_VMOVAPD,A_VMOVUPS],[]) and}
  5608. ({(taicpu(hp1).ops=0) or }
  5609. ({(MatchOpType(taicpu(hp1),top_reg,top_reg) or MatchOpType(taicpu(hp1),top_const,top_reg) or
  5610. (MatchOpType(taicpu(hp1),top_ref,top_reg))
  5611. ) and }
  5612. not(RegInInstruction(NR_STACK_POINTER_REG,hp1)) { and not(RegInInstruction(NR_FRAME_POINTER_REG,hp1))}
  5613. )
  5614. ) do
  5615. GetNextInstruction(hp1,hp1);
  5616. Result:=assigned(hp1);
  5617. end;
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;

    { replace
        leal(q) x(<stackpointer>),<stackpointer>
        call   procname
        leal(q) -x(<stackpointer>),<stackpointer>
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_ref,top_reg) and
      (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and
      { the -8 or -24 are not required, but bail out early if possible,
        higher values are unlikely }
      ((taicpu(p).oper[0]^.ref^.offset=-8) or
       (taicpu(p).oper[0]^.ref^.offset=-24)) and
      { the adjustment must be a plain displacement: no symbol, no
        segment override }
      (taicpu(p).oper[0]^.ref^.symbol=nil) and
      (taicpu(p).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(p).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      { the second LEA must exactly undo the first one's adjustment }
      MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_ref,top_reg) and
      (taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
      (taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
      (taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
      (taicpu(hp2).oper[0]^.ref^.symbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
      (taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
      (taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        { turn the call into a tail jump and delete the stack adjustment
          and the ret; hp4 still points at the original successor of p,
          which RemoveCurrentP uses as the new current position }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
        RemoveCurrentP(p, hp4);
        RemoveInstruction(hp2);
        RemoveInstruction(hp3);
        Result:=true;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
  var
    hp1, hp2, hp3, hp4: tai;
  begin
    Result:=false;
{$ifdef x86_64}
    { replace
        push %rax
        call procname
        pop %rcx
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces

      It depends on the fact, that the sequence push rax/pop rcx is used for stack alignment as rcx is volatile
      for all supported calling conventions
    }
    if (cs_opt_level4 in current_settings.optimizerswitches) and
      MatchOpType(taicpu(p),top_reg) and
      (taicpu(p).oper[0]^.reg=NR_RAX) and
      GetNextInstruction(p, hp1) and
      { Take a copy of hp1 }
      SetAndTest(hp1, hp4) and
      { trick to skip label }
      ((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
      SkipSimpleInstructions(hp1) and
      MatchInstruction(hp1,A_CALL,[S_NO]) and
      GetNextInstruction(hp1, hp2) and
      MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
      MatchOpType(taicpu(hp2),top_reg) and
      (taicpu(hp2).oper[0]^.reg=NR_RCX) and
      GetNextInstruction(hp2, hp3) and
      { trick to skip label }
      ((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
      MatchInstruction(hp3,A_RET,[S_NO]) and
      (taicpu(hp3).ops=0) then
      begin
        { rewrite the call as a tail jump; hp4 (the instruction after the
          push) is handed to RemoveCurrentP as the new current position }
        taicpu(hp1).opcode := A_JMP;
        taicpu(hp1).is_jmp := true;
        DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
        RemoveCurrentP(p, hp4);
        RemoveInstruction(hp2);
        RemoveInstruction(hp3);
        Result:=true;
      end;
{$endif x86_64}
  end;
{ Post-peephole shrinking of "mov $const,%reg":
    $0             -> xor %reg,%reg       (flags must be dead)
    $1..$FFFFFFFF  -> 32-bit mov on a 64-bit reg (upper half zeroed)
    $-1            -> or $-1,%reg         (size optimisation; dead flags) }
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
  var
    Value, RegName: string;
  begin
    Result:=false;
    if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
      begin
        case taicpu(p).oper[0]^.val of
          0:
            { Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
            if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $0,%reg" into "xor %reg,%reg" }
                taicpu(p).opcode := A_XOR;
                taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
                Result := True;
              end;
          $1..$FFFFFFFF:
            begin
              { Code size reduction by J. Gareth "Kit" Moreton }
              { change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
              case taicpu(p).opsize of
                S_Q:
                  begin
                    { capture the names for the debug message before rewriting }
                    RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
                    Value := debug_tostr(taicpu(p).oper[0]^.val);

                    { The actual optimization }
                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
                    taicpu(p).changeopsize(S_L);

                    DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
                    Result := True;
                  end;
                else
                  { Do nothing };
              end;
            end;
          -1:
            { Don't make this optimisation if the CPU flags are required, since OR scrambles them }
            if (cs_opt_size in current_settings.optimizerswitches) and
              (taicpu(p).opsize <> S_B) and
              not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
              begin
                { change "mov $-1,%reg" into "or $-1,%reg" }

                { NOTES:
                  - No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
                  - This operation creates a false dependency on the register, so only do it when optimising for size
                  - It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
                }
                taicpu(p).opcode := A_OR;
                Result := True;
              end;
        end;
      end;
  end;
  5777. function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  5778. begin
  5779. Result := False;
  5780. if not MatchOpType(taicpu(p), top_reg, top_reg) then
  5781. Exit;
  5782. { Convert:
  5783. movswl %ax,%eax -> cwtl
  5784. movslq %eax,%rax -> cdqe
  5785. NOTE: Don't convert movswl %al,%ax to cbw, because cbw and cwde
  5786. refer to the same opcode and depends only on the assembler's
  5787. current operand-size attribute. [Kit]
  5788. }
  5789. with taicpu(p) do
  5790. case opsize of
  5791. S_WL:
  5792. if (oper[0]^.reg = NR_AX) and (oper[1]^.reg = NR_EAX) then
  5793. begin
  5794. DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
  5795. opcode := A_CWDE;
  5796. clearop(0);
  5797. clearop(1);
  5798. ops := 0;
  5799. Result := True;
  5800. end;
  5801. {$ifdef x86_64}
  5802. S_LQ:
  5803. if (oper[0]^.reg = NR_EAX) and (oper[1]^.reg = NR_RAX) then
  5804. begin
  5805. DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
  5806. opcode := A_CDQE;
  5807. clearop(0);
  5808. clearop(1);
  5809. ops := 0;
  5810. Result := True;
  5811. end;
  5812. {$endif x86_64}
  5813. else
  5814. ;
  5815. end;
  5816. end;
  5817. function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  5818. begin
  5819. Result:=false;
  5820. { change "cmp $0, %reg" to "test %reg, %reg" }
  5821. if MatchOpType(taicpu(p),top_const,top_reg) and
  5822. (taicpu(p).oper[0]^.val = 0) then
  5823. begin
  5824. taicpu(p).opcode := A_TEST;
  5825. taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
  5826. Result:=true;
  5827. end;
  5828. end;
{ Post-peephole TEST/OR handler: removes a TEST/OR that merely re-tests
  the result of the directly preceding flag-setting instruction, and
  canonicalises "test $-1,%reg" into "test %reg,%reg". }
function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
  var
    IsTestConstX : Boolean;
    hp1,hp2 : tai;
  begin
    Result:=false;
    { removes the line marked with (x) from the sequence
      and/or/xor/add/sub/... $x, %y
      test/or %y, %y  |  test $-1, %y    (x)
      j(n)z _Label
         as the first instruction already adjusts the ZF
         %y operand may also be a reference }
    IsTestConstX:=(taicpu(p).opcode=A_TEST) and
      MatchOperand(taicpu(p).oper[0]^,-1);
    if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
      GetLastInstruction(p, hp1) and
      (tai(hp1).typ = ait_instruction) and
      GetNextInstruction(p,hp2) and
      { only fire when the next instruction is a conditional consumer of
        the flags }
      MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
      case taicpu(hp1).opcode Of
        A_ADD, A_SUB, A_OR, A_XOR, A_AND:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC                  }
              ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
               ((taicpu(hp1).opcode <> A_ADD) and
                (taicpu(hp1).opcode <> A_SUB))) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
              end;
          end;
        A_SHL, A_SAL, A_SHR, A_SAR:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
              { therefore, it's only safe to do this optimization for     }
              { shifts by a (nonzero) constant                            }
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (taicpu(hp1).oper[0]^.val <> 0) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC                  }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
              end;
          end;
        A_DEC, A_INC, A_NEG:
          begin
            if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC                  }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                case taicpu(hp1).opcode of
                  A_DEC, A_INC:
                    { replace inc/dec with add/sub 1, because inc/dec doesn't set the carry flag }
                    begin
                      case taicpu(hp1).opcode Of
                        A_DEC: taicpu(hp1).opcode := A_SUB;
                        A_INC: taicpu(hp1).opcode := A_ADD;
                        else
                          ;
                      end;
                      taicpu(hp1).loadoper(1,taicpu(hp1).oper[0]^);
                      taicpu(hp1).loadConst(0,1);
                      taicpu(hp1).ops:=2;
                    end;
                  else
                    ;
                end;
                RemoveCurrentP(p, hp2);
                Result:=true;
              end;
          end
        else
          { change "test  $-1,%reg" into "test %reg,%reg" }
          if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
            taicpu(p).loadoper(0,taicpu(p).oper[1]^);
      end { case }
    { change "test  $-1,%reg" into "test %reg,%reg" }
    else if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
      taicpu(p).loadoper(0,taicpu(p).oper[1]^);
  end;
{ Post-peephole CALL handler:
    - (pre-P2, non-PIC i386 only) call + "jmp target" becomes
      push target + jmp callee, so the callee's ret lands on target;
    - call + ret becomes jmp (level 4), or for noreturn routines the
      ret is simply dropped. }
function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
  var
    hp1 : tai;
{$ifndef x86_64}
    hp2 : taicpu;
{$endif x86_64}
  begin
    Result:=false;
{$ifndef x86_64}
    { don't do this on modern CPUs, this really hurts them due to
      broken call/ret pairing }
    if (current_settings.optimizecputype < cpu_Pentium2) and
      not(cs_create_pic in current_settings.moduleswitches) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_JMP,[S_NO]) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
      begin
        { push the jump target as the fake return address, then jump
          straight to the callee }
        hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
        InsertLLItem(p.previous, p, hp2);
        taicpu(p).opcode := A_JMP;
        taicpu(p).is_jmp := true;
        RemoveInstruction(hp1);
        Result:=true;
      end
    else
{$endif x86_64}
    { replace
        call procname
        ret
      by
        jmp procname

      but do it only on level 4 because it destroys stack back traces

      else if the subroutine is marked as no return, remove the ret
    }
    if ((cs_opt_level4 in current_settings.optimizerswitches) or
      (po_noreturn in current_procinfo.procdef.procoptions)) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_RET,[S_NO]) and
      (taicpu(hp1).ops=0) then
      begin
        if (cs_opt_level4 in current_settings.optimizerswitches) and
          { we might destroy stack alignment here if we do not do a call }
          (target_info.stackalign<=sizeof(SizeUInt)) then
          begin
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
          end
        else
          { noreturn case: keep the call, just drop the unreachable ret }
          DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
        RemoveInstruction(hp1);
        Result:=true;
      end;
  end;
  5970. {$ifdef x86_64}
  5971. function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
  5972. var
  5973. PreMessage: string;
  5974. begin
  5975. Result := False;
  5976. { Code size reduction by J. Gareth "Kit" Moreton }
  5977. { Convert MOVZBQ and MOVZWQ to MOVZBL and MOVZWL respectively if it removes the REX prefix }
  5978. if (taicpu(p).opsize in [S_BQ, S_WQ]) and
  5979. (getsupreg(taicpu(p).oper[1]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP])
  5980. then
  5981. begin
  5982. { Has 64-bit register name and opcode suffix }
  5983. PreMessage := 'movz' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' -> movz';
  5984. { The actual optimization }
  5985. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  5986. if taicpu(p).opsize = S_BQ then
  5987. taicpu(p).changeopsize(S_BL)
  5988. else
  5989. taicpu(p).changeopsize(S_WL);
  5990. DebugMsg(SPeepholeOptimization + PreMessage +
  5991. debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (removes REX prefix)', p);
  5992. end;
  5993. end;
  5994. function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  5995. var
  5996. PreMessage, RegName: string;
  5997. begin
  5998. { Code size reduction by J. Gareth "Kit" Moreton }
  5999. { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
  6000. as this removes the REX prefix }
  6001. Result := False;
  6002. if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
  6003. Exit;
  6004. if taicpu(p).oper[0]^.typ <> top_reg then
  6005. { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
  6006. InternalError(2018011500);
  6007. case taicpu(p).opsize of
  6008. S_Q:
  6009. begin
  6010. if (getsupreg(taicpu(p).oper[0]^.reg) in [RS_RAX, RS_RCX, RS_RDX, RS_RBX, RS_RSI, RS_RDI, RS_RBP, RS_RSP]) then
  6011. begin
  6012. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
  6013. PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';
  6014. { The actual optimization }
  6015. setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
  6016. setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
  6017. taicpu(p).changeopsize(S_L);
  6018. RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }
  6019. DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (removes REX prefix)', p);
  6020. end;
  6021. end;
  6022. else
  6023. ;
  6024. end;
  6025. end;
  6026. {$endif}
  6027. class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  6028. var
  6029. OperIdx: Integer;
  6030. begin
  6031. for OperIdx := 0 to p.ops - 1 do
  6032. if p.oper[OperIdx]^.typ = top_ref then
  6033. optimize_ref(p.oper[OperIdx]^.ref^, False);
  6034. end;
  6035. end.