row_gcc.cc 231 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383
238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047
21472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534
  1. // VERSION 2
  2. /*
  3. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  4. *
  5. * Use of this source code is governed by a BSD-style license
  6. * that can be found in the LICENSE file in the root of the source
  7. * tree. An additional intellectual property rights grant can be found
  8. * in the file PATENTS. All contributing project authors may
  9. * be found in the AUTHORS file in the root of the source tree.
  10. */
  11. #include "libyuv/row.h"
  12. #ifdef __cplusplus
  13. namespace libyuv {
  14. extern "C" {
  15. #endif
  16. // This module is for GCC x86 and x64.
  17. #if !defined(LIBYUV_DISABLE_X86) && \
  18. (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
  19. #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
  20. // Constants for ARGB
  21. static vec8 kARGBToY = {
  22. 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
  23. };
  24. // JPeg full range.
  25. static vec8 kARGBToYJ = {
  26. 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
  27. };
  28. #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
  29. #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
  30. static vec8 kARGBToU = {
  31. 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
  32. };
  33. static vec8 kARGBToUJ = {
  34. 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
  35. };
  36. static vec8 kARGBToV = {
  37. -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
  38. };
  39. static vec8 kARGBToVJ = {
  40. -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
  41. };
  42. // Constants for BGRA
  43. static vec8 kBGRAToY = {
  44. 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
  45. };
  46. static vec8 kBGRAToU = {
  47. 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
  48. };
  49. static vec8 kBGRAToV = {
  50. 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
  51. };
  52. // Constants for ABGR
  53. static vec8 kABGRToY = {
  54. 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
  55. };
  56. static vec8 kABGRToU = {
  57. -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
  58. };
  59. static vec8 kABGRToV = {
  60. 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
  61. };
  62. // Constants for RGBA.
  63. static vec8 kRGBAToY = {
  64. 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
  65. };
  66. static vec8 kRGBAToU = {
  67. 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
  68. };
  69. static vec8 kRGBAToV = {
  70. 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
  71. };
  72. static uvec8 kAddY16 = {
  73. 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
  74. };
  75. // 7 bit fixed point 0.5.
  76. static vec16 kAddYJ64 = {
  77. 64, 64, 64, 64, 64, 64, 64, 64
  78. };
  79. static uvec8 kAddUV128 = {
  80. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  81. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
  82. };
  83. static uvec16 kAddUVJ128 = {
  84. 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
  85. };
  86. #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
  87. #ifdef HAS_RGB24TOARGBROW_SSSE3
  88. // Shuffle table for converting RGB24 to ARGB.
  89. static uvec8 kShuffleMaskRGB24ToARGB = {
  90. 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
  91. };
  92. // Shuffle table for converting RAW to ARGB.
  93. static uvec8 kShuffleMaskRAWToARGB = {
  94. 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
  95. };
  96. // Shuffle table for converting RAW to RGB24. First 8.
  97. static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  98. 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  99. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
  100. };
  101. // Shuffle table for converting RAW to RGB24. Middle 8.
  102. static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  103. 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  104. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
  105. };
  106. // Shuffle table for converting RAW to RGB24. Last 8.
  107. static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  108. 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  109. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
  110. };
  111. // Shuffle table for converting ARGB to RGB24.
  112. static uvec8 kShuffleMaskARGBToRGB24 = {
  113. 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
  114. };
  115. // Shuffle table for converting ARGB to RAW.
  116. static uvec8 kShuffleMaskARGBToRAW = {
  117. 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
  118. };
  119. // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
  120. static uvec8 kShuffleMaskARGBToRGB24_0 = {
  121. 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
  122. };
  123. // YUY2 shuf 16 Y to 32 Y.
  124. static const lvec8 kShuffleYUY2Y = {
  125. 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  126. 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
  127. };
  128. // YUY2 shuf 8 UV to 16 UV.
  129. static const lvec8 kShuffleYUY2UV = {
  130. 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  131. 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
  132. };
  133. // UYVY shuf 16 Y to 32 Y.
  134. static const lvec8 kShuffleUYVYY = {
  135. 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  136. 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
  137. };
  138. // UYVY shuf 8 UV to 16 UV.
  139. static const lvec8 kShuffleUYVYUV = {
  140. 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  141. 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
  142. };
  143. // NV21 shuf 8 VU to 16 UV.
  144. static const lvec8 kShuffleNV21 = {
  145. 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  146. 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  147. };
  148. #endif // HAS_RGB24TOARGBROW_SSSE3
  149. #ifdef HAS_J400TOARGBROW_SSE2
  150. void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  151. asm volatile (
  152. "pcmpeqb %%xmm5,%%xmm5 \n"
  153. "pslld $0x18,%%xmm5 \n"
  154. LABELALIGN
  155. "1: \n"
  156. "movq " MEMACCESS(0) ",%%xmm0 \n"
  157. "lea " MEMLEA(0x8,0) ",%0 \n"
  158. "punpcklbw %%xmm0,%%xmm0 \n"
  159. "movdqa %%xmm0,%%xmm1 \n"
  160. "punpcklwd %%xmm0,%%xmm0 \n"
  161. "punpckhwd %%xmm1,%%xmm1 \n"
  162. "por %%xmm5,%%xmm0 \n"
  163. "por %%xmm5,%%xmm1 \n"
  164. "movdqu %%xmm0," MEMACCESS(1) " \n"
  165. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  166. "lea " MEMLEA(0x20,1) ",%1 \n"
  167. "sub $0x8,%2 \n"
  168. "jg 1b \n"
  169. : "+r"(src_y), // %0
  170. "+r"(dst_argb), // %1
  171. "+r"(width) // %2
  172. :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  173. );
  174. }
  175. #endif // HAS_J400TOARGBROW_SSE2
  176. #ifdef HAS_RGB24TOARGBROW_SSSE3
  177. void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  178. asm volatile (
  179. "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
  180. "pslld $0x18,%%xmm5 \n"
  181. "movdqa %3,%%xmm4 \n"
  182. LABELALIGN
  183. "1: \n"
  184. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  185. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  186. "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
  187. "lea " MEMLEA(0x30,0) ",%0 \n"
  188. "movdqa %%xmm3,%%xmm2 \n"
  189. "palignr $0x8,%%xmm1,%%xmm2 \n"
  190. "pshufb %%xmm4,%%xmm2 \n"
  191. "por %%xmm5,%%xmm2 \n"
  192. "palignr $0xc,%%xmm0,%%xmm1 \n"
  193. "pshufb %%xmm4,%%xmm0 \n"
  194. "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
  195. "por %%xmm5,%%xmm0 \n"
  196. "pshufb %%xmm4,%%xmm1 \n"
  197. "movdqu %%xmm0," MEMACCESS(1) " \n"
  198. "por %%xmm5,%%xmm1 \n"
  199. "palignr $0x4,%%xmm3,%%xmm3 \n"
  200. "pshufb %%xmm4,%%xmm3 \n"
  201. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  202. "por %%xmm5,%%xmm3 \n"
  203. "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
  204. "lea " MEMLEA(0x40,1) ",%1 \n"
  205. "sub $0x10,%2 \n"
  206. "jg 1b \n"
  207. : "+r"(src_rgb24), // %0
  208. "+r"(dst_argb), // %1
  209. "+r"(width) // %2
  210. : "m"(kShuffleMaskRGB24ToARGB) // %3
  211. : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  212. );
  213. }
  214. void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  215. asm volatile (
  216. "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
  217. "pslld $0x18,%%xmm5 \n"
  218. "movdqa %3,%%xmm4 \n"
  219. LABELALIGN
  220. "1: \n"
  221. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  222. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  223. "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
  224. "lea " MEMLEA(0x30,0) ",%0 \n"
  225. "movdqa %%xmm3,%%xmm2 \n"
  226. "palignr $0x8,%%xmm1,%%xmm2 \n"
  227. "pshufb %%xmm4,%%xmm2 \n"
  228. "por %%xmm5,%%xmm2 \n"
  229. "palignr $0xc,%%xmm0,%%xmm1 \n"
  230. "pshufb %%xmm4,%%xmm0 \n"
  231. "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
  232. "por %%xmm5,%%xmm0 \n"
  233. "pshufb %%xmm4,%%xmm1 \n"
  234. "movdqu %%xmm0," MEMACCESS(1) " \n"
  235. "por %%xmm5,%%xmm1 \n"
  236. "palignr $0x4,%%xmm3,%%xmm3 \n"
  237. "pshufb %%xmm4,%%xmm3 \n"
  238. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  239. "por %%xmm5,%%xmm3 \n"
  240. "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
  241. "lea " MEMLEA(0x40,1) ",%1 \n"
  242. "sub $0x10,%2 \n"
  243. "jg 1b \n"
  244. : "+r"(src_raw), // %0
  245. "+r"(dst_argb), // %1
  246. "+r"(width) // %2
  247. : "m"(kShuffleMaskRAWToARGB) // %3
  248. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  249. );
  250. }
  251. void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  252. asm volatile (
  253. "movdqa %3,%%xmm3 \n"
  254. "movdqa %4,%%xmm4 \n"
  255. "movdqa %5,%%xmm5 \n"
  256. LABELALIGN
  257. "1: \n"
  258. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  259. "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
  260. "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
  261. "lea " MEMLEA(0x18,0) ",%0 \n"
  262. "pshufb %%xmm3,%%xmm0 \n"
  263. "pshufb %%xmm4,%%xmm1 \n"
  264. "pshufb %%xmm5,%%xmm2 \n"
  265. "movq %%xmm0," MEMACCESS(1) " \n"
  266. "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
  267. "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
  268. "lea " MEMLEA(0x18,1) ",%1 \n"
  269. "sub $0x8,%2 \n"
  270. "jg 1b \n"
  271. : "+r"(src_raw), // %0
  272. "+r"(dst_rgb24), // %1
  273. "+r"(width) // %2
  274. : "m"(kShuffleMaskRAWToRGB24_0), // %3
  275. "m"(kShuffleMaskRAWToRGB24_1), // %4
  276. "m"(kShuffleMaskRAWToRGB24_2) // %5
  277. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  278. );
  279. }
  280. void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  281. asm volatile (
  282. "mov $0x1080108,%%eax \n"
  283. "movd %%eax,%%xmm5 \n"
  284. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  285. "mov $0x20802080,%%eax \n"
  286. "movd %%eax,%%xmm6 \n"
  287. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  288. "pcmpeqb %%xmm3,%%xmm3 \n"
  289. "psllw $0xb,%%xmm3 \n"
  290. "pcmpeqb %%xmm4,%%xmm4 \n"
  291. "psllw $0xa,%%xmm4 \n"
  292. "psrlw $0x5,%%xmm4 \n"
  293. "pcmpeqb %%xmm7,%%xmm7 \n"
  294. "psllw $0x8,%%xmm7 \n"
  295. "sub %0,%1 \n"
  296. "sub %0,%1 \n"
  297. LABELALIGN
  298. "1: \n"
  299. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  300. "movdqa %%xmm0,%%xmm1 \n"
  301. "movdqa %%xmm0,%%xmm2 \n"
  302. "pand %%xmm3,%%xmm1 \n"
  303. "psllw $0xb,%%xmm2 \n"
  304. "pmulhuw %%xmm5,%%xmm1 \n"
  305. "pmulhuw %%xmm5,%%xmm2 \n"
  306. "psllw $0x8,%%xmm1 \n"
  307. "por %%xmm2,%%xmm1 \n"
  308. "pand %%xmm4,%%xmm0 \n"
  309. "pmulhuw %%xmm6,%%xmm0 \n"
  310. "por %%xmm7,%%xmm0 \n"
  311. "movdqa %%xmm1,%%xmm2 \n"
  312. "punpcklbw %%xmm0,%%xmm1 \n"
  313. "punpckhbw %%xmm0,%%xmm2 \n"
  314. MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
  315. MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
  316. "lea " MEMLEA(0x10,0) ",%0 \n"
  317. "sub $0x8,%2 \n"
  318. "jg 1b \n"
  319. : "+r"(src), // %0
  320. "+r"(dst), // %1
  321. "+r"(width) // %2
  322. :
  323. : "memory", "cc", "eax", NACL_R14
  324. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  325. );
  326. }
  327. void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  328. asm volatile (
  329. "mov $0x1080108,%%eax \n"
  330. "movd %%eax,%%xmm5 \n"
  331. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  332. "mov $0x42004200,%%eax \n"
  333. "movd %%eax,%%xmm6 \n"
  334. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  335. "pcmpeqb %%xmm3,%%xmm3 \n"
  336. "psllw $0xb,%%xmm3 \n"
  337. "movdqa %%xmm3,%%xmm4 \n"
  338. "psrlw $0x6,%%xmm4 \n"
  339. "pcmpeqb %%xmm7,%%xmm7 \n"
  340. "psllw $0x8,%%xmm7 \n"
  341. "sub %0,%1 \n"
  342. "sub %0,%1 \n"
  343. LABELALIGN
  344. "1: \n"
  345. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  346. "movdqa %%xmm0,%%xmm1 \n"
  347. "movdqa %%xmm0,%%xmm2 \n"
  348. "psllw $0x1,%%xmm1 \n"
  349. "psllw $0xb,%%xmm2 \n"
  350. "pand %%xmm3,%%xmm1 \n"
  351. "pmulhuw %%xmm5,%%xmm2 \n"
  352. "pmulhuw %%xmm5,%%xmm1 \n"
  353. "psllw $0x8,%%xmm1 \n"
  354. "por %%xmm2,%%xmm1 \n"
  355. "movdqa %%xmm0,%%xmm2 \n"
  356. "pand %%xmm4,%%xmm0 \n"
  357. "psraw $0x8,%%xmm2 \n"
  358. "pmulhuw %%xmm6,%%xmm0 \n"
  359. "pand %%xmm7,%%xmm2 \n"
  360. "por %%xmm2,%%xmm0 \n"
  361. "movdqa %%xmm1,%%xmm2 \n"
  362. "punpcklbw %%xmm0,%%xmm1 \n"
  363. "punpckhbw %%xmm0,%%xmm2 \n"
  364. MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
  365. MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
  366. "lea " MEMLEA(0x10,0) ",%0 \n"
  367. "sub $0x8,%2 \n"
  368. "jg 1b \n"
  369. : "+r"(src), // %0
  370. "+r"(dst), // %1
  371. "+r"(width) // %2
  372. :
  373. : "memory", "cc", "eax", NACL_R14
  374. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  375. );
  376. }
  377. void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  378. asm volatile (
  379. "mov $0xf0f0f0f,%%eax \n"
  380. "movd %%eax,%%xmm4 \n"
  381. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  382. "movdqa %%xmm4,%%xmm5 \n"
  383. "pslld $0x4,%%xmm5 \n"
  384. "sub %0,%1 \n"
  385. "sub %0,%1 \n"
  386. LABELALIGN
  387. "1: \n"
  388. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  389. "movdqa %%xmm0,%%xmm2 \n"
  390. "pand %%xmm4,%%xmm0 \n"
  391. "pand %%xmm5,%%xmm2 \n"
  392. "movdqa %%xmm0,%%xmm1 \n"
  393. "movdqa %%xmm2,%%xmm3 \n"
  394. "psllw $0x4,%%xmm1 \n"
  395. "psrlw $0x4,%%xmm3 \n"
  396. "por %%xmm1,%%xmm0 \n"
  397. "por %%xmm3,%%xmm2 \n"
  398. "movdqa %%xmm0,%%xmm1 \n"
  399. "punpcklbw %%xmm2,%%xmm0 \n"
  400. "punpckhbw %%xmm2,%%xmm1 \n"
  401. MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
  402. MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
  403. "lea " MEMLEA(0x10,0) ",%0 \n"
  404. "sub $0x8,%2 \n"
  405. "jg 1b \n"
  406. : "+r"(src), // %0
  407. "+r"(dst), // %1
  408. "+r"(width) // %2
  409. :
  410. : "memory", "cc", "eax", NACL_R14
  411. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  412. );
  413. }
  414. void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  415. asm volatile (
  416. "movdqa %3,%%xmm6 \n"
  417. LABELALIGN
  418. "1: \n"
  419. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  420. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  421. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  422. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  423. "lea " MEMLEA(0x40,0) ",%0 \n"
  424. "pshufb %%xmm6,%%xmm0 \n"
  425. "pshufb %%xmm6,%%xmm1 \n"
  426. "pshufb %%xmm6,%%xmm2 \n"
  427. "pshufb %%xmm6,%%xmm3 \n"
  428. "movdqa %%xmm1,%%xmm4 \n"
  429. "psrldq $0x4,%%xmm1 \n"
  430. "pslldq $0xc,%%xmm4 \n"
  431. "movdqa %%xmm2,%%xmm5 \n"
  432. "por %%xmm4,%%xmm0 \n"
  433. "pslldq $0x8,%%xmm5 \n"
  434. "movdqu %%xmm0," MEMACCESS(1) " \n"
  435. "por %%xmm5,%%xmm1 \n"
  436. "psrldq $0x8,%%xmm2 \n"
  437. "pslldq $0x4,%%xmm3 \n"
  438. "por %%xmm3,%%xmm2 \n"
  439. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  440. "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
  441. "lea " MEMLEA(0x30,1) ",%1 \n"
  442. "sub $0x10,%2 \n"
  443. "jg 1b \n"
  444. : "+r"(src), // %0
  445. "+r"(dst), // %1
  446. "+r"(width) // %2
  447. : "m"(kShuffleMaskARGBToRGB24) // %3
  448. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  449. );
  450. }
  451. void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  452. asm volatile (
  453. "movdqa %3,%%xmm6 \n"
  454. LABELALIGN
  455. "1: \n"
  456. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  457. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  458. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  459. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  460. "lea " MEMLEA(0x40,0) ",%0 \n"
  461. "pshufb %%xmm6,%%xmm0 \n"
  462. "pshufb %%xmm6,%%xmm1 \n"
  463. "pshufb %%xmm6,%%xmm2 \n"
  464. "pshufb %%xmm6,%%xmm3 \n"
  465. "movdqa %%xmm1,%%xmm4 \n"
  466. "psrldq $0x4,%%xmm1 \n"
  467. "pslldq $0xc,%%xmm4 \n"
  468. "movdqa %%xmm2,%%xmm5 \n"
  469. "por %%xmm4,%%xmm0 \n"
  470. "pslldq $0x8,%%xmm5 \n"
  471. "movdqu %%xmm0," MEMACCESS(1) " \n"
  472. "por %%xmm5,%%xmm1 \n"
  473. "psrldq $0x8,%%xmm2 \n"
  474. "pslldq $0x4,%%xmm3 \n"
  475. "por %%xmm3,%%xmm2 \n"
  476. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  477. "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
  478. "lea " MEMLEA(0x30,1) ",%1 \n"
  479. "sub $0x10,%2 \n"
  480. "jg 1b \n"
  481. : "+r"(src), // %0
  482. "+r"(dst), // %1
  483. "+r"(width) // %2
  484. : "m"(kShuffleMaskARGBToRAW) // %3
  485. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  486. );
  487. }
  488. void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  489. asm volatile (
  490. "pcmpeqb %%xmm3,%%xmm3 \n"
  491. "psrld $0x1b,%%xmm3 \n"
  492. "pcmpeqb %%xmm4,%%xmm4 \n"
  493. "psrld $0x1a,%%xmm4 \n"
  494. "pslld $0x5,%%xmm4 \n"
  495. "pcmpeqb %%xmm5,%%xmm5 \n"
  496. "pslld $0xb,%%xmm5 \n"
  497. LABELALIGN
  498. "1: \n"
  499. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  500. "movdqa %%xmm0,%%xmm1 \n"
  501. "movdqa %%xmm0,%%xmm2 \n"
  502. "pslld $0x8,%%xmm0 \n"
  503. "psrld $0x3,%%xmm1 \n"
  504. "psrld $0x5,%%xmm2 \n"
  505. "psrad $0x10,%%xmm0 \n"
  506. "pand %%xmm3,%%xmm1 \n"
  507. "pand %%xmm4,%%xmm2 \n"
  508. "pand %%xmm5,%%xmm0 \n"
  509. "por %%xmm2,%%xmm1 \n"
  510. "por %%xmm1,%%xmm0 \n"
  511. "packssdw %%xmm0,%%xmm0 \n"
  512. "lea " MEMLEA(0x10,0) ",%0 \n"
  513. "movq %%xmm0," MEMACCESS(1) " \n"
  514. "lea " MEMLEA(0x8,1) ",%1 \n"
  515. "sub $0x4,%2 \n"
  516. "jg 1b \n"
  517. : "+r"(src), // %0
  518. "+r"(dst), // %1
  519. "+r"(width) // %2
  520. :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  521. );
  522. }
  523. void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
  524. const uint32 dither4, int width) {
  525. asm volatile (
  526. "movd %3,%%xmm6 \n"
  527. "punpcklbw %%xmm6,%%xmm6 \n"
  528. "movdqa %%xmm6,%%xmm7 \n"
  529. "punpcklwd %%xmm6,%%xmm6 \n"
  530. "punpckhwd %%xmm7,%%xmm7 \n"
  531. "pcmpeqb %%xmm3,%%xmm3 \n"
  532. "psrld $0x1b,%%xmm3 \n"
  533. "pcmpeqb %%xmm4,%%xmm4 \n"
  534. "psrld $0x1a,%%xmm4 \n"
  535. "pslld $0x5,%%xmm4 \n"
  536. "pcmpeqb %%xmm5,%%xmm5 \n"
  537. "pslld $0xb,%%xmm5 \n"
  538. LABELALIGN
  539. "1: \n"
  540. "movdqu (%0),%%xmm0 \n"
  541. "paddusb %%xmm6,%%xmm0 \n"
  542. "movdqa %%xmm0,%%xmm1 \n"
  543. "movdqa %%xmm0,%%xmm2 \n"
  544. "pslld $0x8,%%xmm0 \n"
  545. "psrld $0x3,%%xmm1 \n"
  546. "psrld $0x5,%%xmm2 \n"
  547. "psrad $0x10,%%xmm0 \n"
  548. "pand %%xmm3,%%xmm1 \n"
  549. "pand %%xmm4,%%xmm2 \n"
  550. "pand %%xmm5,%%xmm0 \n"
  551. "por %%xmm2,%%xmm1 \n"
  552. "por %%xmm1,%%xmm0 \n"
  553. "packssdw %%xmm0,%%xmm0 \n"
  554. "lea 0x10(%0),%0 \n"
  555. "movq %%xmm0,(%1) \n"
  556. "lea 0x8(%1),%1 \n"
  557. "sub $0x4,%2 \n"
  558. "jg 1b \n"
  559. : "+r"(src), // %0
  560. "+r"(dst), // %1
  561. "+r"(width) // %2
  562. : "m"(dither4) // %3
  563. : "memory", "cc",
  564. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  565. );
  566. }
  567. #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
  568. void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
  569. const uint32 dither4, int width) {
  570. asm volatile (
  571. "vbroadcastss %3,%%xmm6 \n"
  572. "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
  573. "vpermq $0xd8,%%ymm6,%%ymm6 \n"
  574. "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
  575. "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
  576. "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
  577. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  578. "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
  579. "vpslld $0x5,%%ymm4,%%ymm4 \n"
  580. "vpslld $0xb,%%ymm3,%%ymm5 \n"
  581. LABELALIGN
  582. "1: \n"
  583. "vmovdqu (%0),%%ymm0 \n"
  584. "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
  585. "vpsrld $0x5,%%ymm0,%%ymm2 \n"
  586. "vpsrld $0x3,%%ymm0,%%ymm1 \n"
  587. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  588. "vpand %%ymm4,%%ymm2,%%ymm2 \n"
  589. "vpand %%ymm3,%%ymm1,%%ymm1 \n"
  590. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  591. "vpor %%ymm2,%%ymm1,%%ymm1 \n"
  592. "vpor %%ymm1,%%ymm0,%%ymm0 \n"
  593. "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
  594. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  595. "lea 0x20(%0),%0 \n"
  596. "vmovdqu %%xmm0,(%1) \n"
  597. "lea 0x10(%1),%1 \n"
  598. "sub $0x8,%2 \n"
  599. "jg 1b \n"
  600. "vzeroupper \n"
  601. : "+r"(src), // %0
  602. "+r"(dst), // %1
  603. "+r"(width) // %2
  604. : "m"(dither4) // %3
  605. : "memory", "cc",
  606. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  607. );
  608. }
  609. #endif // HAS_ARGBTORGB565DITHERROW_AVX2
  610. void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  611. asm volatile (
  612. "pcmpeqb %%xmm4,%%xmm4 \n"
  613. "psrld $0x1b,%%xmm4 \n"
  614. "movdqa %%xmm4,%%xmm5 \n"
  615. "pslld $0x5,%%xmm5 \n"
  616. "movdqa %%xmm4,%%xmm6 \n"
  617. "pslld $0xa,%%xmm6 \n"
  618. "pcmpeqb %%xmm7,%%xmm7 \n"
  619. "pslld $0xf,%%xmm7 \n"
  620. LABELALIGN
  621. "1: \n"
  622. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  623. "movdqa %%xmm0,%%xmm1 \n"
  624. "movdqa %%xmm0,%%xmm2 \n"
  625. "movdqa %%xmm0,%%xmm3 \n"
  626. "psrad $0x10,%%xmm0 \n"
  627. "psrld $0x3,%%xmm1 \n"
  628. "psrld $0x6,%%xmm2 \n"
  629. "psrld $0x9,%%xmm3 \n"
  630. "pand %%xmm7,%%xmm0 \n"
  631. "pand %%xmm4,%%xmm1 \n"
  632. "pand %%xmm5,%%xmm2 \n"
  633. "pand %%xmm6,%%xmm3 \n"
  634. "por %%xmm1,%%xmm0 \n"
  635. "por %%xmm3,%%xmm2 \n"
  636. "por %%xmm2,%%xmm0 \n"
  637. "packssdw %%xmm0,%%xmm0 \n"
  638. "lea " MEMLEA(0x10,0) ",%0 \n"
  639. "movq %%xmm0," MEMACCESS(1) " \n"
  640. "lea " MEMLEA(0x8,1) ",%1 \n"
  641. "sub $0x4,%2 \n"
  642. "jg 1b \n"
  643. : "+r"(src), // %0
  644. "+r"(dst), // %1
  645. "+r"(width) // %2
  646. :: "memory", "cc",
  647. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  648. );
  649. }
  650. void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  651. asm volatile (
  652. "pcmpeqb %%xmm4,%%xmm4 \n"
  653. "psllw $0xc,%%xmm4 \n"
  654. "movdqa %%xmm4,%%xmm3 \n"
  655. "psrlw $0x8,%%xmm3 \n"
  656. LABELALIGN
  657. "1: \n"
  658. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  659. "movdqa %%xmm0,%%xmm1 \n"
  660. "pand %%xmm3,%%xmm0 \n"
  661. "pand %%xmm4,%%xmm1 \n"
  662. "psrlq $0x4,%%xmm0 \n"
  663. "psrlq $0x8,%%xmm1 \n"
  664. "por %%xmm1,%%xmm0 \n"
  665. "packuswb %%xmm0,%%xmm0 \n"
  666. "lea " MEMLEA(0x10,0) ",%0 \n"
  667. "movq %%xmm0," MEMACCESS(1) " \n"
  668. "lea " MEMLEA(0x8,1) ",%1 \n"
  669. "sub $0x4,%2 \n"
  670. "jg 1b \n"
  671. : "+r"(src), // %0
  672. "+r"(dst), // %1
  673. "+r"(width) // %2
  674. :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  675. );
  676. }
  677. #endif // HAS_RGB24TOARGBROW_SSSE3
  678. #ifdef HAS_ARGBTOYROW_SSSE3
  679. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  680. void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  681. asm volatile (
  682. "movdqa %3,%%xmm4 \n"
  683. "movdqa %4,%%xmm5 \n"
  684. LABELALIGN
  685. "1: \n"
  686. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  687. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  688. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  689. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  690. "pmaddubsw %%xmm4,%%xmm0 \n"
  691. "pmaddubsw %%xmm4,%%xmm1 \n"
  692. "pmaddubsw %%xmm4,%%xmm2 \n"
  693. "pmaddubsw %%xmm4,%%xmm3 \n"
  694. "lea " MEMLEA(0x40,0) ",%0 \n"
  695. "phaddw %%xmm1,%%xmm0 \n"
  696. "phaddw %%xmm3,%%xmm2 \n"
  697. "psrlw $0x7,%%xmm0 \n"
  698. "psrlw $0x7,%%xmm2 \n"
  699. "packuswb %%xmm2,%%xmm0 \n"
  700. "paddb %%xmm5,%%xmm0 \n"
  701. "movdqu %%xmm0," MEMACCESS(1) " \n"
  702. "lea " MEMLEA(0x10,1) ",%1 \n"
  703. "sub $0x10,%2 \n"
  704. "jg 1b \n"
  705. : "+r"(src_argb), // %0
  706. "+r"(dst_y), // %1
  707. "+r"(width) // %2
  708. : "m"(kARGBToY), // %3
  709. "m"(kAddY16) // %4
  710. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  711. );
  712. }
  713. #endif // HAS_ARGBTOYROW_SSSE3
  714. #ifdef HAS_ARGBTOYJROW_SSSE3
  715. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
  716. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
  717. void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  718. asm volatile (
  719. "movdqa %3,%%xmm4 \n"
  720. "movdqa %4,%%xmm5 \n"
  721. LABELALIGN
  722. "1: \n"
  723. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  724. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  725. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  726. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  727. "pmaddubsw %%xmm4,%%xmm0 \n"
  728. "pmaddubsw %%xmm4,%%xmm1 \n"
  729. "pmaddubsw %%xmm4,%%xmm2 \n"
  730. "pmaddubsw %%xmm4,%%xmm3 \n"
  731. "lea " MEMLEA(0x40,0) ",%0 \n"
  732. "phaddw %%xmm1,%%xmm0 \n"
  733. "phaddw %%xmm3,%%xmm2 \n"
  734. "paddw %%xmm5,%%xmm0 \n"
  735. "paddw %%xmm5,%%xmm2 \n"
  736. "psrlw $0x7,%%xmm0 \n"
  737. "psrlw $0x7,%%xmm2 \n"
  738. "packuswb %%xmm2,%%xmm0 \n"
  739. "movdqu %%xmm0," MEMACCESS(1) " \n"
  740. "lea " MEMLEA(0x10,1) ",%1 \n"
  741. "sub $0x10,%2 \n"
  742. "jg 1b \n"
  743. : "+r"(src_argb), // %0
  744. "+r"(dst_y), // %1
  745. "+r"(width) // %2
  746. : "m"(kARGBToYJ), // %3
  747. "m"(kAddYJ64) // %4
  748. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  749. );
  750. }
  751. #endif // HAS_ARGBTOYJROW_SSSE3
  752. #ifdef HAS_ARGBTOYROW_AVX2
  753. // vpermd for vphaddw + vpackuswb vpermd.
  754. static const lvec32 kPermdARGBToY_AVX = {
  755. 0, 4, 1, 5, 2, 6, 3, 7
  756. };
  757. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  758. void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  759. asm volatile (
  760. "vbroadcastf128 %3,%%ymm4 \n"
  761. "vbroadcastf128 %4,%%ymm5 \n"
  762. "vmovdqu %5,%%ymm6 \n"
  763. LABELALIGN
  764. "1: \n"
  765. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  766. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  767. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  768. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  769. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  770. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  771. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  772. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  773. "lea " MEMLEA(0x80,0) ",%0 \n"
  774. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  775. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  776. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  777. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  778. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  779. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  780. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
  781. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  782. "lea " MEMLEA(0x20,1) ",%1 \n"
  783. "sub $0x20,%2 \n"
  784. "jg 1b \n"
  785. "vzeroupper \n"
  786. : "+r"(src_argb), // %0
  787. "+r"(dst_y), // %1
  788. "+r"(width) // %2
  789. : "m"(kARGBToY), // %3
  790. "m"(kAddY16), // %4
  791. "m"(kPermdARGBToY_AVX) // %5
  792. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  793. );
  794. }
  795. #endif // HAS_ARGBTOYROW_AVX2
  796. #ifdef HAS_ARGBTOYJROW_AVX2
  797. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  798. void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  799. asm volatile (
  800. "vbroadcastf128 %3,%%ymm4 \n"
  801. "vbroadcastf128 %4,%%ymm5 \n"
  802. "vmovdqu %5,%%ymm6 \n"
  803. LABELALIGN
  804. "1: \n"
  805. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  806. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  807. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  808. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  809. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
  810. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  811. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  812. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  813. "lea " MEMLEA(0x80,0) ",%0 \n"
  814. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
  815. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
  816. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
  817. "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
  818. "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
  819. "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
  820. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  821. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
  822. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  823. "lea " MEMLEA(0x20,1) ",%1 \n"
  824. "sub $0x20,%2 \n"
  825. "jg 1b \n"
  826. "vzeroupper \n"
  827. : "+r"(src_argb), // %0
  828. "+r"(dst_y), // %1
  829. "+r"(width) // %2
  830. : "m"(kARGBToYJ), // %3
  831. "m"(kAddYJ64), // %4
  832. "m"(kPermdARGBToY_AVX) // %5
  833. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  834. );
  835. }
  836. #endif // HAS_ARGBTOYJROW_AVX2
  837. #ifdef HAS_ARGBTOUVROW_SSSE3
  838. void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  839. uint8* dst_u, uint8* dst_v, int width) {
  840. asm volatile (
  841. "movdqa %5,%%xmm3 \n"
  842. "movdqa %6,%%xmm4 \n"
  843. "movdqa %7,%%xmm5 \n"
  844. "sub %1,%2 \n"
  845. LABELALIGN
  846. "1: \n"
  847. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  848. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  849. "pavgb %%xmm7,%%xmm0 \n"
  850. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  851. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  852. "pavgb %%xmm7,%%xmm1 \n"
  853. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  854. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  855. "pavgb %%xmm7,%%xmm2 \n"
  856. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  857. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  858. "pavgb %%xmm7,%%xmm6 \n"
  859. "lea " MEMLEA(0x40,0) ",%0 \n"
  860. "movdqa %%xmm0,%%xmm7 \n"
  861. "shufps $0x88,%%xmm1,%%xmm0 \n"
  862. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  863. "pavgb %%xmm7,%%xmm0 \n"
  864. "movdqa %%xmm2,%%xmm7 \n"
  865. "shufps $0x88,%%xmm6,%%xmm2 \n"
  866. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  867. "pavgb %%xmm7,%%xmm2 \n"
  868. "movdqa %%xmm0,%%xmm1 \n"
  869. "movdqa %%xmm2,%%xmm6 \n"
  870. "pmaddubsw %%xmm4,%%xmm0 \n"
  871. "pmaddubsw %%xmm4,%%xmm2 \n"
  872. "pmaddubsw %%xmm3,%%xmm1 \n"
  873. "pmaddubsw %%xmm3,%%xmm6 \n"
  874. "phaddw %%xmm2,%%xmm0 \n"
  875. "phaddw %%xmm6,%%xmm1 \n"
  876. "psraw $0x8,%%xmm0 \n"
  877. "psraw $0x8,%%xmm1 \n"
  878. "packsswb %%xmm1,%%xmm0 \n"
  879. "paddb %%xmm5,%%xmm0 \n"
  880. "movlps %%xmm0," MEMACCESS(1) " \n"
  881. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  882. "lea " MEMLEA(0x8,1) ",%1 \n"
  883. "sub $0x10,%3 \n"
  884. "jg 1b \n"
  885. : "+r"(src_argb0), // %0
  886. "+r"(dst_u), // %1
  887. "+r"(dst_v), // %2
  888. "+rm"(width) // %3
  889. : "r"((intptr_t)(src_stride_argb)), // %4
  890. "m"(kARGBToV), // %5
  891. "m"(kARGBToU), // %6
  892. "m"(kAddUV128) // %7
  893. : "memory", "cc", NACL_R14
  894. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  895. );
  896. }
  897. #endif // HAS_ARGBTOUVROW_SSSE3
  898. #ifdef HAS_ARGBTOUVROW_AVX2
  899. // vpshufb for vphaddw + vpackuswb packed to shorts.
  900. static const lvec8 kShufARGBToUV_AVX = {
  901. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  902. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
  903. };
  904. void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  905. uint8* dst_u, uint8* dst_v, int width) {
  906. asm volatile (
  907. "vbroadcastf128 %5,%%ymm5 \n"
  908. "vbroadcastf128 %6,%%ymm6 \n"
  909. "vbroadcastf128 %7,%%ymm7 \n"
  910. "sub %1,%2 \n"
  911. LABELALIGN
  912. "1: \n"
  913. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  914. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  915. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  916. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  917. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  918. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  919. VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
  920. VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
  921. "lea " MEMLEA(0x80,0) ",%0 \n"
  922. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  923. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  924. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  925. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  926. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  927. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  928. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  929. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  930. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  931. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  932. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  933. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  934. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  935. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  936. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  937. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  938. "vpshufb %8,%%ymm0,%%ymm0 \n"
  939. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  940. "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
  941. VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
  942. "lea " MEMLEA(0x10,1) ",%1 \n"
  943. "sub $0x20,%3 \n"
  944. "jg 1b \n"
  945. "vzeroupper \n"
  946. : "+r"(src_argb0), // %0
  947. "+r"(dst_u), // %1
  948. "+r"(dst_v), // %2
  949. "+rm"(width) // %3
  950. : "r"((intptr_t)(src_stride_argb)), // %4
  951. "m"(kAddUV128), // %5
  952. "m"(kARGBToV), // %6
  953. "m"(kARGBToU), // %7
  954. "m"(kShufARGBToUV_AVX) // %8
  955. : "memory", "cc", NACL_R14
  956. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  957. );
  958. }
  959. #endif // HAS_ARGBTOUVROW_AVX2
  960. #ifdef HAS_ARGBTOUVJROW_AVX2
  961. void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
  962. uint8* dst_u, uint8* dst_v, int width) {
  963. asm volatile (
  964. "vbroadcastf128 %5,%%ymm5 \n"
  965. "vbroadcastf128 %6,%%ymm6 \n"
  966. "vbroadcastf128 %7,%%ymm7 \n"
  967. "sub %1,%2 \n"
  968. LABELALIGN
  969. "1: \n"
  970. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  971. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  972. "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
  973. "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
  974. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  975. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  976. VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
  977. VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
  978. "lea " MEMLEA(0x80,0) ",%0 \n"
  979. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  980. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  981. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  982. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  983. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  984. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  985. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  986. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  987. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  988. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  989. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  990. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  991. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
  992. "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
  993. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  994. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  995. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  996. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  997. "vpshufb %8,%%ymm0,%%ymm0 \n"
  998. "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
  999. VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
  1000. "lea " MEMLEA(0x10,1) ",%1 \n"
  1001. "sub $0x20,%3 \n"
  1002. "jg 1b \n"
  1003. "vzeroupper \n"
  1004. : "+r"(src_argb0), // %0
  1005. "+r"(dst_u), // %1
  1006. "+r"(dst_v), // %2
  1007. "+rm"(width) // %3
  1008. : "r"((intptr_t)(src_stride_argb)), // %4
  1009. "m"(kAddUVJ128), // %5
  1010. "m"(kARGBToVJ), // %6
  1011. "m"(kARGBToUJ), // %7
  1012. "m"(kShufARGBToUV_AVX) // %8
  1013. : "memory", "cc", NACL_R14
  1014. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  1015. );
  1016. }
  1017. #endif // HAS_ARGBTOUVJROW_AVX2
  1018. #ifdef HAS_ARGBTOUVJROW_SSSE3
  1019. void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  1020. uint8* dst_u, uint8* dst_v, int width) {
  1021. asm volatile (
  1022. "movdqa %5,%%xmm3 \n"
  1023. "movdqa %6,%%xmm4 \n"
  1024. "movdqa %7,%%xmm5 \n"
  1025. "sub %1,%2 \n"
  1026. LABELALIGN
  1027. "1: \n"
  1028. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1029. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1030. "pavgb %%xmm7,%%xmm0 \n"
  1031. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1032. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1033. "pavgb %%xmm7,%%xmm1 \n"
  1034. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1035. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1036. "pavgb %%xmm7,%%xmm2 \n"
  1037. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1038. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1039. "pavgb %%xmm7,%%xmm6 \n"
  1040. "lea " MEMLEA(0x40,0) ",%0 \n"
  1041. "movdqa %%xmm0,%%xmm7 \n"
  1042. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1043. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1044. "pavgb %%xmm7,%%xmm0 \n"
  1045. "movdqa %%xmm2,%%xmm7 \n"
  1046. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1047. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1048. "pavgb %%xmm7,%%xmm2 \n"
  1049. "movdqa %%xmm0,%%xmm1 \n"
  1050. "movdqa %%xmm2,%%xmm6 \n"
  1051. "pmaddubsw %%xmm4,%%xmm0 \n"
  1052. "pmaddubsw %%xmm4,%%xmm2 \n"
  1053. "pmaddubsw %%xmm3,%%xmm1 \n"
  1054. "pmaddubsw %%xmm3,%%xmm6 \n"
  1055. "phaddw %%xmm2,%%xmm0 \n"
  1056. "phaddw %%xmm6,%%xmm1 \n"
  1057. "paddw %%xmm5,%%xmm0 \n"
  1058. "paddw %%xmm5,%%xmm1 \n"
  1059. "psraw $0x8,%%xmm0 \n"
  1060. "psraw $0x8,%%xmm1 \n"
  1061. "packsswb %%xmm1,%%xmm0 \n"
  1062. "movlps %%xmm0," MEMACCESS(1) " \n"
  1063. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1064. "lea " MEMLEA(0x8,1) ",%1 \n"
  1065. "sub $0x10,%3 \n"
  1066. "jg 1b \n"
  1067. : "+r"(src_argb0), // %0
  1068. "+r"(dst_u), // %1
  1069. "+r"(dst_v), // %2
  1070. "+rm"(width) // %3
  1071. : "r"((intptr_t)(src_stride_argb)), // %4
  1072. "m"(kARGBToVJ), // %5
  1073. "m"(kARGBToUJ), // %6
  1074. "m"(kAddUVJ128) // %7
  1075. : "memory", "cc", NACL_R14
  1076. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1077. );
  1078. }
  1079. #endif // HAS_ARGBTOUVJROW_SSSE3
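// A minimal scalar model of the subsampled ARGBToUV*Row kernels above, kept
// for reference only (not compiled).  It assumes the kARGBToU*/kARGBToV* rows
// hold signed {B, G, R, 0} coefficients as consumed by pmaddubsw, and it
// simplifies the SIMD rounding (pavgb, and the pre-shift bias of the J
// variant); the hypothetical ucoeff/vcoeff/bias parameters stand in for those
// constants.
#if 0
static void ARGBToUVRow_ScalarSketch(const uint8* src_argb0,
                                     int src_stride_argb,
                                     uint8* dst_u, uint8* dst_v, int width,
                                     const int8* ucoeff, const int8* vcoeff,
                                     int bias) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  int x;
  for (x = 0; x < width; x += 2) {
    // Average the 2x2 block of pixels feeding each chroma sample.
    int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
    int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
    int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
    // Weighted sum, scaled down by 256 as psraw $8 does, then biased.
    *dst_u++ = (uint8)(((b * ucoeff[0] + g * ucoeff[1] + r * ucoeff[2]) >> 8) + bias);
    *dst_v++ = (uint8)(((b * vcoeff[0] + g * vcoeff[1] + r * vcoeff[2]) >> 8) + bias);
    src_argb0 += 8;
    src_argb1 += 8;
  }
}
#endif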
  1080. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1081. void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
  1082. int width) {
  1083. asm volatile (
  1084. "movdqa %4,%%xmm3 \n"
  1085. "movdqa %5,%%xmm4 \n"
  1086. "movdqa %6,%%xmm5 \n"
  1087. "sub %1,%2 \n"
  1088. LABELALIGN
  1089. "1: \n"
  1090. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1091. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1092. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1093. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1094. "pmaddubsw %%xmm4,%%xmm0 \n"
  1095. "pmaddubsw %%xmm4,%%xmm1 \n"
  1096. "pmaddubsw %%xmm4,%%xmm2 \n"
  1097. "pmaddubsw %%xmm4,%%xmm6 \n"
  1098. "phaddw %%xmm1,%%xmm0 \n"
  1099. "phaddw %%xmm6,%%xmm2 \n"
  1100. "psraw $0x8,%%xmm0 \n"
  1101. "psraw $0x8,%%xmm2 \n"
  1102. "packsswb %%xmm2,%%xmm0 \n"
  1103. "paddb %%xmm5,%%xmm0 \n"
  1104. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1105. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1106. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1107. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1108. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1109. "pmaddubsw %%xmm3,%%xmm0 \n"
  1110. "pmaddubsw %%xmm3,%%xmm1 \n"
  1111. "pmaddubsw %%xmm3,%%xmm2 \n"
  1112. "pmaddubsw %%xmm3,%%xmm6 \n"
  1113. "phaddw %%xmm1,%%xmm0 \n"
  1114. "phaddw %%xmm6,%%xmm2 \n"
  1115. "psraw $0x8,%%xmm0 \n"
  1116. "psraw $0x8,%%xmm2 \n"
  1117. "packsswb %%xmm2,%%xmm0 \n"
  1118. "paddb %%xmm5,%%xmm0 \n"
  1119. "lea " MEMLEA(0x40,0) ",%0 \n"
  1120. MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
  1121. "lea " MEMLEA(0x10,1) ",%1 \n"
  1122. "sub $0x10,%3 \n"
  1123. "jg 1b \n"
  1124. : "+r"(src_argb), // %0
  1125. "+r"(dst_u), // %1
  1126. "+r"(dst_v), // %2
  1127. "+rm"(width) // %3
  1128. : "m"(kARGBToV), // %4
  1129. "m"(kARGBToU), // %5
  1130. "m"(kAddUV128) // %6
  1131. : "memory", "cc", NACL_R14
  1132. "xmm0", "xmm1", "xmm2", "xmm6"
  1133. );
  1134. }
  1135. #endif // HAS_ARGBTOUV444ROW_SSSE3
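// Unlike the subsampled kernels above, ARGBToUV444Row_SSSE3 does no 2x2
// averaging: every pixel yields one U and one V, so the same 16 source pixels
// are read twice, once against the U coefficient row (written to dst_u) and
// once against the V coefficient row (written to dst_v).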
  1136. void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  1137. asm volatile (
  1138. "movdqa %4,%%xmm5 \n"
  1139. "movdqa %3,%%xmm4 \n"
  1140. LABELALIGN
  1141. "1: \n"
  1142. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1143. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1144. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1145. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1146. "pmaddubsw %%xmm4,%%xmm0 \n"
  1147. "pmaddubsw %%xmm4,%%xmm1 \n"
  1148. "pmaddubsw %%xmm4,%%xmm2 \n"
  1149. "pmaddubsw %%xmm4,%%xmm3 \n"
  1150. "lea " MEMLEA(0x40,0) ",%0 \n"
  1151. "phaddw %%xmm1,%%xmm0 \n"
  1152. "phaddw %%xmm3,%%xmm2 \n"
  1153. "psrlw $0x7,%%xmm0 \n"
  1154. "psrlw $0x7,%%xmm2 \n"
  1155. "packuswb %%xmm2,%%xmm0 \n"
  1156. "paddb %%xmm5,%%xmm0 \n"
  1157. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1158. "lea " MEMLEA(0x10,1) ",%1 \n"
  1159. "sub $0x10,%2 \n"
  1160. "jg 1b \n"
  1161. : "+r"(src_bgra), // %0
  1162. "+r"(dst_y), // %1
  1163. "+r"(width) // %2
  1164. : "m"(kBGRAToY), // %3
  1165. "m"(kAddY16) // %4
  1166. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1167. );
  1168. }
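// The *ToYRow_SSSE3 kernels in this file all follow the pattern seen above:
// pmaddubsw with a per-format coefficient row, phaddw to finish the 4-term dot
// product, a shift by 7, and the kAddY16 bias.  A scalar model for reference
// only (not compiled); the exact coefficient rows (kBGRAToY, kABGRToY,
// kRGBAToY, ...) are defined elsewhere in libyuv, and the coeff parameter here
// stands in for whichever of them is used.
#if 0
static void ToYRow_ScalarSketch(const uint8* src, uint8* dst_y, int width,
                                const int8* coeff) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = src[0] * coeff[0] + src[1] * coeff[1] +
            src[2] * coeff[2] + src[3] * coeff[3];
    *dst_y++ = (uint8)((y >> 7) + 16);  // psrlw $7, then the kAddY16 bias.
    src += 4;
  }
}
#endif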
  1169. void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
  1170. uint8* dst_u, uint8* dst_v, int width) {
  1171. asm volatile (
  1172. "movdqa %5,%%xmm3 \n"
  1173. "movdqa %6,%%xmm4 \n"
  1174. "movdqa %7,%%xmm5 \n"
  1175. "sub %1,%2 \n"
  1176. LABELALIGN
  1177. "1: \n"
  1178. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1179. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1180. "pavgb %%xmm7,%%xmm0 \n"
  1181. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1182. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1183. "pavgb %%xmm7,%%xmm1 \n"
  1184. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1185. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1186. "pavgb %%xmm7,%%xmm2 \n"
  1187. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1188. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1189. "pavgb %%xmm7,%%xmm6 \n"
  1190. "lea " MEMLEA(0x40,0) ",%0 \n"
  1191. "movdqa %%xmm0,%%xmm7 \n"
  1192. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1193. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1194. "pavgb %%xmm7,%%xmm0 \n"
  1195. "movdqa %%xmm2,%%xmm7 \n"
  1196. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1197. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1198. "pavgb %%xmm7,%%xmm2 \n"
  1199. "movdqa %%xmm0,%%xmm1 \n"
  1200. "movdqa %%xmm2,%%xmm6 \n"
  1201. "pmaddubsw %%xmm4,%%xmm0 \n"
  1202. "pmaddubsw %%xmm4,%%xmm2 \n"
  1203. "pmaddubsw %%xmm3,%%xmm1 \n"
  1204. "pmaddubsw %%xmm3,%%xmm6 \n"
  1205. "phaddw %%xmm2,%%xmm0 \n"
  1206. "phaddw %%xmm6,%%xmm1 \n"
  1207. "psraw $0x8,%%xmm0 \n"
  1208. "psraw $0x8,%%xmm1 \n"
  1209. "packsswb %%xmm1,%%xmm0 \n"
  1210. "paddb %%xmm5,%%xmm0 \n"
  1211. "movlps %%xmm0," MEMACCESS(1) " \n"
  1212. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1213. "lea " MEMLEA(0x8,1) ",%1 \n"
  1214. "sub $0x10,%3 \n"
  1215. "jg 1b \n"
  1216. : "+r"(src_bgra0), // %0
  1217. "+r"(dst_u), // %1
  1218. "+r"(dst_v), // %2
  1219. "+rm"(width) // %3
  1220. : "r"((intptr_t)(src_stride_bgra)), // %4
  1221. "m"(kBGRAToV), // %5
  1222. "m"(kBGRAToU), // %6
  1223. "m"(kAddUV128) // %7
  1224. : "memory", "cc", NACL_R14
  1225. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1226. );
  1227. }
  1228. void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  1229. asm volatile (
  1230. "movdqa %4,%%xmm5 \n"
  1231. "movdqa %3,%%xmm4 \n"
  1232. LABELALIGN
  1233. "1: \n"
  1234. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1235. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1236. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1237. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1238. "pmaddubsw %%xmm4,%%xmm0 \n"
  1239. "pmaddubsw %%xmm4,%%xmm1 \n"
  1240. "pmaddubsw %%xmm4,%%xmm2 \n"
  1241. "pmaddubsw %%xmm4,%%xmm3 \n"
  1242. "lea " MEMLEA(0x40,0) ",%0 \n"
  1243. "phaddw %%xmm1,%%xmm0 \n"
  1244. "phaddw %%xmm3,%%xmm2 \n"
  1245. "psrlw $0x7,%%xmm0 \n"
  1246. "psrlw $0x7,%%xmm2 \n"
  1247. "packuswb %%xmm2,%%xmm0 \n"
  1248. "paddb %%xmm5,%%xmm0 \n"
  1249. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1250. "lea " MEMLEA(0x10,1) ",%1 \n"
  1251. "sub $0x10,%2 \n"
  1252. "jg 1b \n"
  1253. : "+r"(src_abgr), // %0
  1254. "+r"(dst_y), // %1
  1255. "+r"(width) // %2
  1256. : "m"(kABGRToY), // %3
  1257. "m"(kAddY16) // %4
  1258. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1259. );
  1260. }
  1261. void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  1262. asm volatile (
  1263. "movdqa %4,%%xmm5 \n"
  1264. "movdqa %3,%%xmm4 \n"
  1265. LABELALIGN
  1266. "1: \n"
  1267. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1268. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1269. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1270. "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
  1271. "pmaddubsw %%xmm4,%%xmm0 \n"
  1272. "pmaddubsw %%xmm4,%%xmm1 \n"
  1273. "pmaddubsw %%xmm4,%%xmm2 \n"
  1274. "pmaddubsw %%xmm4,%%xmm3 \n"
  1275. "lea " MEMLEA(0x40,0) ",%0 \n"
  1276. "phaddw %%xmm1,%%xmm0 \n"
  1277. "phaddw %%xmm3,%%xmm2 \n"
  1278. "psrlw $0x7,%%xmm0 \n"
  1279. "psrlw $0x7,%%xmm2 \n"
  1280. "packuswb %%xmm2,%%xmm0 \n"
  1281. "paddb %%xmm5,%%xmm0 \n"
  1282. "movdqu %%xmm0," MEMACCESS(1) " \n"
  1283. "lea " MEMLEA(0x10,1) ",%1 \n"
  1284. "sub $0x10,%2 \n"
  1285. "jg 1b \n"
  1286. : "+r"(src_rgba), // %0
  1287. "+r"(dst_y), // %1
  1288. "+r"(width) // %2
  1289. : "m"(kRGBAToY), // %3
  1290. "m"(kAddY16) // %4
  1291. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1292. );
  1293. }
  1294. void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
  1295. uint8* dst_u, uint8* dst_v, int width) {
  1296. asm volatile (
  1297. "movdqa %5,%%xmm3 \n"
  1298. "movdqa %6,%%xmm4 \n"
  1299. "movdqa %7,%%xmm5 \n"
  1300. "sub %1,%2 \n"
  1301. LABELALIGN
  1302. "1: \n"
  1303. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1304. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1305. "pavgb %%xmm7,%%xmm0 \n"
  1306. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1307. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1308. "pavgb %%xmm7,%%xmm1 \n"
  1309. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1310. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1311. "pavgb %%xmm7,%%xmm2 \n"
  1312. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1313. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1314. "pavgb %%xmm7,%%xmm6 \n"
  1315. "lea " MEMLEA(0x40,0) ",%0 \n"
  1316. "movdqa %%xmm0,%%xmm7 \n"
  1317. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1318. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1319. "pavgb %%xmm7,%%xmm0 \n"
  1320. "movdqa %%xmm2,%%xmm7 \n"
  1321. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1322. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1323. "pavgb %%xmm7,%%xmm2 \n"
  1324. "movdqa %%xmm0,%%xmm1 \n"
  1325. "movdqa %%xmm2,%%xmm6 \n"
  1326. "pmaddubsw %%xmm4,%%xmm0 \n"
  1327. "pmaddubsw %%xmm4,%%xmm2 \n"
  1328. "pmaddubsw %%xmm3,%%xmm1 \n"
  1329. "pmaddubsw %%xmm3,%%xmm6 \n"
  1330. "phaddw %%xmm2,%%xmm0 \n"
  1331. "phaddw %%xmm6,%%xmm1 \n"
  1332. "psraw $0x8,%%xmm0 \n"
  1333. "psraw $0x8,%%xmm1 \n"
  1334. "packsswb %%xmm1,%%xmm0 \n"
  1335. "paddb %%xmm5,%%xmm0 \n"
  1336. "movlps %%xmm0," MEMACCESS(1) " \n"
  1337. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1338. "lea " MEMLEA(0x8,1) ",%1 \n"
  1339. "sub $0x10,%3 \n"
  1340. "jg 1b \n"
  1341. : "+r"(src_abgr0), // %0
  1342. "+r"(dst_u), // %1
  1343. "+r"(dst_v), // %2
  1344. "+rm"(width) // %3
  1345. : "r"((intptr_t)(src_stride_abgr)), // %4
  1346. "m"(kABGRToV), // %5
  1347. "m"(kABGRToU), // %6
  1348. "m"(kAddUV128) // %7
  1349. : "memory", "cc", NACL_R14
  1350. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1351. );
  1352. }
  1353. void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
  1354. uint8* dst_u, uint8* dst_v, int width) {
  1355. asm volatile (
  1356. "movdqa %5,%%xmm3 \n"
  1357. "movdqa %6,%%xmm4 \n"
  1358. "movdqa %7,%%xmm5 \n"
  1359. "sub %1,%2 \n"
  1360. LABELALIGN
  1361. "1: \n"
  1362. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  1363. MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
  1364. "pavgb %%xmm7,%%xmm0 \n"
  1365. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  1366. MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
  1367. "pavgb %%xmm7,%%xmm1 \n"
  1368. "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
  1369. MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
  1370. "pavgb %%xmm7,%%xmm2 \n"
  1371. "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
  1372. MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
  1373. "pavgb %%xmm7,%%xmm6 \n"
  1374. "lea " MEMLEA(0x40,0) ",%0 \n"
  1375. "movdqa %%xmm0,%%xmm7 \n"
  1376. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1377. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1378. "pavgb %%xmm7,%%xmm0 \n"
  1379. "movdqa %%xmm2,%%xmm7 \n"
  1380. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1381. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1382. "pavgb %%xmm7,%%xmm2 \n"
  1383. "movdqa %%xmm0,%%xmm1 \n"
  1384. "movdqa %%xmm2,%%xmm6 \n"
  1385. "pmaddubsw %%xmm4,%%xmm0 \n"
  1386. "pmaddubsw %%xmm4,%%xmm2 \n"
  1387. "pmaddubsw %%xmm3,%%xmm1 \n"
  1388. "pmaddubsw %%xmm3,%%xmm6 \n"
  1389. "phaddw %%xmm2,%%xmm0 \n"
  1390. "phaddw %%xmm6,%%xmm1 \n"
  1391. "psraw $0x8,%%xmm0 \n"
  1392. "psraw $0x8,%%xmm1 \n"
  1393. "packsswb %%xmm1,%%xmm0 \n"
  1394. "paddb %%xmm5,%%xmm0 \n"
  1395. "movlps %%xmm0," MEMACCESS(1) " \n"
  1396. MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
  1397. "lea " MEMLEA(0x8,1) ",%1 \n"
  1398. "sub $0x10,%3 \n"
  1399. "jg 1b \n"
  1400. : "+r"(src_rgba0), // %0
  1401. "+r"(dst_u), // %1
  1402. "+r"(dst_v), // %2
  1403. "+rm"(width) // %3
  1404. : "r"((intptr_t)(src_stride_rgba)), // %4
  1405. "m"(kRGBAToV), // %5
  1406. "m"(kRGBAToU), // %6
  1407. "m"(kAddUV128) // %7
  1408. : "memory", "cc", NACL_R14
  1409. "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  1410. );
  1411. }
  1412. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1413. // Read 8 UV from 444
  1414. #define READYUV444 \
  1415. "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1416. MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1417. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1418. "punpcklbw %%xmm1,%%xmm0 \n" \
  1419. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1420. "punpcklbw %%xmm4,%%xmm4 \n" \
  1421. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1422. // Read 4 UV from 422, upsample to 8 UV
  1423. #define READYUV422 \
  1424. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1425. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1426. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1427. "punpcklbw %%xmm1,%%xmm0 \n" \
  1428. "punpcklwd %%xmm0,%%xmm0 \n" \
  1429. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1430. "punpcklbw %%xmm4,%%xmm4 \n" \
  1431. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1432. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1433. #define READYUVA422 \
  1434. "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1435. MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1436. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1437. "punpcklbw %%xmm1,%%xmm0 \n" \
  1438. "punpcklwd %%xmm0,%%xmm0 \n" \
  1439. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1440. "punpcklbw %%xmm4,%%xmm4 \n" \
  1441. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
  1442. "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1443. "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
// Read 2 UV from 411, upsample to 8 UV.
// A 4-byte movd load would be an msan violation (only 2 bytes are valid here):
//   "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
//   MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
// and pinsrw fails under drmemory:
//   __asm pinsrw xmm0, [esi], 0 /* U */
//   __asm pinsrw xmm1, [esi + edi], 0 /* V */
// so the movzwl/movd pair below is used instead.
  1451. #define READYUV411_TEMP \
  1452. "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
  1453. "movd %[temp],%%xmm0 \n" \
  1454. MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
  1455. "movd %[temp],%%xmm1 \n" \
  1456. "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
  1457. "punpcklbw %%xmm1,%%xmm0 \n" \
  1458. "punpcklwd %%xmm0,%%xmm0 \n" \
  1459. "punpckldq %%xmm0,%%xmm0 \n" \
  1460. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1461. "punpcklbw %%xmm4,%%xmm4 \n" \
  1462. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1463. // Read 4 UV from NV12, upsample to 8 UV
  1464. #define READNV12 \
  1465. "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1466. "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  1467. "punpcklwd %%xmm0,%%xmm0 \n" \
  1468. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1469. "punpcklbw %%xmm4,%%xmm4 \n" \
  1470. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1471. // Read 4 VU from NV21, upsample to 8 UV
  1472. #define READNV21 \
  1473. "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1474. "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  1475. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1476. "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1477. "punpcklbw %%xmm4,%%xmm4 \n" \
  1478. "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
  1479. // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
  1480. #define READYUY2 \
  1481. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  1482. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1483. "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
  1484. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1485. "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
  1486. // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
  1487. #define READUYVY \
  1488. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  1489. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1490. "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
  1491. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1492. "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
  1493. #if defined(__x86_64__)
  1494. #define YUVTORGB_SETUP(yuvconstants) \
  1495. "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
  1496. "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
  1497. "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
  1498. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
  1499. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
  1500. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
  1501. "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
  1502. // Convert 8 pixels: 8 UV and 8 Y
  1503. #define YUVTORGB(yuvconstants) \
  1504. "movdqa %%xmm0,%%xmm1 \n" \
  1505. "movdqa %%xmm0,%%xmm2 \n" \
  1506. "movdqa %%xmm0,%%xmm3 \n" \
  1507. "movdqa %%xmm11,%%xmm0 \n" \
  1508. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1509. "psubw %%xmm1,%%xmm0 \n" \
  1510. "movdqa %%xmm12,%%xmm1 \n" \
  1511. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1512. "psubw %%xmm2,%%xmm1 \n" \
  1513. "movdqa %%xmm13,%%xmm2 \n" \
  1514. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1515. "psubw %%xmm3,%%xmm2 \n" \
  1516. "pmulhuw %%xmm14,%%xmm4 \n" \
  1517. "paddsw %%xmm4,%%xmm0 \n" \
  1518. "paddsw %%xmm4,%%xmm1 \n" \
  1519. "paddsw %%xmm4,%%xmm2 \n" \
  1520. "psraw $0x6,%%xmm0 \n" \
  1521. "psraw $0x6,%%xmm1 \n" \
  1522. "psraw $0x6,%%xmm2 \n" \
  1523. "packuswb %%xmm0,%%xmm0 \n" \
  1524. "packuswb %%xmm1,%%xmm1 \n" \
  1525. "packuswb %%xmm2,%%xmm2 \n"
  1526. #define YUVTORGB_REGS \
  1527. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1528. #else
  1529. #define YUVTORGB_SETUP(yuvconstants)
  1530. // Convert 8 pixels: 8 UV and 8 Y
  1531. #define YUVTORGB(yuvconstants) \
  1532. "movdqa %%xmm0,%%xmm1 \n" \
  1533. "movdqa %%xmm0,%%xmm2 \n" \
  1534. "movdqa %%xmm0,%%xmm3 \n" \
  1535. "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
  1536. "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
  1537. "psubw %%xmm1,%%xmm0 \n" \
  1538. "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
  1539. "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
  1540. "psubw %%xmm2,%%xmm1 \n" \
  1541. "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
  1542. "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
  1543. "psubw %%xmm3,%%xmm2 \n" \
  1544. "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
  1545. "paddsw %%xmm4,%%xmm0 \n" \
  1546. "paddsw %%xmm4,%%xmm1 \n" \
  1547. "paddsw %%xmm4,%%xmm2 \n" \
  1548. "psraw $0x6,%%xmm0 \n" \
  1549. "psraw $0x6,%%xmm1 \n" \
  1550. "psraw $0x6,%%xmm2 \n" \
  1551. "packuswb %%xmm0,%%xmm0 \n" \
  1552. "packuswb %%xmm1,%%xmm1 \n" \
  1553. "packuswb %%xmm2,%%xmm2 \n"
  1554. #define YUVTORGB_REGS
  1555. #endif
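// Both YUVTORGB variants above compute the same fixed-point transform.
// Writing the three bias rows of YuvConstants (offsets 96/128/160) as
// BB/BG/BR, the three UV coefficient rows (offsets 0/32/64) as
// (UB,VB)/(UG,VG)/(UR,VR) and the Y scale row (offset 192) as YG (names chosen
// here only to mirror how the rows are paired in the code), each pixel is:
//   B = (((Y2 * YG) >> 16) + BB - (U * UB + V * VB)) >> 6
//   G = (((Y2 * YG) >> 16) + BG - (U * UG + V * VG)) >> 6
//   R = (((Y2 * YG) >> 16) + BR - (U * UR + V * VR)) >> 6
// where Y2 is the Y byte duplicated into a 16-bit lane by the READ* macros and
// each result is saturated to 8 bits by packuswb.  The only difference between
// the two paths is that x86_64 keeps the seven constant rows resident in
// xmm8-xmm14 instead of re-reading them from memory.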
  1556. // Store 8 ARGB values.
  1557. #define STOREARGB \
  1558. "punpcklbw %%xmm1,%%xmm0 \n" \
  1559. "punpcklbw %%xmm5,%%xmm2 \n" \
  1560. "movdqa %%xmm0,%%xmm1 \n" \
  1561. "punpcklwd %%xmm2,%%xmm0 \n" \
  1562. "punpckhwd %%xmm2,%%xmm1 \n" \
  1563. "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
  1564. "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
  1565. "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
  1566. // Store 8 RGBA values.
  1567. #define STORERGBA \
  1568. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1569. "punpcklbw %%xmm2,%%xmm1 \n" \
  1570. "punpcklbw %%xmm0,%%xmm5 \n" \
  1571. "movdqa %%xmm5,%%xmm0 \n" \
  1572. "punpcklwd %%xmm1,%%xmm5 \n" \
  1573. "punpckhwd %%xmm1,%%xmm0 \n" \
  1574. "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
  1575. "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
  1576. "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
  1577. void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
  1578. const uint8* u_buf,
  1579. const uint8* v_buf,
  1580. uint8* dst_argb,
  1581. const struct YuvConstants* yuvconstants,
  1582. int width) {
  1583. asm volatile (
  1584. YUVTORGB_SETUP(yuvconstants)
  1585. "sub %[u_buf],%[v_buf] \n"
  1586. "pcmpeqb %%xmm5,%%xmm5 \n"
  1587. LABELALIGN
  1588. "1: \n"
  1589. READYUV444
  1590. YUVTORGB(yuvconstants)
  1591. STOREARGB
  1592. "sub $0x8,%[width] \n"
  1593. "jg 1b \n"
  1594. : [y_buf]"+r"(y_buf), // %[y_buf]
  1595. [u_buf]"+r"(u_buf), // %[u_buf]
  1596. [v_buf]"+r"(v_buf), // %[v_buf]
  1597. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1598. [width]"+rm"(width) // %[width]
  1599. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1600. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1601. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1602. );
  1603. }
  1604. void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
  1605. const uint8* u_buf,
  1606. const uint8* v_buf,
  1607. uint8* dst_rgb24,
  1608. const struct YuvConstants* yuvconstants,
  1609. int width) {
  1610. asm volatile (
  1611. YUVTORGB_SETUP(yuvconstants)
  1612. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1613. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1614. "sub %[u_buf],%[v_buf] \n"
  1615. LABELALIGN
  1616. "1: \n"
  1617. READYUV422
  1618. YUVTORGB(yuvconstants)
  1619. "punpcklbw %%xmm1,%%xmm0 \n"
  1620. "punpcklbw %%xmm2,%%xmm2 \n"
  1621. "movdqa %%xmm0,%%xmm1 \n"
  1622. "punpcklwd %%xmm2,%%xmm0 \n"
  1623. "punpckhwd %%xmm2,%%xmm1 \n"
  1624. "pshufb %%xmm5,%%xmm0 \n"
  1625. "pshufb %%xmm6,%%xmm1 \n"
  1626. "palignr $0xc,%%xmm0,%%xmm1 \n"
  1627. "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
  1628. "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
  1629. "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
  1630. "subl $0x8,%[width] \n"
  1631. "jg 1b \n"
  1632. : [y_buf]"+r"(y_buf), // %[y_buf]
  1633. [u_buf]"+r"(u_buf), // %[u_buf]
  1634. [v_buf]"+r"(v_buf), // %[v_buf]
  1635. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  1636. #if defined(__i386__) && defined(__pic__)
  1637. [width]"+m"(width) // %[width]
  1638. #else
  1639. [width]"+rm"(width) // %[width]
  1640. #endif
  1641. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1642. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  1643. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  1644. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1645. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  1646. );
  1647. }
  1648. void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
  1649. const uint8* u_buf,
  1650. const uint8* v_buf,
  1651. uint8* dst_argb,
  1652. const struct YuvConstants* yuvconstants,
  1653. int width) {
  1654. asm volatile (
  1655. YUVTORGB_SETUP(yuvconstants)
  1656. "sub %[u_buf],%[v_buf] \n"
  1657. "pcmpeqb %%xmm5,%%xmm5 \n"
  1658. LABELALIGN
  1659. "1: \n"
  1660. READYUV422
  1661. YUVTORGB(yuvconstants)
  1662. STOREARGB
  1663. "sub $0x8,%[width] \n"
  1664. "jg 1b \n"
  1665. : [y_buf]"+r"(y_buf), // %[y_buf]
  1666. [u_buf]"+r"(u_buf), // %[u_buf]
  1667. [v_buf]"+r"(v_buf), // %[v_buf]
  1668. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1669. [width]"+rm"(width) // %[width]
  1670. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1671. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1672. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1673. );
  1674. }
  1675. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  1676. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
  1677. const uint8* u_buf,
  1678. const uint8* v_buf,
  1679. const uint8* a_buf,
  1680. uint8* dst_argb,
  1681. const struct YuvConstants* yuvconstants,
  1682. int width) {
  1683. asm volatile (
  1684. YUVTORGB_SETUP(yuvconstants)
  1685. "sub %[u_buf],%[v_buf] \n"
  1686. LABELALIGN
  1687. "1: \n"
  1688. READYUVA422
  1689. YUVTORGB(yuvconstants)
  1690. STOREARGB
  1691. "subl $0x8,%[width] \n"
  1692. "jg 1b \n"
  1693. : [y_buf]"+r"(y_buf), // %[y_buf]
  1694. [u_buf]"+r"(u_buf), // %[u_buf]
  1695. [v_buf]"+r"(v_buf), // %[v_buf]
  1696. [a_buf]"+r"(a_buf), // %[a_buf]
  1697. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1698. #if defined(__i386__) && defined(__pic__)
  1699. [width]"+m"(width) // %[width]
  1700. #else
  1701. [width]"+rm"(width) // %[width]
  1702. #endif
  1703. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1704. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1705. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1706. );
  1707. }
  1708. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  1709. #ifdef HAS_I411TOARGBROW_SSSE3
  1710. void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
  1711. const uint8* u_buf,
  1712. const uint8* v_buf,
  1713. uint8* dst_argb,
  1714. const struct YuvConstants* yuvconstants,
  1715. int width) {
  1716. int temp;
  1717. asm volatile (
  1718. YUVTORGB_SETUP(yuvconstants)
  1719. "sub %[u_buf],%[v_buf] \n"
  1720. "pcmpeqb %%xmm5,%%xmm5 \n"
  1721. LABELALIGN
  1722. "1: \n"
  1723. READYUV411_TEMP
  1724. YUVTORGB(yuvconstants)
  1725. STOREARGB
  1726. "subl $0x8,%[width] \n"
  1727. "jg 1b \n"
  1728. : [y_buf]"+r"(y_buf), // %[y_buf]
  1729. [u_buf]"+r"(u_buf), // %[u_buf]
  1730. [v_buf]"+r"(v_buf), // %[v_buf]
  1731. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1732. [temp]"=&r"(temp), // %[temp]
  1733. #if defined(__i386__) && defined(__pic__)
  1734. [width]"+m"(width) // %[width]
  1735. #else
  1736. [width]"+rm"(width) // %[width]
  1737. #endif
  1738. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1739. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1740. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1741. );
  1742. }
  1743. #endif
  1744. void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
  1745. const uint8* uv_buf,
  1746. uint8* dst_argb,
  1747. const struct YuvConstants* yuvconstants,
  1748. int width) {
  1749. asm volatile (
  1750. YUVTORGB_SETUP(yuvconstants)
  1751. "pcmpeqb %%xmm5,%%xmm5 \n"
  1752. LABELALIGN
  1753. "1: \n"
  1754. READNV12
  1755. YUVTORGB(yuvconstants)
  1756. STOREARGB
  1757. "sub $0x8,%[width] \n"
  1758. "jg 1b \n"
  1759. : [y_buf]"+r"(y_buf), // %[y_buf]
  1760. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  1761. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1762. [width]"+rm"(width) // %[width]
  1763. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1764. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1765. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1766. );
  1767. }
  1768. void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
  1769. const uint8* vu_buf,
  1770. uint8* dst_argb,
  1771. const struct YuvConstants* yuvconstants,
  1772. int width) {
  1773. asm volatile (
  1774. YUVTORGB_SETUP(yuvconstants)
  1775. "pcmpeqb %%xmm5,%%xmm5 \n"
  1776. LABELALIGN
  1777. "1: \n"
  1778. READNV21
  1779. YUVTORGB(yuvconstants)
  1780. STOREARGB
  1781. "sub $0x8,%[width] \n"
  1782. "jg 1b \n"
  1783. : [y_buf]"+r"(y_buf), // %[y_buf]
  1784. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  1785. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1786. [width]"+rm"(width) // %[width]
  1787. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1788. [kShuffleNV21]"m"(kShuffleNV21)
  1789. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1790. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1791. );
  1792. }
  1793. void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
  1794. uint8* dst_argb,
  1795. const struct YuvConstants* yuvconstants,
  1796. int width) {
  1797. asm volatile (
  1798. YUVTORGB_SETUP(yuvconstants)
  1799. "pcmpeqb %%xmm5,%%xmm5 \n"
  1800. LABELALIGN
  1801. "1: \n"
  1802. READYUY2
  1803. YUVTORGB(yuvconstants)
  1804. STOREARGB
  1805. "sub $0x8,%[width] \n"
  1806. "jg 1b \n"
  1807. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  1808. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1809. [width]"+rm"(width) // %[width]
  1810. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1811. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  1812. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  1813. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1814. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1815. );
  1816. }
  1817. void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
  1818. uint8* dst_argb,
  1819. const struct YuvConstants* yuvconstants,
  1820. int width) {
  1821. asm volatile (
  1822. YUVTORGB_SETUP(yuvconstants)
  1823. "pcmpeqb %%xmm5,%%xmm5 \n"
  1824. LABELALIGN
  1825. "1: \n"
  1826. READUYVY
  1827. YUVTORGB(yuvconstants)
  1828. STOREARGB
  1829. "sub $0x8,%[width] \n"
  1830. "jg 1b \n"
  1831. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  1832. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1833. [width]"+rm"(width) // %[width]
  1834. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  1835. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  1836. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  1837. : "memory", "cc", YUVTORGB_REGS // Does not use r14.
  1838. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1839. );
  1840. }
  1841. void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
  1842. const uint8* u_buf,
  1843. const uint8* v_buf,
  1844. uint8* dst_rgba,
  1845. const struct YuvConstants* yuvconstants,
  1846. int width) {
  1847. asm volatile (
  1848. YUVTORGB_SETUP(yuvconstants)
  1849. "sub %[u_buf],%[v_buf] \n"
  1850. "pcmpeqb %%xmm5,%%xmm5 \n"
  1851. LABELALIGN
  1852. "1: \n"
  1853. READYUV422
  1854. YUVTORGB(yuvconstants)
  1855. STORERGBA
  1856. "sub $0x8,%[width] \n"
  1857. "jg 1b \n"
  1858. : [y_buf]"+r"(y_buf), // %[y_buf]
  1859. [u_buf]"+r"(u_buf), // %[u_buf]
  1860. [v_buf]"+r"(v_buf), // %[v_buf]
  1861. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  1862. [width]"+rm"(width) // %[width]
  1863. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1864. : "memory", "cc", NACL_R14 YUVTORGB_REGS
  1865. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1866. );
  1867. }
  1868. #endif // HAS_I422TOARGBROW_SSSE3
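// Illustrative caller loop for the SSSE3 kernels above, for reference only
// (not compiled).  It assumes width is a multiple of 8; the shipped library
// instead dispatches through the higher-level planar converters, which also
// handle remainder pixels.
#if 0
static void I422ToARGBPlane_Sketch(const uint8* src_y, int src_stride_y,
                                   const uint8* src_u, int src_stride_u,
                                   const uint8* src_v, int src_stride_v,
                                   uint8* dst_argb, int dst_stride_argb,
                                   const struct YuvConstants* yuvconstants,
                                   int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, yuvconstants, width);
    src_y += src_stride_y;
    src_u += src_stride_u;  // 4:2:2 chroma advances every row.
    src_v += src_stride_v;
    dst_argb += dst_stride_argb;
  }
}
#endif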
  1869. // Read 16 UV from 444
  1870. #define READYUV444_AVX2 \
  1871. "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1872. MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1873. "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
  1874. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1875. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  1876. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1877. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1878. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1879. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1880. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1881. // Read 8 UV from 422, upsample to 16 UV.
  1882. #define READYUV422_AVX2 \
  1883. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1884. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1885. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1886. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1887. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1888. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1889. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1890. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1891. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1892. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1893. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1894. #define READYUVA422_AVX2 \
  1895. "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1896. MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1897. "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  1898. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1899. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1900. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1901. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1902. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1903. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1904. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
  1905. "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  1906. "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  1907. "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
  1908. // Read 4 UV from 411, upsample to 16 UV.
  1909. #define READYUV411_AVX2 \
  1910. "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  1911. MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  1912. "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  1913. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  1914. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1915. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1916. "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
  1917. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1918. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1919. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1920. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1921. // Read 8 UV from NV12, upsample to 16 UV.
  1922. #define READNV12_AVX2 \
  1923. "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  1924. "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  1925. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1926. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  1927. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1928. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1929. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1930. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1931. // Read 8 VU from NV21, upsample to 16 UV.
  1932. #define READNV21_AVX2 \
  1933. "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  1934. "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  1935. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  1936. "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  1937. "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  1938. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  1939. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  1940. "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
  1941. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1942. #define READYUY2_AVX2 \
  1943. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
  1944. "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  1945. "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
  1946. "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  1947. "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
  1948. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1949. #define READUYVY_AVX2 \
  1950. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
  1951. "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  1952. "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
  1953. "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  1954. "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
  1955. #if defined(__x86_64__)
  1956. #define YUVTORGB_SETUP_AVX2(yuvconstants) \
  1957. "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
  1958. "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
  1959. "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
  1960. "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
  1961. "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
  1962. "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
  1963. "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
  1964. #define YUVTORGB_AVX2(yuvconstants) \
  1965. "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  1966. "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  1967. "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  1968. "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  1969. "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  1970. "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  1971. "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  1972. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1973. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1974. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1975. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  1976. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  1977. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  1978. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  1979. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  1980. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  1981. #define YUVTORGB_REGS_AVX2 \
  1982. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1983. #else // Convert 16 pixels: 16 UV and 16 Y.
  1984. #define YUVTORGB_SETUP_AVX2(yuvconstants)
  1985. #define YUVTORGB_AVX2(yuvconstants) \
  1986. "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
  1987. "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
  1988. "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
  1989. "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
  1990. "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
  1991. "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
  1992. "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
  1993. "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
  1994. "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
  1995. "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
  1996. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  1997. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  1998. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  1999. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  2000. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  2001. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  2002. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2003. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  2004. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  2005. #define YUVTORGB_REGS_AVX2
  2006. #endif
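// As in the SSSE3 section, the x86_64 build caches the seven YuvConstants rows
// in ymm8-ymm14 via YUVTORGB_SETUP_AVX2, while the 32-bit build (with only
// eight ymm registers) re-reads each row from memory and reuses ymm3 as
// scratch for the bias rows.  The arithmetic matches the SSSE3 YUVTORGB
// formula above, applied to 16 pixels per iteration.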
  2007. // Store 16 ARGB values.
  2008. #define STOREARGB_AVX2 \
  2009. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2010. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2011. "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  2012. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2013. "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  2014. "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2015. "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
  2016. "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
  2017. "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
  2018. #ifdef HAS_I444TOARGBROW_AVX2
  2019. // 16 pixels
  2020. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  2021. void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
  2022. const uint8* u_buf,
  2023. const uint8* v_buf,
  2024. uint8* dst_argb,
  2025. const struct YuvConstants* yuvconstants,
  2026. int width) {
  2027. asm volatile (
  2028. YUVTORGB_SETUP_AVX2(yuvconstants)
  2029. "sub %[u_buf],%[v_buf] \n"
  2030. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2031. LABELALIGN
  2032. "1: \n"
  2033. READYUV444_AVX2
  2034. YUVTORGB_AVX2(yuvconstants)
  2035. STOREARGB_AVX2
  2036. "sub $0x10,%[width] \n"
  2037. "jg 1b \n"
  2038. "vzeroupper \n"
  2039. : [y_buf]"+r"(y_buf), // %[y_buf]
  2040. [u_buf]"+r"(u_buf), // %[u_buf]
  2041. [v_buf]"+r"(v_buf), // %[v_buf]
  2042. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2043. [width]"+rm"(width) // %[width]
  2044. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2045. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2046. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2047. );
  2048. }
  2049. #endif // HAS_I444TOARGBROW_AVX2
  2050. #ifdef HAS_I411TOARGBROW_AVX2
  2051. // 16 pixels
  2052. // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2053. void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
  2054. const uint8* u_buf,
  2055. const uint8* v_buf,
  2056. uint8* dst_argb,
  2057. const struct YuvConstants* yuvconstants,
  2058. int width) {
  2059. asm volatile (
  2060. YUVTORGB_SETUP_AVX2(yuvconstants)
  2061. "sub %[u_buf],%[v_buf] \n"
  2062. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2063. LABELALIGN
  2064. "1: \n"
  2065. READYUV411_AVX2
  2066. YUVTORGB_AVX2(yuvconstants)
  2067. STOREARGB_AVX2
  2068. "sub $0x10,%[width] \n"
  2069. "jg 1b \n"
  2070. "vzeroupper \n"
  2071. : [y_buf]"+r"(y_buf), // %[y_buf]
  2072. [u_buf]"+r"(u_buf), // %[u_buf]
  2073. [v_buf]"+r"(v_buf), // %[v_buf]
  2074. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2075. [width]"+rm"(width) // %[width]
  2076. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2077. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2078. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2079. );
  2080. }
  2081. #endif // HAS_I411TOARGBROW_AVX2
  2082. #if defined(HAS_I422TOARGBROW_AVX2)
  2083. // 16 pixels
  2084. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2085. void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
  2086. const uint8* u_buf,
  2087. const uint8* v_buf,
  2088. uint8* dst_argb,
  2089. const struct YuvConstants* yuvconstants,
  2090. int width) {
  2091. asm volatile (
  2092. YUVTORGB_SETUP_AVX2(yuvconstants)
  2093. "sub %[u_buf],%[v_buf] \n"
  2094. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2095. LABELALIGN
  2096. "1: \n"
  2097. READYUV422_AVX2
  2098. YUVTORGB_AVX2(yuvconstants)
  2099. STOREARGB_AVX2
  2100. "sub $0x10,%[width] \n"
  2101. "jg 1b \n"
  2102. "vzeroupper \n"
  2103. : [y_buf]"+r"(y_buf), // %[y_buf]
  2104. [u_buf]"+r"(u_buf), // %[u_buf]
  2105. [v_buf]"+r"(v_buf), // %[v_buf]
  2106. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2107. [width]"+rm"(width) // %[width]
  2108. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2109. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2110. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2111. );
  2112. }
  2113. #endif // HAS_I422TOARGBROW_AVX2
  2114. #if defined(HAS_I422ALPHATOARGBROW_AVX2)
  2115. // 16 pixels
  2116. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  2117. void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
  2118. const uint8* u_buf,
  2119. const uint8* v_buf,
  2120. const uint8* a_buf,
  2121. uint8* dst_argb,
  2122. const struct YuvConstants* yuvconstants,
  2123. int width) {
  2124. asm volatile (
  2125. YUVTORGB_SETUP_AVX2(yuvconstants)
  2126. "sub %[u_buf],%[v_buf] \n"
  2127. LABELALIGN
  2128. "1: \n"
  2129. READYUVA422_AVX2
  2130. YUVTORGB_AVX2(yuvconstants)
  2131. STOREARGB_AVX2
  2132. "subl $0x10,%[width] \n"
  2133. "jg 1b \n"
  2134. "vzeroupper \n"
  2135. : [y_buf]"+r"(y_buf), // %[y_buf]
  2136. [u_buf]"+r"(u_buf), // %[u_buf]
  2137. [v_buf]"+r"(v_buf), // %[v_buf]
  2138. [a_buf]"+r"(a_buf), // %[a_buf]
  2139. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2140. #if defined(__i386__) && defined(__pic__)
  2141. [width]"+m"(width) // %[width]
  2142. #else
  2143. [width]"+rm"(width) // %[width]
  2144. #endif
  2145. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2146. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2147. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2148. );
  2149. }
  2150. #endif // HAS_I422ALPHATOARGBROW_AVX2
  2151. #if defined(HAS_I422TORGBAROW_AVX2)
  2152. // 16 pixels
  2153. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2154. void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
  2155. const uint8* u_buf,
  2156. const uint8* v_buf,
  2157. uint8* dst_argb,
  2158. const struct YuvConstants* yuvconstants,
  2159. int width) {
  2160. asm volatile (
  2161. YUVTORGB_SETUP_AVX2(yuvconstants)
  2162. "sub %[u_buf],%[v_buf] \n"
  2163. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2164. LABELALIGN
  2165. "1: \n"
  2166. READYUV422_AVX2
  2167. YUVTORGB_AVX2(yuvconstants)
  2168. // Step 3: Weave into RGBA
  2169. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  2170. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2171. "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
  2172. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2173. "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
  2174. "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
  2175. "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
  2176. "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
  2177. "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
  2178. "sub $0x10,%[width] \n"
  2179. "jg 1b \n"
  2180. "vzeroupper \n"
  2181. : [y_buf]"+r"(y_buf), // %[y_buf]
  2182. [u_buf]"+r"(u_buf), // %[u_buf]
  2183. [v_buf]"+r"(v_buf), // %[v_buf]
  2184. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2185. [width]"+rm"(width) // %[width]
  2186. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2187. : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
  2188. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2189. );
  2190. }
  2191. #endif // HAS_I422TORGBAROW_AVX2
  2192. #if defined(HAS_NV12TOARGBROW_AVX2)
  2193. // 16 pixels.
  2194. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2195. void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
  2196. const uint8* uv_buf,
  2197. uint8* dst_argb,
  2198. const struct YuvConstants* yuvconstants,
  2199. int width) {
  2200. asm volatile (
  2201. YUVTORGB_SETUP_AVX2(yuvconstants)
  2202. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2203. LABELALIGN
  2204. "1: \n"
  2205. READNV12_AVX2
  2206. YUVTORGB_AVX2(yuvconstants)
  2207. STOREARGB_AVX2
  2208. "sub $0x10,%[width] \n"
  2209. "jg 1b \n"
  2210. "vzeroupper \n"
  2211. : [y_buf]"+r"(y_buf), // %[y_buf]
  2212. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2213. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2214. [width]"+rm"(width) // %[width]
  2215. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2216. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2217. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2218. );
  2219. }
  2220. #endif // HAS_NV12TOARGBROW_AVX2
  2221. #if defined(HAS_NV21TOARGBROW_AVX2)
  2222. // 16 pixels.
  2223. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2224. void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
  2225. const uint8* vu_buf,
  2226. uint8* dst_argb,
  2227. const struct YuvConstants* yuvconstants,
  2228. int width) {
  2229. asm volatile (
  2230. YUVTORGB_SETUP_AVX2(yuvconstants)
  2231. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2232. LABELALIGN
  2233. "1: \n"
  2234. READNV21_AVX2
  2235. YUVTORGB_AVX2(yuvconstants)
  2236. STOREARGB_AVX2
  2237. "sub $0x10,%[width] \n"
  2238. "jg 1b \n"
  2239. "vzeroupper \n"
  2240. : [y_buf]"+r"(y_buf), // %[y_buf]
  2241. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2242. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2243. [width]"+rm"(width) // %[width]
  2244. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2245. [kShuffleNV21]"m"(kShuffleNV21)
  2246. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2247. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2248. );
  2249. }
  2250. #endif // HAS_NV21TOARGBROW_AVX2
  2251. #if defined(HAS_YUY2TOARGBROW_AVX2)
  2252. // 16 pixels.
  2253. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2254. void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
  2255. uint8* dst_argb,
  2256. const struct YuvConstants* yuvconstants,
  2257. int width) {
  2258. asm volatile (
  2259. YUVTORGB_SETUP_AVX2(yuvconstants)
  2260. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2261. LABELALIGN
  2262. "1: \n"
  2263. READYUY2_AVX2
  2264. YUVTORGB_AVX2(yuvconstants)
  2265. STOREARGB_AVX2
  2266. "sub $0x10,%[width] \n"
  2267. "jg 1b \n"
  2268. "vzeroupper \n"
  2269. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2270. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2271. [width]"+rm"(width) // %[width]
  2272. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2273. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2274. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2275. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2276. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2277. );
  2278. }
  2279. #endif // HAS_YUY2TOARGBROW_AVX2
  2280. #if defined(HAS_UYVYTOARGBROW_AVX2)
  2281. // 16 pixels.
  2282. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2283. void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
  2284. uint8* dst_argb,
  2285. const struct YuvConstants* yuvconstants,
  2286. int width) {
  2287. asm volatile (
  2288. YUVTORGB_SETUP_AVX2(yuvconstants)
  2289. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2290. LABELALIGN
  2291. "1: \n"
  2292. READUYVY_AVX2
  2293. YUVTORGB_AVX2(yuvconstants)
  2294. STOREARGB_AVX2
  2295. "sub $0x10,%[width] \n"
  2296. "jg 1b \n"
  2297. "vzeroupper \n"
  2298. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2299. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2300. [width]"+rm"(width) // %[width]
  2301. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2302. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2303. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2304. : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
  2305. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2306. );
  2307. }
  2308. #endif // HAS_UYVYTOARGBROW_AVX2
  2309. #ifdef HAS_I400TOARGBROW_SSE2
  2310. void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  2311. asm volatile (
  2312. "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
  2313. "movd %%eax,%%xmm2 \n"
  2314. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  2315. "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2316. "movd %%eax,%%xmm3 \n"
  2317. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  2318. "pcmpeqb %%xmm4,%%xmm4 \n"
  2319. "pslld $0x18,%%xmm4 \n"
  2320. LABELALIGN
  2321. "1: \n"
  2322. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2323. "movq " MEMACCESS(0) ",%%xmm0 \n"
  2324. "lea " MEMLEA(0x8,0) ",%0 \n"
  2325. "punpcklbw %%xmm0,%%xmm0 \n"
  2326. "pmulhuw %%xmm2,%%xmm0 \n"
  2327. "psubusw %%xmm3,%%xmm0 \n"
  2328. "psrlw $6, %%xmm0 \n"
  2329. "packuswb %%xmm0,%%xmm0 \n"
  2330. // Step 2: Weave into ARGB
  2331. "punpcklbw %%xmm0,%%xmm0 \n"
  2332. "movdqa %%xmm0,%%xmm1 \n"
  2333. "punpcklwd %%xmm0,%%xmm0 \n"
  2334. "punpckhwd %%xmm1,%%xmm1 \n"
  2335. "por %%xmm4,%%xmm0 \n"
  2336. "por %%xmm4,%%xmm1 \n"
  2337. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2338. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2339. "lea " MEMLEA(0x20,1) ",%1 \n"
  2340. "sub $0x8,%2 \n"
  2341. "jg 1b \n"
  2342. : "+r"(y_buf), // %0
  2343. "+r"(dst_argb), // %1
  2344. "+rm"(width) // %2
  2345. :
  2346. : "memory", "cc", "eax"
  2347. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2348. );
  2349. }
  2350. #endif // HAS_I400TOARGBROW_SSE2
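// A scalar model of the I400ToARGB kernels, for reference only (not compiled).
// It transcribes the fixed-point steps above: duplicate the Y byte into a
// 16-bit lane, scale by 0x4a35 (about 1.164), subtract the 0x0488 bias with
// unsigned saturation, shift down, clamp, and splat to B/G/R with alpha 0xff.
#if 0
static void I400ToARGBRow_ScalarSketch(const uint8* y_buf, uint8* dst_argb,
                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 y16 = y_buf[x] * 0x0101u;         // punpcklbw y,y
    uint32 g = (y16 * 0x4a35u) >> 16;        // pmulhuw
    g = (g > 0x0488u) ? (g - 0x0488u) : 0u;  // psubusw
    g >>= 6;                                 // psrlw $6
    if (g > 255u) g = 255u;                  // packuswb saturation
    dst_argb[0] = (uint8)g;                  // B
    dst_argb[1] = (uint8)g;                  // G
    dst_argb[2] = (uint8)g;                  // R
    dst_argb[3] = 255u;                      // A (por with 0xff000000)
    dst_argb += 4;
  }
}
#endif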
  2351. #ifdef HAS_I400TOARGBROW_AVX2
  2352. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// Note: the per-lane AVX2 vpunpcklbw scrambles ("mutates") byte order across
// the two 128-bit lanes; the per-lane vpackuswb later undoes that mutation.
  2354. void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  2355. asm volatile (
  2356. "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
  2357. "vmovd %%eax,%%xmm2 \n"
  2358. "vbroadcastss %%xmm2,%%ymm2 \n"
  2359. "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
  2360. "vmovd %%eax,%%xmm3 \n"
  2361. "vbroadcastss %%xmm3,%%ymm3 \n"
  2362. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  2363. "vpslld $0x18,%%ymm4,%%ymm4 \n"
  2364. LABELALIGN
  2365. "1: \n"
  2366. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2367. "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
  2368. "lea " MEMLEA(0x10,0) ",%0 \n"
  2369. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2370. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  2371. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  2372. "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
  2373. "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
  2374. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  2375. "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
  2376. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2377. "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
  2378. "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
  2379. "vpor %%ymm4,%%ymm0,%%ymm0 \n"
  2380. "vpor %%ymm4,%%ymm1,%%ymm1 \n"
  2381. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2382. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2383. "lea " MEMLEA(0x40,1) ",%1 \n"
  2384. "sub $0x10,%2 \n"
  2385. "jg 1b \n"
  2386. "vzeroupper \n"
  2387. : "+r"(y_buf), // %0
  2388. "+r"(dst_argb), // %1
  2389. "+rm"(width) // %2
  2390. :
  2391. : "memory", "cc", "eax"
  2392. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  2393. );
  2394. }
  2395. #endif // HAS_I400TOARGBROW_AVX2
  2396. #ifdef HAS_MIRRORROW_SSSE3
  2397. // Shuffle table for reversing the bytes.
  2398. static uvec8 kShuffleMirror = {
  2399. 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2400. };
  2401. void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  2402. intptr_t temp_width = (intptr_t)(width);
  2403. asm volatile (
  2404. "movdqa %3,%%xmm5 \n"
  2405. LABELALIGN
  2406. "1: \n"
  2407. MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
  2408. "pshufb %%xmm5,%%xmm0 \n"
  2409. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2410. "lea " MEMLEA(0x10,1) ",%1 \n"
  2411. "sub $0x10,%2 \n"
  2412. "jg 1b \n"
  2413. : "+r"(src), // %0
  2414. "+r"(dst), // %1
  2415. "+r"(temp_width) // %2
  2416. : "m"(kShuffleMirror) // %3
  2417. : "memory", "cc", NACL_R14
  2418. "xmm0", "xmm5"
  2419. );
  2420. }
  2421. #endif // HAS_MIRRORROW_SSSE3
  2422. #ifdef HAS_MIRRORROW_AVX2
  2423. void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2424. intptr_t temp_width = (intptr_t)(width);
  2425. asm volatile (
  2426. "vbroadcastf128 %3,%%ymm5 \n"
  2427. LABELALIGN
  2428. "1: \n"
  2429. MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
  2430. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  2431. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  2432. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2433. "lea " MEMLEA(0x20,1) ",%1 \n"
  2434. "sub $0x20,%2 \n"
  2435. "jg 1b \n"
  2436. "vzeroupper \n"
  2437. : "+r"(src), // %0
  2438. "+r"(dst), // %1
  2439. "+r"(temp_width) // %2
  2440. : "m"(kShuffleMirror) // %3
  2441. : "memory", "cc", NACL_R14
  2442. "xmm0", "xmm5"
  2443. );
  2444. }
  2445. #endif // HAS_MIRRORROW_AVX2
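// MirrorRow reverses a row of bytes; in scalar terms (reference only, not
// compiled):
#if 0
static void MirrorRow_ScalarSketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}
#endif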
  2446. #ifdef HAS_MIRRORUVROW_SSSE3
  2447. // Shuffle table for reversing the bytes of UV channels.
  2448. static uvec8 kShuffleMirrorUV = {
  2449. 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
  2450. };
  2451. void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
  2452. int width) {
  2453. intptr_t temp_width = (intptr_t)(width);
  2454. asm volatile (
  2455. "movdqa %4,%%xmm1 \n"
  2456. "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
  2457. "sub %1,%2 \n"
  2458. LABELALIGN
  2459. "1: \n"
  2460. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2461. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2462. "pshufb %%xmm1,%%xmm0 \n"
  2463. "movlpd %%xmm0," MEMACCESS(1) " \n"
  2464. MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
  2465. "lea " MEMLEA(0x8,1) ",%1 \n"
  2466. "sub $8,%3 \n"
  2467. "jg 1b \n"
  2468. : "+r"(src), // %0
  2469. "+r"(dst_u), // %1
  2470. "+r"(dst_v), // %2
  2471. "+r"(temp_width) // %3
  2472. : "m"(kShuffleMirrorUV) // %4
  2473. : "memory", "cc", NACL_R14
  2474. "xmm0", "xmm1"
  2475. );
  2476. }
  2477. #endif // HAS_MIRRORUVROW_SSSE3
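// MirrorUVRow reverses an interleaved UV row and deinterleaves it in one pass:
// the kShuffleMirrorUV table pulls the even (U) bytes reversed into the low
// half of xmm0 and the odd (V) bytes reversed into the high half, and
// movlpd/movhpd split the halves to the two planes.  In scalar terms
// (reference only, not compiled):
#if 0
static void MirrorUVRow_ScalarSketch(const uint8* src_uv, uint8* dst_u,
                                     uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[(width - 1 - x) * 2 + 0];
    dst_v[x] = src_uv[(width - 1 - x) * 2 + 1];
  }
}
#endif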
  2478. #ifdef HAS_ARGBMIRRORROW_SSE2
  2479. void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  2480. intptr_t temp_width = (intptr_t)(width);
  2481. asm volatile (
  2482. "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
  2483. LABELALIGN
  2484. "1: \n"
  2485. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2486. "pshufd $0x1b,%%xmm0,%%xmm0 \n"
  2487. "lea " MEMLEA(-0x10,0) ",%0 \n"
  2488. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2489. "lea " MEMLEA(0x10,1) ",%1 \n"
  2490. "sub $0x4,%2 \n"
  2491. "jg 1b \n"
  2492. : "+r"(src), // %0
  2493. "+r"(dst), // %1
  2494. "+r"(temp_width) // %2
  2495. :
  2496. : "memory", "cc"
  2497. , "xmm0"
  2498. );
  2499. }
  2500. #endif // HAS_ARGBMIRRORROW_SSE2
  2501. #ifdef HAS_ARGBMIRRORROW_AVX2
2502. // Permute table for reversing the ARGB pixels (32-bit lanes).
  2503. static const ulvec32 kARGBShuffleMirror_AVX2 = {
  2504. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
  2505. };
  2506. void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  2507. intptr_t temp_width = (intptr_t)(width);
  2508. asm volatile (
  2509. "vmovdqu %3,%%ymm5 \n"
  2510. LABELALIGN
  2511. "1: \n"
  2512. VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
  2513. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2514. "lea " MEMLEA(0x20,1) ",%1 \n"
  2515. "sub $0x8,%2 \n"
  2516. "jg 1b \n"
  2517. "vzeroupper \n"
  2518. : "+r"(src), // %0
  2519. "+r"(dst), // %1
  2520. "+r"(temp_width) // %2
  2521. : "m"(kARGBShuffleMirror_AVX2) // %3
  2522. : "memory", "cc", NACL_R14
  2523. "xmm0", "xmm5"
  2524. );
  2525. }
  2526. #endif // HAS_ARGBMIRRORROW_AVX2
  2527. #ifdef HAS_SPLITUVROW_AVX2
  2528. void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  2529. int width) {
  2530. asm volatile (
  2531. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2532. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  2533. "sub %1,%2 \n"
  2534. LABELALIGN
  2535. "1: \n"
  2536. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2537. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2538. "lea " MEMLEA(0x40,0) ",%0 \n"
  2539. "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
  2540. "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
  2541. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  2542. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  2543. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  2544. "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
  2545. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2546. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2547. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2548. MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
  2549. "lea " MEMLEA(0x20,1) ",%1 \n"
  2550. "sub $0x20,%3 \n"
  2551. "jg 1b \n"
  2552. "vzeroupper \n"
  2553. : "+r"(src_uv), // %0
  2554. "+r"(dst_u), // %1
  2555. "+r"(dst_v), // %2
  2556. "+r"(width) // %3
  2557. :
  2558. : "memory", "cc", NACL_R14
  2559. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2560. );
  2561. }
  2562. #endif // HAS_SPLITUVROW_AVX2
  2563. #ifdef HAS_SPLITUVROW_SSE2
  2564. void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  2565. int width) {
  2566. asm volatile (
  2567. "pcmpeqb %%xmm5,%%xmm5 \n"
  2568. "psrlw $0x8,%%xmm5 \n"
  2569. "sub %1,%2 \n"
  2570. LABELALIGN
  2571. "1: \n"
  2572. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2573. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2574. "lea " MEMLEA(0x20,0) ",%0 \n"
  2575. "movdqa %%xmm0,%%xmm2 \n"
  2576. "movdqa %%xmm1,%%xmm3 \n"
  2577. "pand %%xmm5,%%xmm0 \n"
  2578. "pand %%xmm5,%%xmm1 \n"
  2579. "packuswb %%xmm1,%%xmm0 \n"
  2580. "psrlw $0x8,%%xmm2 \n"
  2581. "psrlw $0x8,%%xmm3 \n"
  2582. "packuswb %%xmm3,%%xmm2 \n"
  2583. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2584. MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
  2585. "lea " MEMLEA(0x10,1) ",%1 \n"
  2586. "sub $0x10,%3 \n"
  2587. "jg 1b \n"
  2588. : "+r"(src_uv), // %0
  2589. "+r"(dst_u), // %1
  2590. "+r"(dst_v), // %2
  2591. "+r"(width) // %3
  2592. :
  2593. : "memory", "cc", NACL_R14
  2594. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2595. );
  2596. }
  2597. #endif // HAS_SPLITUVROW_SSE2
  2598. #ifdef HAS_MERGEUVROW_AVX2
  2599. void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  2600. int width) {
  2601. asm volatile (
  2602. "sub %0,%1 \n"
  2603. LABELALIGN
  2604. "1: \n"
  2605. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2606. MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
  2607. "lea " MEMLEA(0x20,0) ",%0 \n"
  2608. "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
  2609. "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
  2610. "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
  2611. "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
  2612. "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
  2613. "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
  2614. "lea " MEMLEA(0x40,2) ",%2 \n"
  2615. "sub $0x20,%3 \n"
  2616. "jg 1b \n"
  2617. "vzeroupper \n"
  2618. : "+r"(src_u), // %0
  2619. "+r"(src_v), // %1
  2620. "+r"(dst_uv), // %2
  2621. "+r"(width) // %3
  2622. :
  2623. : "memory", "cc", NACL_R14
  2624. "xmm0", "xmm1", "xmm2"
  2625. );
  2626. }
  2627. #endif // HAS_MERGEUVROW_AVX2
  2628. #ifdef HAS_MERGEUVROW_SSE2
  2629. void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
  2630. int width) {
  2631. asm volatile (
  2632. "sub %0,%1 \n"
  2633. LABELALIGN
  2634. "1: \n"
  2635. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2636. MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
  2637. "lea " MEMLEA(0x10,0) ",%0 \n"
  2638. "movdqa %%xmm0,%%xmm2 \n"
  2639. "punpcklbw %%xmm1,%%xmm0 \n"
  2640. "punpckhbw %%xmm1,%%xmm2 \n"
  2641. "movdqu %%xmm0," MEMACCESS(2) " \n"
  2642. "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
  2643. "lea " MEMLEA(0x20,2) ",%2 \n"
  2644. "sub $0x10,%3 \n"
  2645. "jg 1b \n"
  2646. : "+r"(src_u), // %0
  2647. "+r"(src_v), // %1
  2648. "+r"(dst_uv), // %2
  2649. "+r"(width) // %3
  2650. :
  2651. : "memory", "cc", NACL_R14
  2652. "xmm0", "xmm1", "xmm2"
  2653. );
  2654. }
  2655. #endif // HAS_MERGEUVROW_SSE2
  2656. #ifdef HAS_COPYROW_SSE2
  2657. void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  2658. asm volatile (
  2659. "test $0xf,%0 \n"
  2660. "jne 2f \n"
  2661. "test $0xf,%1 \n"
  2662. "jne 2f \n"
  2663. LABELALIGN
  2664. "1: \n"
  2665. "movdqa " MEMACCESS(0) ",%%xmm0 \n"
  2666. "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2667. "lea " MEMLEA(0x20,0) ",%0 \n"
  2668. "movdqa %%xmm0," MEMACCESS(1) " \n"
  2669. "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
  2670. "lea " MEMLEA(0x20,1) ",%1 \n"
  2671. "sub $0x20,%2 \n"
  2672. "jg 1b \n"
  2673. "jmp 9f \n"
  2674. LABELALIGN
  2675. "2: \n"
  2676. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2677. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2678. "lea " MEMLEA(0x20,0) ",%0 \n"
  2679. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2680. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  2681. "lea " MEMLEA(0x20,1) ",%1 \n"
  2682. "sub $0x20,%2 \n"
  2683. "jg 2b \n"
  2684. "9: \n"
  2685. : "+r"(src), // %0
  2686. "+r"(dst), // %1
  2687. "+r"(count) // %2
  2688. :
  2689. : "memory", "cc"
  2690. , "xmm0", "xmm1"
  2691. );
  2692. }
  2693. #endif // HAS_COPYROW_SSE2
  2694. #ifdef HAS_COPYROW_AVX
  2695. void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  2696. asm volatile (
  2697. LABELALIGN
  2698. "1: \n"
  2699. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  2700. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  2701. "lea " MEMLEA(0x40,0) ",%0 \n"
  2702. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  2703. "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
  2704. "lea " MEMLEA(0x40,1) ",%1 \n"
  2705. "sub $0x40,%2 \n"
  2706. "jg 1b \n"
  2707. : "+r"(src), // %0
  2708. "+r"(dst), // %1
  2709. "+r"(count) // %2
  2710. :
  2711. : "memory", "cc"
  2712. , "xmm0", "xmm1"
  2713. );
  2714. }
  2715. #endif // HAS_COPYROW_AVX
  2716. #ifdef HAS_COPYROW_ERMS
2717. // Handles any width (multiple of 1 byte); uses rep movsb.
  2718. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  2719. size_t width_tmp = (size_t)(width);
  2720. asm volatile (
  2721. "rep movsb " MEMMOVESTRING(0,1) " \n"
  2722. : "+S"(src), // %0
  2723. "+D"(dst), // %1
  2724. "+c"(width_tmp) // %2
  2725. :
  2726. : "memory", "cc"
  2727. );
  2728. }
  2729. #endif // HAS_COPYROW_ERMS
  2730. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  2731. // width in pixels
  2732. void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  2733. asm volatile (
  2734. "pcmpeqb %%xmm0,%%xmm0 \n"
  2735. "pslld $0x18,%%xmm0 \n"
  2736. "pcmpeqb %%xmm1,%%xmm1 \n"
  2737. "psrld $0x8,%%xmm1 \n"
  2738. LABELALIGN
  2739. "1: \n"
  2740. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  2741. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  2742. "lea " MEMLEA(0x20,0) ",%0 \n"
  2743. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  2744. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  2745. "pand %%xmm0,%%xmm2 \n"
  2746. "pand %%xmm0,%%xmm3 \n"
  2747. "pand %%xmm1,%%xmm4 \n"
  2748. "pand %%xmm1,%%xmm5 \n"
  2749. "por %%xmm4,%%xmm2 \n"
  2750. "por %%xmm5,%%xmm3 \n"
  2751. "movdqu %%xmm2," MEMACCESS(1) " \n"
  2752. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  2753. "lea " MEMLEA(0x20,1) ",%1 \n"
  2754. "sub $0x8,%2 \n"
  2755. "jg 1b \n"
  2756. : "+r"(src), // %0
  2757. "+r"(dst), // %1
  2758. "+r"(width) // %2
  2759. :
  2760. : "memory", "cc"
  2761. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2762. );
  2763. }
  2764. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  2765. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  2766. // width in pixels
  2767. void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  2768. asm volatile (
  2769. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  2770. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  2771. LABELALIGN
  2772. "1: \n"
  2773. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  2774. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
  2775. "lea " MEMLEA(0x40,0) ",%0 \n"
  2776. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  2777. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  2778. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  2779. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  2780. "lea " MEMLEA(0x40,1) ",%1 \n"
  2781. "sub $0x10,%2 \n"
  2782. "jg 1b \n"
  2783. "vzeroupper \n"
  2784. : "+r"(src), // %0
  2785. "+r"(dst), // %1
  2786. "+r"(width) // %2
  2787. :
  2788. : "memory", "cc"
  2789. , "xmm0", "xmm1", "xmm2"
  2790. );
  2791. }
  2792. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  2793. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  2794. // width in pixels
  2795. void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  2796. asm volatile (
  2797. LABELALIGN
  2798. "1: \n"
  2799. "movdqu " MEMACCESS(0) ", %%xmm0 \n"
  2800. "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
  2801. "lea " MEMLEA(0x20, 0) ", %0 \n"
  2802. "psrld $0x18, %%xmm0 \n"
  2803. "psrld $0x18, %%xmm1 \n"
  2804. "packssdw %%xmm1, %%xmm0 \n"
  2805. "packuswb %%xmm0, %%xmm0 \n"
  2806. "movq %%xmm0," MEMACCESS(1) " \n"
  2807. "lea " MEMLEA(0x8, 1) ", %1 \n"
  2808. "sub $0x8, %2 \n"
  2809. "jg 1b \n"
  2810. : "+r"(src_argb), // %0
  2811. "+r"(dst_a), // %1
  2812. "+rm"(width) // %2
  2813. :
  2814. : "memory", "cc"
  2815. , "xmm0", "xmm1"
  2816. );
  2817. }
  2818. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  2819. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  2820. // width in pixels
  2821. void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  2822. asm volatile (
  2823. "pcmpeqb %%xmm0,%%xmm0 \n"
  2824. "pslld $0x18,%%xmm0 \n"
  2825. "pcmpeqb %%xmm1,%%xmm1 \n"
  2826. "psrld $0x8,%%xmm1 \n"
  2827. LABELALIGN
  2828. "1: \n"
  2829. "movq " MEMACCESS(0) ",%%xmm2 \n"
  2830. "lea " MEMLEA(0x8,0) ",%0 \n"
  2831. "punpcklbw %%xmm2,%%xmm2 \n"
  2832. "punpckhwd %%xmm2,%%xmm3 \n"
  2833. "punpcklwd %%xmm2,%%xmm2 \n"
  2834. "movdqu " MEMACCESS(1) ",%%xmm4 \n"
  2835. "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
  2836. "pand %%xmm0,%%xmm2 \n"
  2837. "pand %%xmm0,%%xmm3 \n"
  2838. "pand %%xmm1,%%xmm4 \n"
  2839. "pand %%xmm1,%%xmm5 \n"
  2840. "por %%xmm4,%%xmm2 \n"
  2841. "por %%xmm5,%%xmm3 \n"
  2842. "movdqu %%xmm2," MEMACCESS(1) " \n"
  2843. "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
  2844. "lea " MEMLEA(0x20,1) ",%1 \n"
  2845. "sub $0x8,%2 \n"
  2846. "jg 1b \n"
  2847. : "+r"(src), // %0
  2848. "+r"(dst), // %1
  2849. "+r"(width) // %2
  2850. :
  2851. : "memory", "cc"
  2852. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2853. );
  2854. }
  2855. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  2856. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  2857. // width in pixels
  2858. void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  2859. asm volatile (
  2860. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  2861. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  2862. LABELALIGN
  2863. "1: \n"
  2864. "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
  2865. "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
  2866. "lea " MEMLEA(0x10,0) ",%0 \n"
  2867. "vpslld $0x18,%%ymm1,%%ymm1 \n"
  2868. "vpslld $0x18,%%ymm2,%%ymm2 \n"
  2869. "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
  2870. "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
  2871. "vmovdqu %%ymm1," MEMACCESS(1) " \n"
  2872. "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
  2873. "lea " MEMLEA(0x40,1) ",%1 \n"
  2874. "sub $0x10,%2 \n"
  2875. "jg 1b \n"
  2876. "vzeroupper \n"
  2877. : "+r"(src), // %0
  2878. "+r"(dst), // %1
  2879. "+r"(width) // %2
  2880. :
  2881. : "memory", "cc"
  2882. , "xmm0", "xmm1", "xmm2"
  2883. );
  2884. }
  2885. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  2886. #ifdef HAS_SETROW_X86
  2887. void SetRow_X86(uint8* dst, uint8 v8, int width) {
  2888. size_t width_tmp = (size_t)(width >> 2);
  2889. const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
  2890. asm volatile (
  2891. "rep stosl " MEMSTORESTRING(eax,0) " \n"
  2892. : "+D"(dst), // %0
  2893. "+c"(width_tmp) // %1
  2894. : "a"(v32) // %2
  2895. : "memory", "cc");
  2896. }
  2897. void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  2898. size_t width_tmp = (size_t)(width);
  2899. asm volatile (
  2900. "rep stosb " MEMSTORESTRING(al,0) " \n"
  2901. : "+D"(dst), // %0
  2902. "+c"(width_tmp) // %1
  2903. : "a"(v8) // %2
  2904. : "memory", "cc");
  2905. }
  2906. void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  2907. size_t width_tmp = (size_t)(width);
  2908. asm volatile (
  2909. "rep stosl " MEMSTORESTRING(eax,0) " \n"
  2910. : "+D"(dst_argb), // %0
  2911. "+c"(width_tmp) // %1
  2912. : "a"(v32) // %2
  2913. : "memory", "cc");
  2914. }
  2915. #endif // HAS_SETROW_X86
  2916. #ifdef HAS_YUY2TOYROW_SSE2
  2917. void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  2918. asm volatile (
  2919. "pcmpeqb %%xmm5,%%xmm5 \n"
  2920. "psrlw $0x8,%%xmm5 \n"
  2921. LABELALIGN
  2922. "1: \n"
  2923. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2924. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2925. "lea " MEMLEA(0x20,0) ",%0 \n"
  2926. "pand %%xmm5,%%xmm0 \n"
  2927. "pand %%xmm5,%%xmm1 \n"
  2928. "packuswb %%xmm1,%%xmm0 \n"
  2929. "movdqu %%xmm0," MEMACCESS(1) " \n"
  2930. "lea " MEMLEA(0x10,1) ",%1 \n"
  2931. "sub $0x10,%2 \n"
  2932. "jg 1b \n"
  2933. : "+r"(src_yuy2), // %0
  2934. "+r"(dst_y), // %1
  2935. "+r"(width) // %2
  2936. :
  2937. : "memory", "cc"
  2938. , "xmm0", "xmm1", "xmm5"
  2939. );
  2940. }
  2941. void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
  2942. uint8* dst_u, uint8* dst_v, int width) {
  2943. asm volatile (
  2944. "pcmpeqb %%xmm5,%%xmm5 \n"
  2945. "psrlw $0x8,%%xmm5 \n"
  2946. "sub %1,%2 \n"
  2947. LABELALIGN
  2948. "1: \n"
  2949. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2950. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2951. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  2952. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  2953. "lea " MEMLEA(0x20,0) ",%0 \n"
  2954. "pavgb %%xmm2,%%xmm0 \n"
  2955. "pavgb %%xmm3,%%xmm1 \n"
  2956. "psrlw $0x8,%%xmm0 \n"
  2957. "psrlw $0x8,%%xmm1 \n"
  2958. "packuswb %%xmm1,%%xmm0 \n"
  2959. "movdqa %%xmm0,%%xmm1 \n"
  2960. "pand %%xmm5,%%xmm0 \n"
  2961. "packuswb %%xmm0,%%xmm0 \n"
  2962. "psrlw $0x8,%%xmm1 \n"
  2963. "packuswb %%xmm1,%%xmm1 \n"
  2964. "movq %%xmm0," MEMACCESS(1) " \n"
  2965. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  2966. "lea " MEMLEA(0x8,1) ",%1 \n"
  2967. "sub $0x10,%3 \n"
  2968. "jg 1b \n"
  2969. : "+r"(src_yuy2), // %0
  2970. "+r"(dst_u), // %1
  2971. "+r"(dst_v), // %2
  2972. "+r"(width) // %3
  2973. : "r"((intptr_t)(stride_yuy2)) // %4
  2974. : "memory", "cc", NACL_R14
  2975. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  2976. );
  2977. }
  2978. void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
  2979. uint8* dst_u, uint8* dst_v, int width) {
  2980. asm volatile (
  2981. "pcmpeqb %%xmm5,%%xmm5 \n"
  2982. "psrlw $0x8,%%xmm5 \n"
  2983. "sub %1,%2 \n"
  2984. LABELALIGN
  2985. "1: \n"
  2986. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  2987. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  2988. "lea " MEMLEA(0x20,0) ",%0 \n"
  2989. "psrlw $0x8,%%xmm0 \n"
  2990. "psrlw $0x8,%%xmm1 \n"
  2991. "packuswb %%xmm1,%%xmm0 \n"
  2992. "movdqa %%xmm0,%%xmm1 \n"
  2993. "pand %%xmm5,%%xmm0 \n"
  2994. "packuswb %%xmm0,%%xmm0 \n"
  2995. "psrlw $0x8,%%xmm1 \n"
  2996. "packuswb %%xmm1,%%xmm1 \n"
  2997. "movq %%xmm0," MEMACCESS(1) " \n"
  2998. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  2999. "lea " MEMLEA(0x8,1) ",%1 \n"
  3000. "sub $0x10,%3 \n"
  3001. "jg 1b \n"
  3002. : "+r"(src_yuy2), // %0
  3003. "+r"(dst_u), // %1
  3004. "+r"(dst_v), // %2
  3005. "+r"(width) // %3
  3006. :
  3007. : "memory", "cc", NACL_R14
  3008. "xmm0", "xmm1", "xmm5"
  3009. );
  3010. }
  3011. void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3012. asm volatile (
  3013. LABELALIGN
  3014. "1: \n"
  3015. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3016. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3017. "lea " MEMLEA(0x20,0) ",%0 \n"
  3018. "psrlw $0x8,%%xmm0 \n"
  3019. "psrlw $0x8,%%xmm1 \n"
  3020. "packuswb %%xmm1,%%xmm0 \n"
  3021. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3022. "lea " MEMLEA(0x10,1) ",%1 \n"
  3023. "sub $0x10,%2 \n"
  3024. "jg 1b \n"
  3025. : "+r"(src_uyvy), // %0
  3026. "+r"(dst_y), // %1
  3027. "+r"(width) // %2
  3028. :
  3029. : "memory", "cc"
  3030. , "xmm0", "xmm1"
  3031. );
  3032. }
  3033. void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
  3034. uint8* dst_u, uint8* dst_v, int width) {
  3035. asm volatile (
  3036. "pcmpeqb %%xmm5,%%xmm5 \n"
  3037. "psrlw $0x8,%%xmm5 \n"
  3038. "sub %1,%2 \n"
  3039. LABELALIGN
  3040. "1: \n"
  3041. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3042. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3043. MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
  3044. MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
  3045. "lea " MEMLEA(0x20,0) ",%0 \n"
  3046. "pavgb %%xmm2,%%xmm0 \n"
  3047. "pavgb %%xmm3,%%xmm1 \n"
  3048. "pand %%xmm5,%%xmm0 \n"
  3049. "pand %%xmm5,%%xmm1 \n"
  3050. "packuswb %%xmm1,%%xmm0 \n"
  3051. "movdqa %%xmm0,%%xmm1 \n"
  3052. "pand %%xmm5,%%xmm0 \n"
  3053. "packuswb %%xmm0,%%xmm0 \n"
  3054. "psrlw $0x8,%%xmm1 \n"
  3055. "packuswb %%xmm1,%%xmm1 \n"
  3056. "movq %%xmm0," MEMACCESS(1) " \n"
  3057. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3058. "lea " MEMLEA(0x8,1) ",%1 \n"
  3059. "sub $0x10,%3 \n"
  3060. "jg 1b \n"
  3061. : "+r"(src_uyvy), // %0
  3062. "+r"(dst_u), // %1
  3063. "+r"(dst_v), // %2
  3064. "+r"(width) // %3
  3065. : "r"((intptr_t)(stride_uyvy)) // %4
  3066. : "memory", "cc", NACL_R14
  3067. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3068. );
  3069. }
  3070. void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
  3071. uint8* dst_u, uint8* dst_v, int width) {
  3072. asm volatile (
  3073. "pcmpeqb %%xmm5,%%xmm5 \n"
  3074. "psrlw $0x8,%%xmm5 \n"
  3075. "sub %1,%2 \n"
  3076. LABELALIGN
  3077. "1: \n"
  3078. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3079. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3080. "lea " MEMLEA(0x20,0) ",%0 \n"
  3081. "pand %%xmm5,%%xmm0 \n"
  3082. "pand %%xmm5,%%xmm1 \n"
  3083. "packuswb %%xmm1,%%xmm0 \n"
  3084. "movdqa %%xmm0,%%xmm1 \n"
  3085. "pand %%xmm5,%%xmm0 \n"
  3086. "packuswb %%xmm0,%%xmm0 \n"
  3087. "psrlw $0x8,%%xmm1 \n"
  3088. "packuswb %%xmm1,%%xmm1 \n"
  3089. "movq %%xmm0," MEMACCESS(1) " \n"
  3090. MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
  3091. "lea " MEMLEA(0x8,1) ",%1 \n"
  3092. "sub $0x10,%3 \n"
  3093. "jg 1b \n"
  3094. : "+r"(src_uyvy), // %0
  3095. "+r"(dst_u), // %1
  3096. "+r"(dst_v), // %2
  3097. "+r"(width) // %3
  3098. :
  3099. : "memory", "cc", NACL_R14
  3100. "xmm0", "xmm1", "xmm5"
  3101. );
  3102. }
  3103. #endif // HAS_YUY2TOYROW_SSE2
  3104. #ifdef HAS_YUY2TOYROW_AVX2
  3105. void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  3106. asm volatile (
  3107. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3108. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3109. LABELALIGN
  3110. "1: \n"
  3111. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3112. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3113. "lea " MEMLEA(0x40,0) ",%0 \n"
  3114. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3115. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3116. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3117. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3118. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3119. "lea " MEMLEA(0x20,1) ",%1 \n"
  3120. "sub $0x20,%2 \n"
  3121. "jg 1b \n"
  3122. "vzeroupper \n"
  3123. : "+r"(src_yuy2), // %0
  3124. "+r"(dst_y), // %1
  3125. "+r"(width) // %2
  3126. :
  3127. : "memory", "cc"
  3128. , "xmm0", "xmm1", "xmm5"
  3129. );
  3130. }
  3131. void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
  3132. uint8* dst_u, uint8* dst_v, int width) {
  3133. asm volatile (
  3134. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3135. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3136. "sub %1,%2 \n"
  3137. LABELALIGN
  3138. "1: \n"
  3139. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3140. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3141. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3142. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3143. "lea " MEMLEA(0x40,0) ",%0 \n"
  3144. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3145. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3146. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3147. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3148. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3149. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3150. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3151. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3152. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3153. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3154. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3155. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3156. "lea " MEMLEA(0x10,1) ",%1 \n"
  3157. "sub $0x20,%3 \n"
  3158. "jg 1b \n"
  3159. "vzeroupper \n"
  3160. : "+r"(src_yuy2), // %0
  3161. "+r"(dst_u), // %1
  3162. "+r"(dst_v), // %2
  3163. "+r"(width) // %3
  3164. : "r"((intptr_t)(stride_yuy2)) // %4
  3165. : "memory", "cc", NACL_R14
  3166. "xmm0", "xmm1", "xmm5"
  3167. );
  3168. }
  3169. void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
  3170. uint8* dst_u, uint8* dst_v, int width) {
  3171. asm volatile (
  3172. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3173. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3174. "sub %1,%2 \n"
  3175. LABELALIGN
  3176. "1: \n"
  3177. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3178. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3179. "lea " MEMLEA(0x40,0) ",%0 \n"
  3180. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3181. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3182. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3183. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3184. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3185. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3186. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3187. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3188. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3189. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3190. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3191. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3192. "lea " MEMLEA(0x10,1) ",%1 \n"
  3193. "sub $0x20,%3 \n"
  3194. "jg 1b \n"
  3195. "vzeroupper \n"
  3196. : "+r"(src_yuy2), // %0
  3197. "+r"(dst_u), // %1
  3198. "+r"(dst_v), // %2
  3199. "+r"(width) // %3
  3200. :
  3201. : "memory", "cc", NACL_R14
  3202. "xmm0", "xmm1", "xmm5"
  3203. );
  3204. }
  3205. void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  3206. asm volatile (
  3207. LABELALIGN
  3208. "1: \n"
  3209. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3210. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3211. "lea " MEMLEA(0x40,0) ",%0 \n"
  3212. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3213. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3214. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3215. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3216. "vmovdqu %%ymm0," MEMACCESS(1) " \n"
  3217. "lea " MEMLEA(0x20,1) ",%1 \n"
  3218. "sub $0x20,%2 \n"
  3219. "jg 1b \n"
  3220. "vzeroupper \n"
  3221. : "+r"(src_uyvy), // %0
  3222. "+r"(dst_y), // %1
  3223. "+r"(width) // %2
  3224. :
  3225. : "memory", "cc"
  3226. , "xmm0", "xmm1", "xmm5"
  3227. );
  3228. }
  3229. void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
  3230. uint8* dst_u, uint8* dst_v, int width) {
  3231. asm volatile (
  3232. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3233. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3234. "sub %1,%2 \n"
  3235. LABELALIGN
  3236. "1: \n"
  3237. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3238. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3239. VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
  3240. VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
  3241. "lea " MEMLEA(0x40,0) ",%0 \n"
  3242. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3243. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3244. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3245. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3246. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3247. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3248. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3249. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3250. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3251. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3252. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3253. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3254. "lea " MEMLEA(0x10,1) ",%1 \n"
  3255. "sub $0x20,%3 \n"
  3256. "jg 1b \n"
  3257. "vzeroupper \n"
  3258. : "+r"(src_uyvy), // %0
  3259. "+r"(dst_u), // %1
  3260. "+r"(dst_v), // %2
  3261. "+r"(width) // %3
  3262. : "r"((intptr_t)(stride_uyvy)) // %4
  3263. : "memory", "cc", NACL_R14
  3264. "xmm0", "xmm1", "xmm5"
  3265. );
  3266. }
  3267. void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
  3268. uint8* dst_u, uint8* dst_v, int width) {
  3269. asm volatile (
  3270. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3271. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3272. "sub %1,%2 \n"
  3273. LABELALIGN
  3274. "1: \n"
  3275. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  3276. "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
  3277. "lea " MEMLEA(0x40,0) ",%0 \n"
  3278. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3279. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3280. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3281. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3282. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  3283. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3284. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  3285. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  3286. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  3287. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3288. "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
  3289. VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
  3290. "lea " MEMLEA(0x10,1) ",%1 \n"
  3291. "sub $0x20,%3 \n"
  3292. "jg 1b \n"
  3293. "vzeroupper \n"
  3294. : "+r"(src_uyvy), // %0
  3295. "+r"(dst_u), // %1
  3296. "+r"(dst_v), // %2
  3297. "+r"(width) // %3
  3298. :
  3299. : "memory", "cc", NACL_R14
  3300. "xmm0", "xmm1", "xmm5"
  3301. );
  3302. }
  3303. #endif // HAS_YUY2TOYROW_AVX2
  3304. #ifdef HAS_ARGBBLENDROW_SSSE3
  3305. // Shuffle table for isolating alpha.
  3306. static uvec8 kShuffleAlpha = {
  3307. 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3308. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
  3309. };
3310. // Blend 4 pixels at a time, with a 1 pixel tail loop.
  3311. void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  3312. uint8* dst_argb, int width) {
  3313. asm volatile (
  3314. "pcmpeqb %%xmm7,%%xmm7 \n"
  3315. "psrlw $0xf,%%xmm7 \n"
  3316. "pcmpeqb %%xmm6,%%xmm6 \n"
  3317. "psrlw $0x8,%%xmm6 \n"
  3318. "pcmpeqb %%xmm5,%%xmm5 \n"
  3319. "psllw $0x8,%%xmm5 \n"
  3320. "pcmpeqb %%xmm4,%%xmm4 \n"
  3321. "pslld $0x18,%%xmm4 \n"
  3322. "sub $0x4,%3 \n"
  3323. "jl 49f \n"
  3324. // 4 pixel loop.
  3325. LABELALIGN
  3326. "40: \n"
  3327. "movdqu " MEMACCESS(0) ",%%xmm3 \n"
  3328. "lea " MEMLEA(0x10,0) ",%0 \n"
  3329. "movdqa %%xmm3,%%xmm0 \n"
  3330. "pxor %%xmm4,%%xmm3 \n"
  3331. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  3332. "pshufb %4,%%xmm3 \n"
  3333. "pand %%xmm6,%%xmm2 \n"
  3334. "paddw %%xmm7,%%xmm3 \n"
  3335. "pmullw %%xmm3,%%xmm2 \n"
  3336. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  3337. "lea " MEMLEA(0x10,1) ",%1 \n"
  3338. "psrlw $0x8,%%xmm1 \n"
  3339. "por %%xmm4,%%xmm0 \n"
  3340. "pmullw %%xmm3,%%xmm1 \n"
  3341. "psrlw $0x8,%%xmm2 \n"
  3342. "paddusb %%xmm2,%%xmm0 \n"
  3343. "pand %%xmm5,%%xmm1 \n"
  3344. "paddusb %%xmm1,%%xmm0 \n"
  3345. "movdqu %%xmm0," MEMACCESS(2) " \n"
  3346. "lea " MEMLEA(0x10,2) ",%2 \n"
  3347. "sub $0x4,%3 \n"
  3348. "jge 40b \n"
  3349. "49: \n"
  3350. "add $0x3,%3 \n"
  3351. "jl 99f \n"
  3352. // 1 pixel loop.
  3353. "91: \n"
  3354. "movd " MEMACCESS(0) ",%%xmm3 \n"
  3355. "lea " MEMLEA(0x4,0) ",%0 \n"
  3356. "movdqa %%xmm3,%%xmm0 \n"
  3357. "pxor %%xmm4,%%xmm3 \n"
  3358. "movd " MEMACCESS(1) ",%%xmm2 \n"
  3359. "pshufb %4,%%xmm3 \n"
  3360. "pand %%xmm6,%%xmm2 \n"
  3361. "paddw %%xmm7,%%xmm3 \n"
  3362. "pmullw %%xmm3,%%xmm2 \n"
  3363. "movd " MEMACCESS(1) ",%%xmm1 \n"
  3364. "lea " MEMLEA(0x4,1) ",%1 \n"
  3365. "psrlw $0x8,%%xmm1 \n"
  3366. "por %%xmm4,%%xmm0 \n"
  3367. "pmullw %%xmm3,%%xmm1 \n"
  3368. "psrlw $0x8,%%xmm2 \n"
  3369. "paddusb %%xmm2,%%xmm0 \n"
  3370. "pand %%xmm5,%%xmm1 \n"
  3371. "paddusb %%xmm1,%%xmm0 \n"
  3372. "movd %%xmm0," MEMACCESS(2) " \n"
  3373. "lea " MEMLEA(0x4,2) ",%2 \n"
  3374. "sub $0x1,%3 \n"
  3375. "jge 91b \n"
  3376. "99: \n"
  3377. : "+r"(src_argb0), // %0
  3378. "+r"(src_argb1), // %1
  3379. "+r"(dst_argb), // %2
  3380. "+r"(width) // %3
  3381. : "m"(kShuffleAlpha) // %4
  3382. : "memory", "cc"
  3383. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3384. );
  3385. }
  3386. #endif // HAS_ARGBBLENDROW_SSSE3
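// A minimal scalar sketch (not part of libyuv, hypothetical name) of the blend
// computed by the 4 pixel loop above, assuming the uint8 typedef used
// throughout this file: src_argb0 is treated as premultiplied foreground, each
// color channel becomes fg + ((bg * (256 - fg_alpha)) >> 8) with saturation,
// and the destination alpha is forced to 255.
static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = src_argb0[x * 4 + 3];  // foreground alpha
    int c;
    for (c = 0; c < 3; ++c) {  // B, G, R
      const int v = src_argb0[x * 4 + c] +
                    ((src_argb1[x * 4 + c] * (256 - a)) >> 8);
      dst_argb[x * 4 + c] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = 255;  // result is opaque
  }
}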
  3387. #ifdef HAS_BLENDPLANEROW_SSSE3
  3388. // Blend 8 pixels at a time.
  3389. // unsigned version of math
  3390. // =((A2*C2)+(B2*(255-C2))+255)/256
  3391. // signed version of math
  3392. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3393. void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
  3394. const uint8* alpha, uint8* dst, int width) {
  3395. asm volatile (
  3396. "pcmpeqb %%xmm5,%%xmm5 \n"
  3397. "psllw $0x8,%%xmm5 \n"
  3398. "mov $0x80808080,%%eax \n"
  3399. "movd %%eax,%%xmm6 \n"
  3400. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  3401. "mov $0x807f807f,%%eax \n"
  3402. "movd %%eax,%%xmm7 \n"
  3403. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  3404. "sub %2,%0 \n"
  3405. "sub %2,%1 \n"
  3406. "sub %2,%3 \n"
  3407. // 8 pixel loop.
  3408. LABELALIGN
  3409. "1: \n"
  3410. "movq (%2),%%xmm0 \n"
  3411. "punpcklbw %%xmm0,%%xmm0 \n"
  3412. "pxor %%xmm5,%%xmm0 \n"
  3413. "movq (%0,%2,1),%%xmm1 \n"
  3414. "movq (%1,%2,1),%%xmm2 \n"
  3415. "punpcklbw %%xmm2,%%xmm1 \n"
  3416. "psubb %%xmm6,%%xmm1 \n"
  3417. "pmaddubsw %%xmm1,%%xmm0 \n"
  3418. "paddw %%xmm7,%%xmm0 \n"
  3419. "psrlw $0x8,%%xmm0 \n"
  3420. "packuswb %%xmm0,%%xmm0 \n"
  3421. "movq %%xmm0,(%3,%2,1) \n"
  3422. "lea 0x8(%2),%2 \n"
  3423. "sub $0x8,%4 \n"
  3424. "jg 1b \n"
  3425. : "+r"(src0), // %0
  3426. "+r"(src1), // %1
  3427. "+r"(alpha), // %2
  3428. "+r"(dst), // %3
  3429. "+rm"(width) // %4
  3430. :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
  3431. );
  3432. }
  3433. #endif // HAS_BLENDPLANEROW_SSSE3
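// A minimal scalar sketch (not part of libyuv, hypothetical name) of the
// unsigned blend formula quoted above, ((A*C) + (B*(255-C)) + 255) / 256,
// useful as a plain-C reference when checking the SIMD paths.
static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
                                 const uint8* alpha, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = alpha[x];
    dst[x] = (uint8)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}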
  3434. #ifdef HAS_BLENDPLANEROW_AVX2
  3435. // Blend 32 pixels at a time.
  3436. // unsigned version of math
  3437. // =((A2*C2)+(B2*(255-C2))+255)/256
  3438. // signed version of math
  3439. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3440. void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
  3441. const uint8* alpha, uint8* dst, int width) {
  3442. asm volatile (
  3443. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3444. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  3445. "mov $0x80808080,%%eax \n"
  3446. "vmovd %%eax,%%xmm6 \n"
  3447. "vbroadcastss %%xmm6,%%ymm6 \n"
  3448. "mov $0x807f807f,%%eax \n"
  3449. "vmovd %%eax,%%xmm7 \n"
  3450. "vbroadcastss %%xmm7,%%ymm7 \n"
  3451. "sub %2,%0 \n"
  3452. "sub %2,%1 \n"
  3453. "sub %2,%3 \n"
  3454. // 32 pixel loop.
  3455. LABELALIGN
  3456. "1: \n"
  3457. "vmovdqu (%2),%%ymm0 \n"
  3458. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  3459. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  3460. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  3461. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  3462. "vmovdqu (%0,%2,1),%%ymm1 \n"
  3463. "vmovdqu (%1,%2,1),%%ymm2 \n"
  3464. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  3465. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  3466. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  3467. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  3468. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  3469. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  3470. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  3471. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  3472. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  3473. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3474. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  3475. "vmovdqu %%ymm0,(%3,%2,1) \n"
  3476. "lea 0x20(%2),%2 \n"
  3477. "sub $0x20,%4 \n"
  3478. "jg 1b \n"
  3479. "vzeroupper \n"
  3480. : "+r"(src0), // %0
  3481. "+r"(src1), // %1
  3482. "+r"(alpha), // %2
  3483. "+r"(dst), // %3
  3484. "+rm"(width) // %4
  3485. :: "memory", "cc", "eax",
  3486. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3487. );
  3488. }
  3489. #endif // HAS_BLENDPLANEROW_AVX2
  3490. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3491. // Shuffle table duplicating alpha
  3492. static uvec8 kShuffleAlpha0 = {
  3493. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
  3494. };
  3495. static uvec8 kShuffleAlpha1 = {
  3496. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3497. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
  3498. };
  3499. // Attenuate 4 pixels at a time.
  3500. void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3501. asm volatile (
  3502. "pcmpeqb %%xmm3,%%xmm3 \n"
  3503. "pslld $0x18,%%xmm3 \n"
  3504. "movdqa %3,%%xmm4 \n"
  3505. "movdqa %4,%%xmm5 \n"
  3506. // 4 pixel loop.
  3507. LABELALIGN
  3508. "1: \n"
  3509. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3510. "pshufb %%xmm4,%%xmm0 \n"
  3511. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3512. "punpcklbw %%xmm1,%%xmm1 \n"
  3513. "pmulhuw %%xmm1,%%xmm0 \n"
  3514. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3515. "pshufb %%xmm5,%%xmm1 \n"
  3516. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3517. "punpckhbw %%xmm2,%%xmm2 \n"
  3518. "pmulhuw %%xmm2,%%xmm1 \n"
  3519. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3520. "lea " MEMLEA(0x10,0) ",%0 \n"
  3521. "pand %%xmm3,%%xmm2 \n"
  3522. "psrlw $0x8,%%xmm0 \n"
  3523. "psrlw $0x8,%%xmm1 \n"
  3524. "packuswb %%xmm1,%%xmm0 \n"
  3525. "por %%xmm2,%%xmm0 \n"
  3526. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3527. "lea " MEMLEA(0x10,1) ",%1 \n"
  3528. "sub $0x4,%2 \n"
  3529. "jg 1b \n"
  3530. : "+r"(src_argb), // %0
  3531. "+r"(dst_argb), // %1
  3532. "+r"(width) // %2
  3533. : "m"(kShuffleAlpha0), // %3
  3534. "m"(kShuffleAlpha1) // %4
  3535. : "memory", "cc"
  3536. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3537. );
  3538. }
  3539. #endif // HAS_ARGBATTENUATEROW_SSSE3
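// A minimal scalar sketch (not part of libyuv, hypothetical name) of the
// attenuation done above: each color channel is scaled by its pixel's alpha,
// roughly c * a / 255. The (c * a + 255) >> 8 form below is one close
// approximation; the SIMD code reaches a similar result with pmulhuw on
// byte-duplicated words.
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = src_argb[x * 4 + 3];
    dst_argb[x * 4 + 0] = (uint8)((src_argb[x * 4 + 0] * a + 255) >> 8);
    dst_argb[x * 4 + 1] = (uint8)((src_argb[x * 4 + 1] * a + 255) >> 8);
    dst_argb[x * 4 + 2] = (uint8)((src_argb[x * 4 + 2] * a + 255) >> 8);
    dst_argb[x * 4 + 3] = (uint8)a;
  }
}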
  3540. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3541. // Shuffle table duplicating alpha.
  3542. static const uvec8 kShuffleAlpha_AVX2 = {
  3543. 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
  3544. };
  3545. // Attenuate 8 pixels at a time.
  3546. void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  3547. asm volatile (
  3548. "vbroadcastf128 %3,%%ymm4 \n"
  3549. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3550. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  3551. "sub %0,%1 \n"
  3552. // 8 pixel loop.
  3553. LABELALIGN
  3554. "1: \n"
  3555. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3556. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3557. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3558. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  3559. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  3560. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3561. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3562. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  3563. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  3564. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  3565. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3566. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  3567. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3568. "lea " MEMLEA(0x20,0) ",%0 \n"
  3569. "sub $0x8,%2 \n"
  3570. "jg 1b \n"
  3571. "vzeroupper \n"
  3572. : "+r"(src_argb), // %0
  3573. "+r"(dst_argb), // %1
  3574. "+r"(width) // %2
  3575. : "m"(kShuffleAlpha_AVX2) // %3
  3576. : "memory", "cc"
  3577. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  3578. );
  3579. }
  3580. #endif // HAS_ARGBATTENUATEROW_AVX2
  3581. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3582. // Unattenuate 4 pixels at a time.
  3583. void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
  3584. int width) {
  3585. uintptr_t alpha;
  3586. asm volatile (
  3587. // 4 pixel loop.
  3588. LABELALIGN
  3589. "1: \n"
  3590. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3591. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3592. "punpcklbw %%xmm0,%%xmm0 \n"
  3593. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3594. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3595. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3596. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3597. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3598. "movlhps %%xmm3,%%xmm2 \n"
  3599. "pmulhuw %%xmm2,%%xmm0 \n"
  3600. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3601. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3602. "punpckhbw %%xmm1,%%xmm1 \n"
  3603. MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
  3604. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3605. MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
  3606. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3607. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3608. "movlhps %%xmm3,%%xmm2 \n"
  3609. "pmulhuw %%xmm2,%%xmm1 \n"
  3610. "lea " MEMLEA(0x10,0) ",%0 \n"
  3611. "packuswb %%xmm1,%%xmm0 \n"
  3612. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3613. "lea " MEMLEA(0x10,1) ",%1 \n"
  3614. "sub $0x4,%2 \n"
  3615. "jg 1b \n"
  3616. : "+r"(src_argb), // %0
  3617. "+r"(dst_argb), // %1
  3618. "+r"(width), // %2
  3619. "=&r"(alpha) // %3
  3620. : "r"(fixed_invtbl8) // %4
  3621. : "memory", "cc", NACL_R14
  3622. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3623. );
  3624. }
  3625. #endif // HAS_ARGBUNATTENUATEROW_SSE2
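// A minimal scalar sketch (not part of libyuv, hypothetical name) of
// unattenuation: each color channel is divided by alpha and clamped,
// i.e. min(255, c * 255 / a). The SSE2 code above gets the same effect from
// the fixed_invtbl8 fixed-point reciprocal table instead of a divide.
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = src_argb[x * 4 + 3];
    int c;
    for (c = 0; c < 3; ++c) {
      int v = src_argb[x * 4 + c];
      if (a != 0) {
        v = v * 255 / a;
      }
      dst_argb[x * 4 + c] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8)a;
  }
}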
  3626. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  3627. // Shuffle table duplicating alpha.
  3628. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  3629. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
  3630. };
  3631. // Unattenuate 8 pixels at a time.
  3632. void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
  3633. int width) {
  3634. uintptr_t alpha;
  3635. asm volatile (
  3636. "sub %0,%1 \n"
  3637. "vbroadcastf128 %5,%%ymm5 \n"
  3638. // 8 pixel loop.
  3639. LABELALIGN
  3640. "1: \n"
  3641. // replace VPGATHER
  3642. "movzb " MEMACCESS2(0x03,0) ",%3 \n"
  3643. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3644. "movzb " MEMACCESS2(0x07,0) ",%3 \n"
  3645. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3646. "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
  3647. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  3648. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3649. "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
  3650. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3651. "movzb " MEMACCESS2(0x13,0) ",%3 \n"
  3652. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  3653. MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
  3654. "movzb " MEMACCESS2(0x17,0) ",%3 \n"
  3655. MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
  3656. "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
  3657. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  3658. MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
  3659. "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
  3660. MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
  3661. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  3662. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  3663. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  3664. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  3665. // end of VPGATHER
  3666. "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
  3667. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  3668. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  3669. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  3670. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  3671. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3672. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3673. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3674. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  3675. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3676. MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
  3677. "lea " MEMLEA(0x20,0) ",%0 \n"
  3678. "sub $0x8,%2 \n"
  3679. "jg 1b \n"
  3680. "vzeroupper \n"
  3681. : "+r"(src_argb), // %0
  3682. "+r"(dst_argb), // %1
  3683. "+r"(width), // %2
  3684. "=&r"(alpha) // %3
  3685. : "r"(fixed_invtbl8), // %4
  3686. "m"(kUnattenShuffleAlpha_AVX2) // %5
  3687. : "memory", "cc", NACL_R14
  3688. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3689. );
  3690. }
  3691. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  3692. #ifdef HAS_ARGBGRAYROW_SSSE3
3693. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
  3694. void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  3695. asm volatile (
  3696. "movdqa %3,%%xmm4 \n"
  3697. "movdqa %4,%%xmm5 \n"
  3698. // 8 pixel loop.
  3699. LABELALIGN
  3700. "1: \n"
  3701. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3702. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3703. "pmaddubsw %%xmm4,%%xmm0 \n"
  3704. "pmaddubsw %%xmm4,%%xmm1 \n"
  3705. "phaddw %%xmm1,%%xmm0 \n"
  3706. "paddw %%xmm5,%%xmm0 \n"
  3707. "psrlw $0x7,%%xmm0 \n"
  3708. "packuswb %%xmm0,%%xmm0 \n"
  3709. "movdqu " MEMACCESS(0) ",%%xmm2 \n"
  3710. "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
  3711. "lea " MEMLEA(0x20,0) ",%0 \n"
  3712. "psrld $0x18,%%xmm2 \n"
  3713. "psrld $0x18,%%xmm3 \n"
  3714. "packuswb %%xmm3,%%xmm2 \n"
  3715. "packuswb %%xmm2,%%xmm2 \n"
  3716. "movdqa %%xmm0,%%xmm3 \n"
  3717. "punpcklbw %%xmm0,%%xmm0 \n"
  3718. "punpcklbw %%xmm2,%%xmm3 \n"
  3719. "movdqa %%xmm0,%%xmm1 \n"
  3720. "punpcklwd %%xmm3,%%xmm0 \n"
  3721. "punpckhwd %%xmm3,%%xmm1 \n"
  3722. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3723. "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
  3724. "lea " MEMLEA(0x20,1) ",%1 \n"
  3725. "sub $0x8,%2 \n"
  3726. "jg 1b \n"
  3727. : "+r"(src_argb), // %0
  3728. "+r"(dst_argb), // %1
  3729. "+r"(width) // %2
  3730. : "m"(kARGBToYJ), // %3
  3731. "m"(kAddYJ64) // %4
  3732. : "memory", "cc"
  3733. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  3734. );
  3735. }
  3736. #endif // HAS_ARGBGRAYROW_SSSE3
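// A minimal scalar sketch (not part of libyuv, hypothetical name) of the gray
// conversion above, assuming BT.601-style full-range weights for the
// kARGBToYJ / kAddYJ64 constants referenced by the SSSE3 code
// (B:15, G:75, R:38, rounding constant 64, shift 7); the gray value is written
// to B, G and R and alpha is preserved.
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int b = src_argb[x * 4 + 0];
    const int g = src_argb[x * 4 + 1];
    const int r = src_argb[x * 4 + 2];
    const uint8 y = (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
    dst_argb[x * 4 + 0] = y;
    dst_argb[x * 4 + 1] = y;
    dst_argb[x * 4 + 2] = y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];
  }
}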
  3737. #ifdef HAS_ARGBSEPIAROW_SSSE3
  3738. // b = (r * 35 + g * 68 + b * 17) >> 7
  3739. // g = (r * 45 + g * 88 + b * 22) >> 7
  3740. // r = (r * 50 + g * 98 + b * 24) >> 7
  3741. // Constant for ARGB color to sepia tone
  3742. static vec8 kARGBToSepiaB = {
  3743. 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
  3744. };
  3745. static vec8 kARGBToSepiaG = {
  3746. 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
  3747. };
  3748. static vec8 kARGBToSepiaR = {
  3749. 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
  3750. };
  3751. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  3752. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  3753. asm volatile (
  3754. "movdqa %2,%%xmm2 \n"
  3755. "movdqa %3,%%xmm3 \n"
  3756. "movdqa %4,%%xmm4 \n"
  3757. // 8 pixel loop.
  3758. LABELALIGN
  3759. "1: \n"
  3760. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3761. "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
  3762. "pmaddubsw %%xmm2,%%xmm0 \n"
  3763. "pmaddubsw %%xmm2,%%xmm6 \n"
  3764. "phaddw %%xmm6,%%xmm0 \n"
  3765. "psrlw $0x7,%%xmm0 \n"
  3766. "packuswb %%xmm0,%%xmm0 \n"
  3767. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3768. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3769. "pmaddubsw %%xmm3,%%xmm5 \n"
  3770. "pmaddubsw %%xmm3,%%xmm1 \n"
  3771. "phaddw %%xmm1,%%xmm5 \n"
  3772. "psrlw $0x7,%%xmm5 \n"
  3773. "packuswb %%xmm5,%%xmm5 \n"
  3774. "punpcklbw %%xmm5,%%xmm0 \n"
  3775. "movdqu " MEMACCESS(0) ",%%xmm5 \n"
  3776. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3777. "pmaddubsw %%xmm4,%%xmm5 \n"
  3778. "pmaddubsw %%xmm4,%%xmm1 \n"
  3779. "phaddw %%xmm1,%%xmm5 \n"
  3780. "psrlw $0x7,%%xmm5 \n"
  3781. "packuswb %%xmm5,%%xmm5 \n"
  3782. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3783. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3784. "psrld $0x18,%%xmm6 \n"
  3785. "psrld $0x18,%%xmm1 \n"
  3786. "packuswb %%xmm1,%%xmm6 \n"
  3787. "packuswb %%xmm6,%%xmm6 \n"
  3788. "punpcklbw %%xmm6,%%xmm5 \n"
  3789. "movdqa %%xmm0,%%xmm1 \n"
  3790. "punpcklwd %%xmm5,%%xmm0 \n"
  3791. "punpckhwd %%xmm5,%%xmm1 \n"
  3792. "movdqu %%xmm0," MEMACCESS(0) " \n"
  3793. "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
  3794. "lea " MEMLEA(0x20,0) ",%0 \n"
  3795. "sub $0x8,%1 \n"
  3796. "jg 1b \n"
  3797. : "+r"(dst_argb), // %0
  3798. "+r"(width) // %1
  3799. : "m"(kARGBToSepiaB), // %2
  3800. "m"(kARGBToSepiaG), // %3
  3801. "m"(kARGBToSepiaR) // %4
  3802. : "memory", "cc"
  3803. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  3804. );
  3805. }
  3806. #endif // HAS_ARGBSEPIAROW_SSSE3
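// A minimal scalar sketch (not part of libyuv, hypothetical name) of the sepia
// math quoted above, operating in place like the SSSE3 row: each output
// channel is a weighted sum of the original B/G/R, shifted right by 7 and
// saturated to 255; alpha is left untouched.
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int b = dst_argb[x * 4 + 0];
    const int g = dst_argb[x * 4 + 1];
    const int r = dst_argb[x * 4 + 2];
    const int sb = (b * 17 + g * 68 + r * 35) >> 7;
    const int sg = (b * 22 + g * 88 + r * 45) >> 7;
    const int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
  }
}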
  3807. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3808. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  3809. // Same as Sepia except matrix is provided.
  3810. void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
  3811. const int8* matrix_argb, int width) {
  3812. asm volatile (
  3813. "movdqu " MEMACCESS(3) ",%%xmm5 \n"
  3814. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  3815. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  3816. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  3817. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  3818. // 8 pixel loop.
  3819. LABELALIGN
  3820. "1: \n"
  3821. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3822. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3823. "pmaddubsw %%xmm2,%%xmm0 \n"
  3824. "pmaddubsw %%xmm2,%%xmm7 \n"
  3825. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3826. "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
  3827. "pmaddubsw %%xmm3,%%xmm6 \n"
  3828. "pmaddubsw %%xmm3,%%xmm1 \n"
  3829. "phaddsw %%xmm7,%%xmm0 \n"
  3830. "phaddsw %%xmm1,%%xmm6 \n"
  3831. "psraw $0x6,%%xmm0 \n"
  3832. "psraw $0x6,%%xmm6 \n"
  3833. "packuswb %%xmm0,%%xmm0 \n"
  3834. "packuswb %%xmm6,%%xmm6 \n"
  3835. "punpcklbw %%xmm6,%%xmm0 \n"
  3836. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3837. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3838. "pmaddubsw %%xmm4,%%xmm1 \n"
  3839. "pmaddubsw %%xmm4,%%xmm7 \n"
  3840. "phaddsw %%xmm7,%%xmm1 \n"
  3841. "movdqu " MEMACCESS(0) ",%%xmm6 \n"
  3842. "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
  3843. "pmaddubsw %%xmm5,%%xmm6 \n"
  3844. "pmaddubsw %%xmm5,%%xmm7 \n"
  3845. "phaddsw %%xmm7,%%xmm6 \n"
  3846. "psraw $0x6,%%xmm1 \n"
  3847. "psraw $0x6,%%xmm6 \n"
  3848. "packuswb %%xmm1,%%xmm1 \n"
  3849. "packuswb %%xmm6,%%xmm6 \n"
  3850. "punpcklbw %%xmm6,%%xmm1 \n"
  3851. "movdqa %%xmm0,%%xmm6 \n"
  3852. "punpcklwd %%xmm1,%%xmm0 \n"
  3853. "punpckhwd %%xmm1,%%xmm6 \n"
  3854. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3855. "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
  3856. "lea " MEMLEA(0x20,0) ",%0 \n"
  3857. "lea " MEMLEA(0x20,1) ",%1 \n"
  3858. "sub $0x8,%2 \n"
  3859. "jg 1b \n"
  3860. : "+r"(src_argb), // %0
  3861. "+r"(dst_argb), // %1
  3862. "+r"(width) // %2
  3863. : "r"(matrix_argb) // %3
  3864. : "memory", "cc"
  3865. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3866. );
  3867. }
  3868. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
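// A minimal scalar sketch (not part of libyuv, hypothetical name) of the
// color-matrix transform above, as read from the SSSE3 code: matrix_argb holds
// four rows of signed 8-bit coefficients in B,G,R,A order, each output channel
// is the dot product of the source pixel with one row, arithmetically shifted
// right by 6 and clamped to [0, 255].
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {
      const int8* m = matrix_argb + i * 4;
      int v = (src_argb[x * 4 + 0] * m[0] + src_argb[x * 4 + 1] * m[1] +
               src_argb[x * 4 + 2] * m[2] + src_argb[x * 4 + 3] * m[3]) >> 6;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      dst_argb[x * 4 + i] = (uint8)v;
    }
  }
}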
  3869. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  3870. // Quantize 4 ARGB pixels (16 bytes).
  3871. void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
  3872. int interval_offset, int width) {
  3873. asm volatile (
  3874. "movd %2,%%xmm2 \n"
  3875. "movd %3,%%xmm3 \n"
  3876. "movd %4,%%xmm4 \n"
  3877. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  3878. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  3879. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  3880. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  3881. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  3882. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  3883. "pxor %%xmm5,%%xmm5 \n"
  3884. "pcmpeqb %%xmm6,%%xmm6 \n"
  3885. "pslld $0x18,%%xmm6 \n"
  3886. // 4 pixel loop.
  3887. LABELALIGN
  3888. "1: \n"
  3889. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3890. "punpcklbw %%xmm5,%%xmm0 \n"
  3891. "pmulhuw %%xmm2,%%xmm0 \n"
  3892. "movdqu " MEMACCESS(0) ",%%xmm1 \n"
  3893. "punpckhbw %%xmm5,%%xmm1 \n"
  3894. "pmulhuw %%xmm2,%%xmm1 \n"
  3895. "pmullw %%xmm3,%%xmm0 \n"
  3896. "movdqu " MEMACCESS(0) ",%%xmm7 \n"
  3897. "pmullw %%xmm3,%%xmm1 \n"
  3898. "pand %%xmm6,%%xmm7 \n"
  3899. "paddw %%xmm4,%%xmm0 \n"
  3900. "paddw %%xmm4,%%xmm1 \n"
  3901. "packuswb %%xmm1,%%xmm0 \n"
  3902. "por %%xmm7,%%xmm0 \n"
  3903. "movdqu %%xmm0," MEMACCESS(0) " \n"
  3904. "lea " MEMLEA(0x10,0) ",%0 \n"
  3905. "sub $0x4,%1 \n"
  3906. "jg 1b \n"
  3907. : "+r"(dst_argb), // %0
  3908. "+r"(width) // %1
  3909. : "r"(scale), // %2
  3910. "r"(interval_size), // %3
  3911. "r"(interval_offset) // %4
  3912. : "memory", "cc"
  3913. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  3914. );
  3915. }
  3916. #endif // HAS_ARGBQUANTIZEROW_SSE2
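// A minimal scalar sketch (not part of libyuv, hypothetical name) of the
// quantization above: scale is a fixed-point multiplier whose product keeps
// only the high 16 bits, so each color channel becomes
// ((c * scale) >> 16) * interval_size + interval_offset, while alpha passes
// through unchanged (the SSE2 code masks it back in via xmm6/xmm7).
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 3; ++c) {
      const int v = dst_argb[x * 4 + c];
      dst_argb[x * 4 + c] =
          (uint8)(((v * scale) >> 16) * interval_size + interval_offset);
    }
  }
}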
  3917. #ifdef HAS_ARGBSHADEROW_SSE2
  3918. // Shade 4 pixels at a time by specified value.
  3919. void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
  3920. uint32 value) {
  3921. asm volatile (
  3922. "movd %3,%%xmm2 \n"
  3923. "punpcklbw %%xmm2,%%xmm2 \n"
  3924. "punpcklqdq %%xmm2,%%xmm2 \n"
  3925. // 4 pixel loop.
  3926. LABELALIGN
  3927. "1: \n"
  3928. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3929. "lea " MEMLEA(0x10,0) ",%0 \n"
  3930. "movdqa %%xmm0,%%xmm1 \n"
  3931. "punpcklbw %%xmm0,%%xmm0 \n"
  3932. "punpckhbw %%xmm1,%%xmm1 \n"
  3933. "pmulhuw %%xmm2,%%xmm0 \n"
  3934. "pmulhuw %%xmm2,%%xmm1 \n"
  3935. "psrlw $0x8,%%xmm0 \n"
  3936. "psrlw $0x8,%%xmm1 \n"
  3937. "packuswb %%xmm1,%%xmm0 \n"
  3938. "movdqu %%xmm0," MEMACCESS(1) " \n"
  3939. "lea " MEMLEA(0x10,1) ",%1 \n"
  3940. "sub $0x4,%2 \n"
  3941. "jg 1b \n"
  3942. : "+r"(src_argb), // %0
  3943. "+r"(dst_argb), // %1
  3944. "+r"(width) // %2
  3945. : "r"(value) // %3
  3946. : "memory", "cc"
  3947. , "xmm0", "xmm1", "xmm2"
  3948. );
  3949. }
  3950. #endif // HAS_ARGBSHADEROW_SSE2
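// A minimal scalar sketch (not part of libyuv, hypothetical name) of the
// shading above: value is a packed ARGB multiplier and each channel is scaled
// by its matching byte, approximately (c * v) >> 8; the SSE2 code computes a
// very close result with pmulhuw on byte-duplicated words.
static void ARGBShadeRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                int width, uint32 value) {
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      const int v = (int)((value >> (8 * c)) & 0xff);
      dst_argb[x * 4 + c] = (uint8)((src_argb[x * 4 + c] * v) >> 8);
    }
  }
}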
  3951. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  3952. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  3953. void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  3954. uint8* dst_argb, int width) {
  3955. asm volatile (
  3956. "pxor %%xmm5,%%xmm5 \n"
  3957. // 4 pixel loop.
  3958. LABELALIGN
  3959. "1: \n"
  3960. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  3961. "lea " MEMLEA(0x10,0) ",%0 \n"
  3962. "movdqu " MEMACCESS(1) ",%%xmm2 \n"
  3963. "lea " MEMLEA(0x10,1) ",%1 \n"
  3964. "movdqu %%xmm0,%%xmm1 \n"
  3965. "movdqu %%xmm2,%%xmm3 \n"
  3966. "punpcklbw %%xmm0,%%xmm0 \n"
  3967. "punpckhbw %%xmm1,%%xmm1 \n"
  3968. "punpcklbw %%xmm5,%%xmm2 \n"
  3969. "punpckhbw %%xmm5,%%xmm3 \n"
  3970. "pmulhuw %%xmm2,%%xmm0 \n"
  3971. "pmulhuw %%xmm3,%%xmm1 \n"
  3972. "packuswb %%xmm1,%%xmm0 \n"
  3973. "movdqu %%xmm0," MEMACCESS(2) " \n"
  3974. "lea " MEMLEA(0x10,2) ",%2 \n"
  3975. "sub $0x4,%3 \n"
  3976. "jg 1b \n"
  3977. : "+r"(src_argb0), // %0
  3978. "+r"(src_argb1), // %1
  3979. "+r"(dst_argb), // %2
  3980. "+r"(width) // %3
  3981. :
  3982. : "memory", "cc"
  3983. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  3984. );
  3985. }
  3986. #endif // HAS_ARGBMULTIPLYROW_SSE2
  3987. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  3988. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  3989. void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  3990. uint8* dst_argb, int width) {
  3991. asm volatile (
  3992. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
3993. // 8 pixel loop.
  3994. LABELALIGN
  3995. "1: \n"
  3996. "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
  3997. "lea " MEMLEA(0x20,0) ",%0 \n"
  3998. "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
  3999. "lea " MEMLEA(0x20,1) ",%1 \n"
  4000. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  4001. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  4002. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  4003. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  4004. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4005. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4006. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4007. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4008. "lea " MEMLEA(0x20,2) ",%2 \n"
  4009. "sub $0x8,%3 \n"
  4010. "jg 1b \n"
  4011. "vzeroupper \n"
  4012. : "+r"(src_argb0), // %0
  4013. "+r"(src_argb1), // %1
  4014. "+r"(dst_argb), // %2
  4015. "+r"(width) // %3
  4016. :
  4017. : "memory", "cc"
  4018. #if defined(__AVX2__)
  4019. , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  4020. #endif
  4021. );
  4022. }
  4023. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4024. #ifdef HAS_ARGBADDROW_SSE2
  4025. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  4026. void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  4027. uint8* dst_argb, int width) {
  4028. asm volatile (
  4029. // 4 pixel loop.
  4030. LABELALIGN
  4031. "1: \n"
  4032. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4033. "lea " MEMLEA(0x10,0) ",%0 \n"
  4034. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4035. "lea " MEMLEA(0x10,1) ",%1 \n"
  4036. "paddusb %%xmm1,%%xmm0 \n"
  4037. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4038. "lea " MEMLEA(0x10,2) ",%2 \n"
  4039. "sub $0x4,%3 \n"
  4040. "jg 1b \n"
  4041. : "+r"(src_argb0), // %0
  4042. "+r"(src_argb1), // %1
  4043. "+r"(dst_argb), // %2
  4044. "+r"(width) // %3
  4045. :
  4046. : "memory", "cc"
  4047. , "xmm0", "xmm1"
  4048. );
  4049. }
  4050. #endif // HAS_ARGBADDROW_SSE2
  4051. #ifdef HAS_ARGBADDROW_AVX2
4052. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  4053. void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  4054. uint8* dst_argb, int width) {
  4055. asm volatile (
4056. // 8 pixel loop.
  4057. LABELALIGN
  4058. "1: \n"
  4059. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4060. "lea " MEMLEA(0x20,0) ",%0 \n"
  4061. "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4062. "lea " MEMLEA(0x20,1) ",%1 \n"
  4063. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4064. "lea " MEMLEA(0x20,2) ",%2 \n"
  4065. "sub $0x8,%3 \n"
  4066. "jg 1b \n"
  4067. "vzeroupper \n"
  4068. : "+r"(src_argb0), // %0
  4069. "+r"(src_argb1), // %1
  4070. "+r"(dst_argb), // %2
  4071. "+r"(width) // %3
  4072. :
  4073. : "memory", "cc"
  4074. , "xmm0"
  4075. );
  4076. }
  4077. #endif // HAS_ARGBADDROW_AVX2
  4078. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  4079. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  4080. void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  4081. uint8* dst_argb, int width) {
  4082. asm volatile (
  4083. // 4 pixel loop.
  4084. LABELALIGN
  4085. "1: \n"
  4086. "movdqu " MEMACCESS(0) ",%%xmm0 \n"
  4087. "lea " MEMLEA(0x10,0) ",%0 \n"
  4088. "movdqu " MEMACCESS(1) ",%%xmm1 \n"
  4089. "lea " MEMLEA(0x10,1) ",%1 \n"
  4090. "psubusb %%xmm1,%%xmm0 \n"
  4091. "movdqu %%xmm0," MEMACCESS(2) " \n"
  4092. "lea " MEMLEA(0x10,2) ",%2 \n"
  4093. "sub $0x4,%3 \n"
  4094. "jg 1b \n"
  4095. : "+r"(src_argb0), // %0
  4096. "+r"(src_argb1), // %1
  4097. "+r"(dst_argb), // %2
  4098. "+r"(width) // %3
  4099. :
  4100. : "memory", "cc"
  4101. , "xmm0", "xmm1"
  4102. );
  4103. }
  4104. #endif // HAS_ARGBSUBTRACTROW_SSE2
  4105. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  4106. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  4107. void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
  4108. uint8* dst_argb, int width) {
  4109. asm volatile (
  4110. // 4 pixel loop.
  4111. LABELALIGN
  4112. "1: \n"
  4113. "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
  4114. "lea " MEMLEA(0x20,0) ",%0 \n"
  4115. "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
  4116. "lea " MEMLEA(0x20,1) ",%1 \n"
  4117. "vmovdqu %%ymm0," MEMACCESS(2) " \n"
  4118. "lea " MEMLEA(0x20,2) ",%2 \n"
  4119. "sub $0x8,%3 \n"
  4120. "jg 1b \n"
  4121. "vzeroupper \n"
  4122. : "+r"(src_argb0), // %0
  4123. "+r"(src_argb1), // %1
  4124. "+r"(dst_argb), // %2
  4125. "+r"(width) // %3
  4126. :
  4127. : "memory", "cc"
  4128. , "xmm0"
  4129. );
  4130. }
  4131. #endif // HAS_ARGBSUBTRACTROW_AVX2
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(src_y2), // %2
      "+r"(dst_sobelx), // %3
      "+r"(width) // %4
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELXROW_SSE2
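
// A minimal scalar sketch of the SobelX computation above: for each of the
// three input rows, the pixel two columns to the right is subtracted, the
// three differences are weighted 1,2,1, and the absolute value is clamped to
// 255 (the pmaxsw-against-negation plus packuswb steps). SobelXRow_Sketch_C
// is an illustrative name only.
static void SobelXRow_Sketch_C(const uint8* src_y0, const uint8* src_y1,
    const uint8* src_y2, uint8* dst_sobelx, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;   // Absolute value.
    if (sobel > 255) sobel = 255;    // Saturate on repack.
    dst_sobelx[i] = (uint8)sobel;
  }
}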
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
    uint8* dst_sobely, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(dst_sobely), // %2
      "+r"(width) // %3
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELYROW_SSE2
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "movdqu %%xmm1," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELROW_SSE2
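
// A minimal scalar sketch of SobelRow_SSE2's packing step above: the
// saturated sum of SobelX and SobelY is replicated into B, G and R with
// A = 255. SobelRow_Sketch_C is an illustrative name only.
static void SobelRow_Sketch_C(const uint8* src_sobelx, const uint8* src_sobely,
    uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;          // paddusb saturation.
    dst_argb[0] = (uint8)s;        // B
    dst_argb[1] = (uint8)s;        // G
    dst_argb[2] = (uint8)s;        // R
    dst_argb[3] = 255u;            // A
    dst_argb += 4;
  }
}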
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    uint8* dst_y, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_y), // %2
      "+r"(width) // %3
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1"
  );
}
#endif // HAS_SOBELTOPLANEROW_SSE2
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
    uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm6," MEMACCESS(2) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_SOBELXYROW_SSE2
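
// A minimal scalar sketch of SobelXYRow_SSE2's channel mix, per the comment
// above: B = Sobel Y, G = saturated Sobel X + Y, R = Sobel X, A = 255.
// SobelXYRow_Sketch_C is an illustrative name only.
static void SobelXYRow_Sketch_C(const uint8* src_sobelx,
    const uint8* src_sobely, uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int r = src_sobelx[i];
    int b = src_sobely[i];
    int g = r + b;
    if (g > 255) g = 255;          // paddusb saturation.
    dst_argb[0] = (uint8)b;        // B = Sobel Y
    dst_argb[1] = (uint8)g;        // G = Sobel
    dst_argb[2] = (uint8)r;        // R = Sobel X
    dst_argb[3] = 255u;            // A
    dst_argb += 4;
  }
}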
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
    const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
    "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movd " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
    : "+r"(row), // %0
      "+r"(cumsum), // %1
      "+r"(previous_cumsum), // %2
      "+r"(width) // %3
    :
    : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
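
// A minimal scalar sketch of the integral-image row computed above: a running
// per-channel sum across the row is added to the corresponding entry of the
// previous row's cumulative sums. ComputeCumulativeSumRow_Sketch_C is an
// illustrative name only.
static void ComputeCumulativeSumRow_Sketch_C(const uint8* row, int32* cumsum,
    const int32* previous_cumsum, int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}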
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
    int width, int area, uint8* dst, int count) {
  asm volatile (
    "movd %5,%%xmm5 \n"
    "cvtdq2ps %%xmm5,%%xmm5 \n"
    "rcpss %%xmm5,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "cmpl $0x80,%5 \n"
    "ja 40f \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrld $0x10,%%xmm6 \n"
    "cvtdq2ps %%xmm6,%%xmm6 \n"
    "addps %%xmm6,%%xmm5 \n"
    "mulps %%xmm4,%%xmm5 \n"
    "cvtps2dq %%xmm5,%%xmm5 \n"
    "packssdw %%xmm5,%%xmm5 \n"
    // 4 pixel small loop.
    LABELALIGN
    "4: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 4b \n"
    "jmp 49f \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
    : "+r"(topleft), // %0
      "+r"(botleft), // %1
      "+r"(dst), // %2
      "+rm"(count) // %3
    : "r"((intptr_t)(width)), // %4
      "rm"(area) // %5
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
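
// A minimal scalar sketch of the box average computed above from two rows of
// cumulative sums: sum = botleft[width] + topleft[0] - botleft[0] -
// topleft[width], scaled by 1/area, where 'width' is interpreted here as the
// window width in int32 lanes (4 per ARGB pixel), matching the (reg,%4,4)
// addressing above. CumulativeSumToAverageRow_Sketch_C is an illustrative
// name only; the SSE2 code approximates 1/area with rcpss.
static void CumulativeSumToAverageRow_Sketch_C(const int32* topleft,
    const int32* botleft, int width, int area, uint8* dst, int count) {
  float ooa = 1.0f / area;
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      dst[c] = (uint8)((botleft[width + c] + topleft[c] -
                        botleft[c] - topleft[width + c]) * ooa);
    }
    dst += 4;
    topleft += 4;
    botleft += 4;
  }
}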
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from a source image, stepping along a slope
// (u,v start plus du,dv per pixel), to a row of the destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
    uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"
    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"
    "49: \n"
    "add $0x3,%4 \n"
    "jl 19f \n"
    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
    "19: \n"
    : "+r"(src_argb), // %0
      "+r"(src_argb_stride_temp), // %1
      "+r"(dst_argb), // %2
      "+r"(src_dudv), // %3
      "+rm"(width), // %4
      "=&r"(temp) // %5
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBAFFINEROW_SSE2
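
// A minimal scalar sketch of the affine sampling above, assuming the layout
// the assembly reads: src_dudv[0..1] is the starting (u,v) and src_dudv[2..3]
// is the per-pixel (du,dv) step; each destination pixel copies the 4-byte
// ARGB value at the truncated source coordinate. ARGBAffineRow_Sketch_C is an
// illustrative name only.
static void ARGBAffineRow_Sketch_C(const uint8* src_argb, int src_argb_stride,
    uint8* dst_argb, const float* src_dudv, int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq truncates toward zero.
    int y = (int)v;
    const uint8* s = src_argb + y * src_argb_stride + x * 4;
    dst_argb[0] = s[0];
    dst_argb[1] = s[1];
    dst_argb[2] = s[2];
    dst_argb[3] = s[3];
    dst_argb += 4;
    u += src_dudv[2];
    v += src_dudv[3];
  }
}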
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    ptrdiff_t src_stride, int dst_width, int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"
    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"
    "99: \n"
    : "+r"(dst_ptr), // %0
      "+r"(src_ptr), // %1
      "+rm"(dst_width), // %2
      "+r"(source_y_fraction) // %3
    : "r"((intptr_t)(src_stride)) // %4
    : "memory", "cc", "eax", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_SSSE3
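
// A minimal scalar sketch of the row blend above: each output byte is a
// weighted sum of the two source rows, with source_y_fraction (0..256)
// weighting the second row. The SIMD code special-cases fraction 0 (plain
// copy) and 128 (pavgb) and uses a signed bias so pmaddubsw can do the
// multiply, so its rounding may differ slightly from this sketch.
// InterpolateRow_Sketch_C is an illustrative name only.
static void InterpolateRow_Sketch_C(uint8* dst_ptr, const uint8* src_ptr,
    ptrdiff_t src_stride, int dst_width, int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1 = source_y_fraction;  // Weight of the second row.
  int y0 = 256 - y1;           // Weight of the first row.
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 8);
  }
}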
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    ptrdiff_t src_stride, int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"
    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"
    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"
    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"
    "99: \n"
    "vzeroupper \n"
    "999: \n"
    : "+D"(dst_ptr), // %0
      "+S"(src_ptr), // %1
      "+cm"(dst_width), // %2
      "+r"(source_y_fraction) // %3
    : "r"((intptr_t)(src_stride)) // %4
    : "memory", "cc", "eax", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_AVX2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    const uint8* shuffler, int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(shuffler) // %3
    : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
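
// A minimal scalar sketch of the channel shuffle above: the first four bytes
// of 'shuffler' say, for each output channel, which source channel of the
// same pixel to read (pshufb applies the same pattern to every pixel in the
// register). ARGBShuffleRow_Sketch_C is an illustrative name only.
static void ARGBShuffleRow_Sketch_C(const uint8* src_argb, uint8* dst_argb,
    const uint8* shuffler, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      // e.g. a shuffler starting {3,2,1,0,...} swaps B with A and G with R.
      dst_argb[c] = src_argb[shuffler[c] & 3];
    }
    src_argb += 4;
    dst_argb += 4;
  }
}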
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
    const uint8* shuffler, int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(shuffler) // %3
    : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
    const uint8* shuffler, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"
    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"
    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"
    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"
    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"
    "99: \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "=&d"(pixel_temp), // %2
      "+r"(width) // %3
    : "r"(shuffler) // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSE2
#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
    const uint8* src_u,
    const uint8* src_v,
    uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y), // %0
      "+r"(src_u), // %1
      "+r"(src_v), // %2
      "+r"(dst_frame), // %3
      "+rm"(width) // %4
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif // HAS_I422TOYUY2ROW_SSE2
#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
    const uint8* src_u,
    const uint8* src_v,
    uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y), // %0
      "+r"(src_u), // %1
      "+r"(src_v), // %2
      "+r"(dst_frame), // %3
      "+rm"(width) // %4
    :
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif // HAS_I422TOUYVYROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
    uint8* dst_argb, const float* poly,
    int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"
    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(poly) // %3
    : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
    uint8* dst_argb, const float* poly,
    int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
    : "+r"(src_argb), // %0
      "+r"(dst_argb), // %1
      "+r"(width) // %2
    : "r"(poly) // %3
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
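
// A minimal scalar sketch of the cubic polynomial applied above: 'poly' holds
// four groups of four floats (C0..C3, one coefficient per B, G, R, A channel)
// and each channel value v becomes C0 + C1*v + C2*v*v + C3*v*v*v, clamped to
// 0..255 on repack. ARGBPolynomialRow_Sketch_C is an illustrative name only.
static void ARGBPolynomialRow_Sketch_C(const uint8* src_argb, uint8* dst_argb,
    const float* poly, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      if (r < 0.f) r = 0.f;
      if (r > 255.f) r = 255.f;
      dst_argb[c] = (uint8)r;
    }
    src_argb += 4;
    dst_argb += 4;
  }
}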
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
    int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
    : "+r"(dst_argb), // %0
      "=&d"(pixel_temp), // %1
      "+r"(width) // %2
    : "r"(table_argb) // %3
    : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
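
// A minimal scalar sketch of the in-place table lookup above: each channel
// value indexes a 4-byte-strided table, with a separate column per channel
// (B, G, R, A). ARGBColorTableRow_Sketch_C is an illustrative name only.
static void ARGBColorTableRow_Sketch_C(uint8* dst_argb,
    const uint8* table_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    }
    dst_argb += 4;
  }
}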
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
    : "+r"(dst_argb), // %0
      "=&d"(pixel_temp), // %1
      "+r"(width) // %2
    : "r"(table_argb) // %3
    : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
    int width,
    const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"
    "movd %%xmm0,%k1 \n" // 32 bit offset
    "add %5,%1 \n"
    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
    : "=&d"(pixel_temp), // %0
      "=&a"(table_temp), // %1
      "+r"(src_argb), // %2
      "+r"(dst_argb), // %3
      "+rm"(width) // %4
    : "r"(luma), // %5
      "rm"(lumacoeff) // %6
    : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif