
/* stb_image_resize2 - v2.17 - public domain image resizing

   by Jeff Roberts (v2) and Jorge L Rodriguez
   http://github.com/nothings/stb

   Can be threaded with the extended API. SSE2, AVX, NEON and WASM SIMD support.
   Only scaling and translation are supported, no rotations or shears.

   COMPILING & LINKING
      In one C/C++ file that #includes this file, do this:
         #define STB_IMAGE_RESIZE_IMPLEMENTATION
      before the #include. That will create the implementation in that file.
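      For example, a minimal setup (the file name is illustrative):

         // my_resize.c - the one translation unit that owns the implementation
         #define STB_IMAGE_RESIZE_IMPLEMENTATION
         #include "stb_image_resize2.h"

         // every other file that uses the API just does:
         //    #include "stb_image_resize2.h"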
   EASY API CALLS:
      The easy API downsamples with a Mitchell filter, upsamples with cubic
      interpolation, and clamps to edge.

         stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
                                  output_pixels, output_w, output_h, output_stride_in_bytes,
                                  pixel_layout_enum )

         stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
                                    output_pixels, output_w, output_h, output_stride_in_bytes,
                                    pixel_layout_enum )

         stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
                                    output_pixels, output_w, output_h, output_stride_in_bytes,
                                    pixel_layout_enum )

      If you pass NULL or zero for output_pixels, we will allocate the output
      buffer for you and return it from the function (free with free() or
      STBIR_FREE). As a special case, an XX_stride_in_bytes of 0 means the
      pixels are packed contiguously in memory.
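      A sketch of the easy API (the dimensions and RGBA layout here are just
      example values):

         #include <stdlib.h>
         #include "stb_image_resize2.h"

         // Downsample an 8-bit sRGB RGBA image from 640x480 to 320x240.
         unsigned char * halve_rgba( const unsigned char * in_pixels )
         {
            unsigned char * out = stbir_resize_uint8_srgb(
               in_pixels, 640, 480, 0,   // input, w, h, stride (0 = packed)
               NULL,      320, 240, 0,   // output (NULL = allocate for us)
               STBIR_RGBA );
            return out;                  // caller frees with free()/STBIR_FREE
         }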
   API LEVELS
      There are three levels of API - easy-to-use, medium-complexity and
      extended-complexity. See the "header file" section of the source for
      API documentation.

   ADDITIONAL DOCUMENTATION

      MEMORY ALLOCATION
         By default, we use malloc and free for memory allocation. To override
         the memory allocation, before the implementation #include, add a:

            #define STBIR_MALLOC(size,user_data) ...
            #define STBIR_FREE(ptr,user_data)    ...

         Each resize makes exactly one call to malloc/free (unless you use the
         extended API, where you can do one allocation for many resizes). Under
         address sanitizer, we do separate allocations to find overreads and
         overwrites.
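         A sketch of routing allocations through your own functions (the
         function names are illustrative):

            #include <stdlib.h>

            static void * my_alloc( size_t size, void * user_data )
            {
               (void) user_data;   // context pointer, settable via the extended API
               return malloc( size );
            }
            static void my_free( void * ptr, void * user_data )
            {
               (void) user_data;
               free( ptr );
            }

            #define STBIR_MALLOC(size,user_data) my_alloc( (size), (user_data) )
            #define STBIR_FREE(ptr,user_data)    my_free( (ptr), (user_data) )
            #define STB_IMAGE_RESIZE_IMPLEMENTATION
            #include "stb_image_resize2.h"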
      PERFORMANCE
         This library was written with an emphasis on performance. When testing
         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other
         resize libs do by default). Also make sure SIMD is turned on (the
         default for 64-bit targets). Avoid the WRAP edge mode if you want the
         fastest speed.

         This library also comes with profiling built in. If you define
         STBIR_PROFILE, you can use the advanced API and get low-level
         profiling information by calling stbir_resize_extended_profile_info()
         or stbir_resize_split_profile_info() after a resize.
      SIMD
         Most of the routines have optimized SSE2, AVX, NEON and WASM versions.

         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64
         and ARM; for 32-bit x86 and ARM, you select SIMD mode by defining
         STBIR_SSE2 or STBIR_NEON. For AVX and AVX2, we auto-select them by
         detecting the /arch:AVX or /arch:AVX2 switches. You can also always
         manually turn SSE2, AVX or AVX2 support on by defining STBIR_SSE2,
         STBIR_AVX or STBIR_AVX2.

         On Linux, SSE2 and NEON are on by default for 64-bit x64 or ARM64. For
         32-bit, we select x86 SIMD mode by whether you have -msse2, -mavx or
         -mavx2 enabled on the command line. For 32-bit ARM, you must pass
         -mfpu=neon-vfpv4 for both clang and GCC, but GCC also requires an
         additional -mfp16-format=ieee to automatically enable NEON.

         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C
         instructions for converting back and forth to half-floats. This is
         auto-selected when we are using AVX2. Clang and GCC also require the
         -mf16c switch. ARM always uses the built-in half-float hardware NEON
         instructions.

         You can also tell us to use multiply-add instructions with
         STBIR_USE_FMA. Because x86 doesn't always have FMA, we turn it off by
         default to maintain determinism across all platforms. If you don't
         care about non-FMA determinism and are willing to restrict yourself
         to more recent x86 CPUs (around the AVX timeframe), then FMA will
         give you around a 15% speedup.

         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can
         turn off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2.
         AVX is 10% to 40% faster, and AVX2 is generally another 12%.
      ALPHA CHANNEL
         Most of the resizing functions provide the ability to control how the
         alpha channel of an image is processed.

         When alpha represents transparency, it is important that when
         combining colors with filtering, the pixels should not be treated
         equally; they should use a weighted average based on their alpha
         values. For example, if a pixel is 1% opaque bright green and another
         pixel is 99% opaque black and you average them, the average will be
         50% opaque, but the unweighted average will be a middling green color,
         while the weighted average will be nearly black. This means the
         unweighted version introduced green energy that didn't exist in the
         source image.

         (If you want to know why this makes sense, you can work out the math
         for the following: consider what happens if you alpha composite a
         source image over a fixed color and then average the output, vs. if
         you average the source image pixels and then composite that over the
         same fixed color. Only the weighted average produces the same result
         as the ground truth composite-then-average result.)

         Therefore, it is in general best to "alpha weight" the pixels when
         applying filters to them. This essentially means multiplying the
         colors by the alpha values before combining them, and then dividing
         by the alpha value at the end.
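         To make the 1%-green / 99%-black example concrete (values normalized
         to 0..1, arithmetic only):

            float ga = 1.0f, aa = 0.01f;   // pixel a: bright green, 1% opaque
            float gb = 0.0f, ab = 0.99f;   // pixel b: black, 99% opaque

            float alpha      = ( aa + ab ) / 2.0f;                // 0.5 either way
            float unweighted = ( ga + gb ) / 2.0f;                // 0.5  - middling green
            float weighted   = ( ga*aa + gb*ab ) / 2.0f / alpha;  // 0.01 - nearly black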
         The computer graphics industry introduced a technique called
         "premultiplied alpha" or "associated alpha", in which image colors
         are stored in image files already multiplied by their alpha. This
         saves some math when compositing, and also avoids the need to divide
         by the alpha at the end (which is quite inefficient). However, while
         premultiplied alpha is common in the movie CGI industry, it is not
         commonplace in other industries like videogames, and most consumer
         file formats are generally expected to contain non-premultiplied
         colors. For example, Photoshop saves PNG files "unpremultiplied", and
         web browsers like Chrome and Firefox expect PNG images to be
         unpremultiplied.

         Note that there are three possibilities that might describe your
         image and resize expectation:

            1. images are not premultiplied, alpha weighting is desired
            2. images are not premultiplied, alpha weighting is not desired
            3. images are premultiplied

         Both case #2 and case #3 require the exact same math: no alpha
         weighting should be applied or removed. Only case #1 requires extra
         math operations; the other two cases can be handled identically.

         stb_image_resize expects case #1 by default, applying alpha weighting
         to images and expecting the input images to be unpremultiplied. This
         is what the COLOR+ALPHA buffer types tell the resizer to do.

         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
         STBIR_ABGR, STBIR_RX, or STBIR_XR, you are telling us that the pixels
         are non-premultiplied. In these cases, the resizer will alpha weight
         the colors (effectively creating the premultiplied image), do the
         filtering, and then convert back to non-premultiplied on exit.

         When you use the pixel layouts STBIR_RGBA_PM, STBIR_BGRA_PM,
         STBIR_ARGB_PM, STBIR_ABGR_PM, STBIR_RX_PM or STBIR_XR_PM, you are
         telling us that the pixels ARE premultiplied. In this case, the
         resizer doesn't have to do the premultiplying - it can filter
         directly on the input. This is about twice as fast as the
         non-premultiplied case, so it's the right option if your data is
         already set up correctly.
         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you
         are telling us that there is no channel that represents transparency;
         it may be RGB plus some unrelated fourth channel that has been stored
         in the alpha channel, but it is actually not alpha. No special
         processing will be performed.

         The difference between the generic 4- or 2-channel layouts and the
         specialized _PM versions is that with the _PM versions you are
         telling us that the data *is* alpha, just not to premultiply it.
         That's important when using SRGB pixel formats: we need to know which
         channel is alpha, because it is converted linearly (rather than with
         the SRGB converters).

         Because alpha weighting produces the same effect as premultiplying,
         with non-premultiplied inputs you even have the option to let the
         resizer produce a premultiplied output. Because the initially
         computed alpha-weighted output image is effectively premultiplied,
         this is actually more performant than the normal path, which
         un-premultiplies the output image as a final step.

         Finally, when converting both in and out of non-premultiplied space
         (for example, when using STBIR_RGBA), we go to somewhat heroic
         measures to ensure that areas with zero-alpha pixels get something
         reasonable in the RGB values. If you don't care about the RGB values
         of zero-alpha pixels, you can call the
         stbir_set_non_pm_alpha_speed_over_quality() function - this runs a
         premultiplied resize about 25% faster. That said, when you really
         care about speed, using premultiplied pixels for both in and out
         (STBIR_RGBA_PM, etc) is much faster than either of these
         non-premultiplied options.
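         If your pixels are already premultiplied on both ends, only the
         layout enum changes relative to the easy-API sketch above (the
         dimensions are again illustrative):

            // premultiplied in, premultiplied out: no weighting passes needed
            out = stbir_resize_uint8_linear( in_pixels, 640, 480, 0,
                                             NULL,      320, 240, 0,
                                             STBIR_RGBA_PM );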
      PIXEL LAYOUT CONVERSION
         The resizer can convert from some pixel layouts to others. Using
         stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
         on input and STBIR_ARGB on output, and it will re-organize the
         channels during the resize. Currently, you can only convert between
         two pixel layouts with the same number of channels.
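         A sketch with the extended API (we believe the init/set/resize call
         sequence looks like this; check the header-file section for the exact
         signatures):

            STBIR_RESIZE resize;
            stbir_resize_init( &resize, in_pixels,  640, 480, 0,
                                        out_pixels, 320, 240, 0,
                                        STBIR_RGBA, STBIR_TYPE_UINT8 );
            stbir_set_pixel_layouts( &resize, STBIR_RGBA, STBIR_ARGB );  // swizzle on output
            stbir_resize_extended( &resize );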
      DETERMINISM
         We commit to being deterministic (from x64 to ARM to scalar to SIMD,
         etc). This requires compiling with fast-math off (using at least
         /fp:precise). Also, you must turn off fp-contraction (which turns
         mult+adds into FMAs)! We attempt to do this with pragmas, but with
         Clang, you usually want to add -ffp-contract=off to the command line
         as well.

         For 32-bit x86, you must use SSE and SSE2 codegen for determinism.
         That is, if the scalar x87 unit gets used at all, we immediately lose
         determinism. On Microsoft Visual Studio 2008 and earlier, from what
         we can tell there is no way to be deterministic in 32-bit x86 (some
         x87 always leaks in, even with fp:strict). On 32-bit x86 GCC,
         determinism requires both -msse2 and -mfpmath=sse.

         Note that we will not be deterministic with float data containing
         NaNs - the NaNs will propagate differently on different SIMD units
         and platforms.

         If you turn on STBIR_USE_FMA, then we will be deterministic with
         other FMA targets, but we will differ from non-FMA targets (this is
         unavoidable, because an FMA isn't simply an add with a mult - it also
         introduces a rounding difference compared to non-FMA instruction
         sequences).
      FLOAT PIXEL FORMAT RANGE
         Any range of values can be used for the non-alpha float data that you
         pass in (0 to 1, -1 to 1, whatever). However, if you are inputting
         float values but *outputting* bytes or shorts, you must use a range
         of 0 to 1 so that we scale back properly. The alpha channel must also
         be 0 to 1 for any format that does premultiplication prior to
         resizing.

         Note also that, with float output and filters that have negative
         lobes, the filtered output values might go slightly out of range. You
         can define STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to
         specify the range to clamp to on output, if that's important.
      MAX/MIN SCALE FACTORS
         The input pixel resolutions are integers, and we do the internal
         pointer math in size_t sized integers. However, the scale ratio from
         input resolution to output resolution is calculated in float form.
         This means the effective possible scale ratio is limited to 24 bits
         (or 16 million to 1). As you get close to the limit of float
         resolution (again, 16 million pixels wide or high), you might start
         seeing float inaccuracy issues in the pipeline in general. If you
         have to do extreme resizes, you can usually do them in multiple
         stages (using float intermediate buffers).
      FLIPPED IMAGES
         Stride is just the delta from one scanline to the next. This means
         you can use a negative stride to handle inverted images: point to the
         final scanline and step backwards. You can invert the input, the
         output, or both, using negative strides.
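         A sketch of feeding a bottom-up image (the row size assumes packed
         8-bit RGBA):

            int row_bytes = width * 4;
            const unsigned char * last_row =
               pixels + (size_t)( height - 1 ) * row_bytes;

            // start at the final scanline and walk upward one row at a time
            stbir_resize_uint8_linear( last_row, width, height, -row_bytes,
                                       NULL, out_w, out_h, 0, STBIR_RGBA );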
      DEFAULT FILTERS
         For functions which don't provide explicit control over what filters
         to use, you can change the compile-time defaults with:

            #define STBIR_DEFAULT_FILTER_UPSAMPLE   STBIR_FILTER_something
            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_something

         See stbir_filter in the header-file section for the list of filters.
      NEW FILTERS
         A number of 1D filter kernels are supplied. For a list of supported
         filters, see the stbir_filter enum. You can install your own filters
         by using the stbir_set_filter_callbacks function.

      PROGRESS
         For interactive use with slow resize operations, you can use the
         scanline callbacks in the extended API. It would have to be a *very*
         large image resample to need progress though - we're very fast.
      CEIL and FLOOR
         In scalar mode, the only functions we use from math.h are ceilf and
         floorf, but if you have your own versions, you can define the
         STBIR_CEILF(v) and STBIR_FLOORF(v) macros and we'll use them instead.
         In SIMD, we just use our own versions.

      ASSERT
         Define STBIR_ASSERT(boolval) to override assert() and not use
         assert.h.
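         A sketch of both overrides together (my_assert_handler is
         hypothetical; the __builtin_* versions assume GCC/Clang):

            extern void my_assert_handler( const char * expr );

            #define STBIR_ASSERT(boolval) \
               ( (boolval) ? (void)0 : my_assert_handler( #boolval ) )
            #define STBIR_CEILF(v)  __builtin_ceilf( v )
            #define STBIR_FLOORF(v) __builtin_floorf( v )
            #define STB_IMAGE_RESIZE_IMPLEMENTATION
            #include "stb_image_resize2.h"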
PORTING FROM VERSION 1

The API has changed. You can continue to use the old version of
stb_image_resize.h, which is available in the "deprecated/" directory.

If you're using the old simple-to-use API, porting is straightforward.
(For more advanced APIs, read the documentation.)

  stbir_resize_uint8():
    - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`

  stbir_resize_float():
    - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`

  stbir_resize_uint8_srgb():
    - function name is unchanged
    - cast channel count to `stbir_pixel_layout`
    - above is sufficient unless your image has alpha and it's not RGBA/BGRA
    - in that case, follow the instructions below for stbir_resize_uint8_srgb_edgemode

  stbir_resize_uint8_srgb_edgemode():
    - switch to the "medium complexity" API
    - stbir_resize(), very similar API, but a few more parameters:
      - pixel_layout: cast channel count to `stbir_pixel_layout`
      - data_type: STBIR_TYPE_UINT8_SRGB
      - edge: unchanged (STBIR_EDGE_WRAP, etc.)
      - filter: STBIR_FILTER_DEFAULT
    - which channel is alpha is specified in stbir_pixel_layout, see enum for details
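
For example, a typical 1.0 call and its 2.x replacement (a sketch for a
4-channel, non-sRGB image):

   // 1.0:  stbir_resize_uint8( in, iw, ih, 0, out, ow, oh, 0, 4 );
   // 2.x:
   stbir_resize_uint8_linear( in, iw, ih, 0, out, ow, oh, 0,
                              (stbir_pixel_layout) 4 ); // 4 casts to STBIR_RGBA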
FUTURE TODOS

* For polyphase integral filters, we just memcpy the coeffs to dupe
  them, but we should indirect and use the same coeff memory.
* Add pixel layout conversions for sensible different channel counts
  (maybe 1->3/4, 3->4, 4->1, 3->1).
* For SIMD encode and decode scanline routines, do any pre-aligning
  for bad input/output buffer alignments and pitch?
* For very wide scanlines, should we do vertical strips to stay within
  L2 cache? Maybe do chunks of 1K pixels at a time. There would be
  some pixel reconversion, but probably dwarfed by things falling out
  of cache. Probably also something possible with alternating between
  scattering and gathering at high resize scales?
* Should we have a multiple-MIPs-at-the-same-time function (it could keep
  more memory in cache during multiple resizes)?
* Rewrite the coefficient generator to do many at once.
* AVX-512 vertical kernels - worried about downclocking here.
* Convert the reincludes to macros when we know they aren't changing.
* Experiment with pivoting the horizontal and always using the
  vertical filters (which are faster, but perhaps not enough to overcome
  the pivot cost and the extra memory touches). Need to buffer the whole
  image, so have to balance memory use.
* Most of our code is internally function pointers - should we compile
  all the SIMD stuff always and dynamically dispatch?
CONTRIBUTORS

Jeff Roberts: 2.0 implementation, optimizations, SIMD
Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer
Fabian Giesen: half float and srgb converters
Sean Barrett: API design, optimizations
Jorge L Rodriguez: Original 1.0 implementation
Aras Pranckevicius: bugfixes
Nathan Reed: warning fixes for 1.0
REVISIONS

2.17 (2025-10-25) fixed a silly format bug in the easy-to-use APIs.
2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative
                  strides), fix vertical filter kernel callback, fix threaded
                  gather buffer priming (and assert).
                  (thanks adipose, TainZerL, and Harrison Green)
2.15 (2025-07-17) fixed an assert in debug mode when using floats with input
                  callbacks, work around GCC warning when adding to null ptr
                  (thanks Johannes Spohr and Pyry Kovanen).
2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and
                  scatter with vertical first.
2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for
                  tiny-c, fixed some variables that should have been static,
                  fixed a bug when calculating temp memory with resizes that
                  exceed 2GB of temp memory (very large resizes).
2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE
2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode
                  with AVX-2, fix some weird scaling edge conditions with
                  point sample mode.
2.10 (2024-07-27) fix the GCC and mingw defines for loop unroll control,
                  fix MSVC 32-bit arm half float routines.
2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting
                  hardware half floats).
2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks
                  to Ryan Salsbury), fix for sub-rect resizes, use the
                  pragmas to control unrolling when they are available.
2.07 (2024-05-24) fix for slow final split during threaded conversions of very
                  wide scanlines when downsampling (caused by extra input
                  converting), fix for wide scanline resamples with many
                  splits (int overflow), fix GCC warning.
2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling
                  undersampling a single row on rare resize ratios (about 1%).
2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras),
                  fix for output callback (thanks Julien Koenen).
2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc.
                  2x-5x faster without simd, 4x-12x faster with simd,
                  in some cases 20x to 40x faster, esp. resizing large to very small.
0.96 (2019-03-04) fixed warnings
0.95 (2017-07-23) fixed warnings
0.94 (2017-03-18) fixed warnings
0.93 (2017-03-03) fixed bug with certain combinations of heights
0.92 (2017-01-02) fix integer overflow on large (>2GB) images
0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
0.90 (2014-09-17) first released version

LICENSE

See end of file for license information.
*/

#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes

#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H

#include <stddef.h>
#ifdef _MSC_VER
typedef unsigned char    stbir_uint8;
typedef unsigned short   stbir_uint16;
typedef unsigned int     stbir_uint32;
typedef unsigned __int64 stbir_uint64;
#else
#include <stdint.h>
typedef uint8_t  stbir_uint8;
typedef uint16_t stbir_uint16;
typedef uint32_t stbir_uint32;
typedef uint64_t stbir_uint64;
#endif

#ifndef STBIRDEF
#ifdef STB_IMAGE_RESIZE_STATIC
#define STBIRDEF static
#else
#ifdef __cplusplus
#define STBIRDEF extern "C"
#else
#define STBIRDEF extern
#endif
#endif
#endif
//////////////////////////////////////////////////////////////////////////////
//// start "header file" ///////////////////////////////////////////////////
//
// Easy-to-use API:
//
//     * stride is the offset between successive rows of image data
//       in memory, in bytes. specify 0 for packed continuously in memory
//     * colorspace is linear or sRGB as specified by function name
//     * Uses the default filters
//     * Uses edge mode clamped
//     * returned result is 1 for success or 0 in case of an error.

// stbir_pixel_layout specifies:
//   number of channels
//   order of channels
//   whether color is premultiplied by alpha
// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
typedef enum
{
  STBIR_1CHANNEL = 1,
  STBIR_2CHANNEL = 2,
  STBIR_RGB      = 3,  // 3-chan, with order specified (for channel flipping)
  STBIR_BGR      = 0,  // 3-chan, with order specified (for channel flipping)
  STBIR_4CHANNEL = 5,

  STBIR_RGBA = 4,      // alpha formats, where alpha is NOT premultiplied into color channels
  STBIR_BGRA = 6,
  STBIR_ARGB = 7,
  STBIR_ABGR = 8,
  STBIR_RA   = 9,
  STBIR_AR   = 10,

  STBIR_RGBA_PM = 11,  // alpha formats, where alpha is premultiplied into color channels
  STBIR_BGRA_PM = 12,
  STBIR_ARGB_PM = 13,
  STBIR_ABGR_PM = 14,
  STBIR_RA_PM   = 15,
  STBIR_AR_PM   = 16,

  STBIR_RGBA_NO_AW = 11,  // alpha formats, where NO alpha weighting is applied at all!
  STBIR_BGRA_NO_AW = 12,  //   these are just synonyms for the _PM flags (which also do
  STBIR_ARGB_NO_AW = 13,  //   no alpha weighting). These names just make it more clear
  STBIR_ABGR_NO_AW = 14,  //   for some folks.
  STBIR_RA_NO_AW   = 15,
  STBIR_AR_NO_AW   = 16,
} stbir_pixel_layout;
//===============================================================
//  Simple-complexity API
//
//    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
//--------------------------------

STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                                  unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                                  stbir_pixel_layout pixel_type );

STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                                    stbir_pixel_layout pixel_type );

STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                            float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                            stbir_pixel_layout pixel_type );
//===============================================================
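
// For example, a minimal sRGB resize, letting the library allocate the output
//   (a sketch; the input buffer and sizes are the caller's):
//
//     unsigned char * out = stbir_resize_uint8_srgb( in, 1920, 1080, 0,
//                                                    NULL, 960, 540, 0, STBIR_RGBA );
//     // with the default STBIR_MALLOC, release it with free( out );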
//===============================================================
// Medium-complexity API
//
// This extends the easy-to-use API as follows:
//
//     * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
//     * Edge wrap mode can be selected explicitly
//     * Filter can be selected explicitly
//--------------------------------

typedef enum
{
  STBIR_EDGE_CLAMP   = 0,
  STBIR_EDGE_REFLECT = 1,
  STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
  STBIR_EDGE_ZERO    = 3,
} stbir_edge;

typedef enum
{
  STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
  STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
  STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
  STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque
  STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
  STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netravali filter with B=1/3, C=1/3
  STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
  STBIR_FILTER_OTHER        = 7,  // User callback specified
} stbir_filter;

typedef enum
{
  STBIR_TYPE_UINT8            = 0,
  STBIR_TYPE_UINT8_SRGB       = 1,
  STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
  STBIR_TYPE_UINT16           = 3,
  STBIR_TYPE_FLOAT            = 4,
  STBIR_TYPE_HALF_FLOAT       = 5
} stbir_datatype;

// medium api
STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                    void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                              stbir_pixel_layout pixel_layout, stbir_datatype data_type,
                              stbir_edge edge, stbir_filter filter );
//===============================================================
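
// For example, a 16-bit resize with wrap edges and an explicit Mitchell filter
//   (a sketch; buffers are the caller's):
//
//     stbir_resize( in, 512, 512, 0, out, 128, 128, 0,
//                   STBIR_RGBA, STBIR_TYPE_UINT16,
//                   STBIR_EDGE_WRAP, STBIR_FILTER_MITCHELL );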
//===============================================================
// Extended-complexity API
//
// This API exposes all resize functionality.
//
//     * Separate filter types for each axis
//     * Separate edge modes for each axis
//     * Separate input and output data types
//     * Can specify regions with subpixel correctness
//     * Can specify alpha flags
//     * Can specify a memory callback
//     * Can specify a callback data type for pixel input and output
//     * Can be threaded for a single resize
//     * Can be used to resize many frames without recalculating the sampler info
//
//   Use this API as follows:
//     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
//     2) Call any of the stbir_set functions
//     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
//        with the same input and output dimensions (like resizing video frames)
//     4) Resample by calling stbir_resize_extended().
//     5) Call stbir_free_samplers() if you called stbir_build_samplers()
//--------------------------------

// Types:

// INPUT CALLBACK: this callback is used for input scanlines
typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );

// OUTPUT CALLBACK: this callback is used for output scanlines
typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );

// callbacks for user installed filters
typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
typedef float stbir__support_callback( float scale, void * user_data );

// internal structure with precomputed scaling
typedef struct stbir__info stbir__info;

typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
{
  void * user_data;
  void const * input_pixels;
  int input_w, input_h;
  double input_s0, input_t0, input_s1, input_t1;
  stbir_input_callback * input_cb;
  void * output_pixels;
  int output_w, output_h;
  int output_subx, output_suby, output_subw, output_subh;
  stbir_output_callback * output_cb;
  int input_stride_in_bytes;
  int output_stride_in_bytes;
  int splits;
  int fast_alpha;
  int needs_rebuild;
  int called_alloc;
  stbir_pixel_layout input_pixel_layout_public;
  stbir_pixel_layout output_pixel_layout_public;
  stbir_datatype input_data_type;
  stbir_datatype output_data_type;
  stbir_filter horizontal_filter, vertical_filter;
  stbir_edge horizontal_edge, vertical_edge;
  stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
  stbir__kernel_callback * vertical_filter_kernel;   stbir__support_callback * vertical_filter_support;
  stbir__info * samplers;
} STBIR_RESIZE;

// extended complexity api

// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
                                 const void *input_pixels,  int input_w,  int input_h,  int input_stride_in_bytes,  // stride can be zero
                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type );

//===============================================================
// You can update these parameters any time after resize_init and there is no cost
//--------------------------------

STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ); // no callbacks by default
STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                                         // pass back STBIR_RESIZE* by default
STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );

//===============================================================
//===============================================================
// If you call any of these functions, you will trigger a sampler rebuild!
//--------------------------------

STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ); // sets new buffer layouts
STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );                              // CLAMP by default
STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter );                        // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)

// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
//   that fills the zero alpha pixel's RGB values with something plausible. If you don't care about areas of
//   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
//   types of resizes.
STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
//===============================================================

//===============================================================
// You can call build_samplers to prebuild all the internal data we need to resample.
//   Then, if you call resize_extended many times with the same resize, you only pay the
//   cost once.
// If you do call build_samplers, you MUST call free_samplers eventually.
//--------------------------------

// This builds the samplers and does one allocation
STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );

// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
//===============================================================

// And this is the main function to perform the resize synchronously on one thread.
STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
//===============================================================
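
// For example, the whole extended flow for resampling many identically-sized
//   frames (a sketch; frame_in/frame_out are the caller's buffers):
//
//     STBIR_RESIZE resize;
//     stbir_resize_init( &resize, frame_in[0], 1920, 1080, 0,
//                        frame_out[0], 1280, 720, 0, STBIR_RGBA, STBIR_TYPE_UINT8 );
//     stbir_build_samplers( &resize );   // pay the sampler build cost once
//     for( f = 0 ; f < num_frames ; f++ )
//     {
//       stbir_set_buffer_ptrs( &resize, frame_in[f], 0, frame_out[f], 0 );
//       stbir_resize_extended( &resize );
//     }
//     stbir_free_samplers( &resize );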
//===============================================================
// Use these functions for multithreading.
//   1) You call stbir_build_samplers_with_splits first on the main thread
//   2) Then stbir_resize_extended_split on each thread
//   3) stbir_free_samplers when done on the main thread
//--------------------------------

// This will build samplers for threading.
//   You can pass in the number of threads you'd like to use (try_splits).
//   It returns the number of splits (threads) that you can call it with.
//   It might be less if the image resize can't be split up that many ways.
STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );

// This function does a split of the resizing (you call this function for each
// split, on multiple threads). A split is a piece of the output resize pixel space.
//
// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
//
// Usually, you will always call stbir_resize_extended_split with split_start as the thread_index
//   and "1" for the split_count.
// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
//   split_count each time to turn it into a 4 thread resize. (This is unusual.)
STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
//===============================================================
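
// For example, a one-split-per-thread setup (a sketch; spawn_thread and
//   wait_for_threads stand in for your own threading primitives):
//
//     int i, splits = stbir_build_samplers_with_splits( &resize, num_threads );
//     for( i = 0 ; i < splits ; i++ )
//       spawn_thread( worker, i );  // each worker calls:
//                                   //   stbir_resize_extended_split( &resize, i, 1 );
//     wait_for_threads();
//     stbir_free_samplers( &resize );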
//===============================================================
// Pixel Callbacks info:
//--------------------------------

//   The input callback is super flexible - it calls you with the input address
//   (based on the stride and base pointer), it gives you an optional_output
//   pointer that you can fill, or you can just return your own pointer into
//   your own data.
//
//   You can also do conversion from non-supported data types if necessary - in
//   this case, you ignore the input_ptr and just use the x and y parameters to
//   calculate your own input_ptr based on the size of each non-supported pixel.
//   (Something like the third example below.)
//
//   You can also install just an input or just an output callback by setting the
//   callback that you don't want to NULL (zero).
//
//   First example, progress: (getting a callback that you can monitor the progress):
//
//     void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
//     {
//        percentage_done = (float) y / input_height;
//        return input_ptr;  // use buffer from call
//     }
//
//   Next example, copying: (copy from some other buffer or stream):
//
//     void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
//     {
//        CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
//        return optional_output;  // return the optional buffer that we filled
//     }
//
//   Third example, input another buffer without copying: (zero-copy from other buffer):
//
//     void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
//     {
//        void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
//        return pixels;  // return pointer to your data without copying
//     }
//
//
//   The output callback is considerably simpler - it just calls you so that you can dump
//   out each scanline. You could even directly copy out to disk if you have a simple format
//   like TGA or BMP. You can also convert to other output types here if you want.
//
//   Simple example:
//
//     void my_output( void const * output_ptr, int num_pixels, int y, void * context )
//     {
//        percentage_done = (float) y / output_height;
//        fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
//     }
//===============================================================
//===============================================================
// optional built-in profiling API
//--------------------------------

#ifdef STBIR_PROFILE

typedef struct STBIR_PROFILE_INFO
{
  stbir_uint64 total_clocks;

  // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
  //   there are "resize_count" number of zones
  stbir_uint64 clocks[ 8 ];
  char const ** descriptions;

  // count of clocks and descriptions
  stbir_uint32 count;
} STBIR_PROFILE_INFO;

// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );

// use after calling stbir_resize_extended
STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );

// use after calling stbir_resize_extended_split
STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
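
// For example, dumping the per-zone clocks after a resize (a sketch, only
//   meaningful when compiled with STBIR_PROFILE):
//
//     STBIR_PROFILE_INFO info;
//     stbir_resize_extended( &resize );
//     stbir_resize_extended_profile_info( &info, &resize );
//     for( i = 0 ; i < (int) info.count ; i++ )
//       printf( "%s: %llu\n", info.descriptions[i], (unsigned long long) info.clocks[i] );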
//===============================================================

#endif
//// end header file /////////////////////////////////////////////////////
#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H

#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)

#ifndef STBIR_ASSERT
#include <assert.h>
#define STBIR_ASSERT(x) assert(x)
#endif

#ifndef STBIR_MALLOC
#include <stdlib.h>
#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
#define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
#endif

#ifdef _MSC_VER
#define stbir__inline __forceinline
#else
#define stbir__inline __inline__

// Clang address sanitizer
#if defined(__has_feature)
  #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
    #ifndef STBIR__SEPARATE_ALLOCATIONS
      #define STBIR__SEPARATE_ALLOCATIONS
    #endif
  #endif
#endif

#endif

// GCC and MSVC
#if defined(__SANITIZE_ADDRESS__)
  #ifndef STBIR__SEPARATE_ALLOCATIONS
    #define STBIR__SEPARATE_ALLOCATIONS
  #endif
#endif

// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
// Otherwise, this is a determinism disaster.
#ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
  #if defined(_MSC_VER) && !defined(__clang__)
    #if _MSC_VER > 1200
      #pragma fp_contract(off)
    #endif
  #elif defined(__GNUC__) && !defined(__clang__)
    #pragma GCC optimize("fp-contract=off")
  #else
    #pragma STDC FP_CONTRACT OFF
  #endif
#endif

#ifdef _MSC_VER
#define STBIR__UNUSED(v)  (void)(v)
#else
#define STBIR__UNUSED(v)  (void)sizeof(v)
#endif

#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))

#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
#endif

#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
#endif

#ifndef STBIR__HEADER_FILENAME
#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
#endif

// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
typedef enum
{
  STBIRI_1CHANNEL = 0,
  STBIRI_2CHANNEL = 1,
  STBIRI_RGB      = 2,
  STBIRI_BGR      = 3,
  STBIRI_4CHANNEL = 4,

  STBIRI_RGBA = 5,
  STBIRI_BGRA = 6,
  STBIRI_ARGB = 7,
  STBIRI_ABGR = 8,
  STBIRI_RA   = 9,
  STBIRI_AR   = 10,

  STBIRI_RGBA_PM = 11,
  STBIRI_BGRA_PM = 12,
  STBIRI_ARGB_PM = 13,
  STBIRI_ABGR_PM = 14,
  STBIRI_RA_PM   = 15,
  STBIRI_AR_PM   = 16,
} stbir_internal_pixel_layout;
// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
#define STBIR_BGR       bad_dont_use_in_implementation
#define STBIR_1CHANNEL  STBIR_BGR
#define STBIR_2CHANNEL  STBIR_BGR
#define STBIR_RGB       STBIR_BGR
#define STBIR_RGBA      STBIR_BGR
#define STBIR_4CHANNEL  STBIR_BGR
#define STBIR_BGRA      STBIR_BGR
#define STBIR_ARGB      STBIR_BGR
#define STBIR_ABGR      STBIR_BGR
#define STBIR_RA        STBIR_BGR
#define STBIR_AR        STBIR_BGR
#define STBIR_RGBA_PM   STBIR_BGR
#define STBIR_BGRA_PM   STBIR_BGR
#define STBIR_ARGB_PM   STBIR_BGR
#define STBIR_ABGR_PM   STBIR_BGR
#define STBIR_RA_PM     STBIR_BGR
#define STBIR_AR_PM     STBIR_BGR

// must match stbir_datatype
static unsigned char stbir__type_size[] = {
  1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
};
// When gathering, the contributors are which source pixels contribute.
// When scattering, the contributors are which destination pixels are contributed to.
typedef struct
{
  int n0; // First contributing pixel
  int n1; // Last contributing pixel
} stbir__contributors;

typedef struct
{
  int lowest;  // First sample index for whole filter
  int highest; // Last sample index for whole filter
  int widest;  // widest single set of samples for an output
} stbir__filter_extent_info;

typedef struct
{
  int n0; // First pixel of decode buffer to write to
  int n1; // Last pixel of decode that will be written to
  int pixel_offset_for_input; // Pixel offset into input_scanline
} stbir__span;

typedef struct stbir__scale_info
{
  int input_full_size;
  int output_sub_size;
  float scale;
  float inv_scale;
  float pixel_shift; // starting shift in output pixel space (in pixels)
  int scale_is_rational;
  stbir_uint32 scale_numerator, scale_denominator;
} stbir__scale_info;

typedef struct
{
  stbir__contributors * contributors;
  float * coefficients;
  stbir__contributors * gather_prescatter_contributors;
  float * gather_prescatter_coefficients;
  stbir__scale_info scale_info;
  float support;
  stbir_filter filter_enum;
  stbir__kernel_callback * filter_kernel;
  stbir__support_callback * filter_support;
  stbir_edge edge;
  int coefficient_width;
  int filter_pixel_width;
  int filter_pixel_margin;
  int num_contributors;
  int contributors_size;
  int coefficients_size;
  stbir__filter_extent_info extent_info;
  int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
  int gather_prescatter_num_contributors;
  int gather_prescatter_coefficient_width;
  int gather_prescatter_contributors_size;
  int gather_prescatter_coefficients_size;
} stbir__sampler;

typedef struct
{
  stbir__contributors conservative;
  int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling fall off
  stbir__span spans[2]; // can be two spans, if doing an input subrect with edge mode WRAP
} stbir__extents;
typedef struct
{
#ifdef STBIR_PROFILE
  union
  {
    struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
    stbir_uint64 array[8];
  } profile;
  stbir_uint64 * current_zone_excluded_ptr;
#endif
  float * decode_buffer;

  int ring_buffer_first_scanline;
  int ring_buffer_last_scanline;
  int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
  int start_output_y, end_output_y;
  int start_input_y, end_input_y; // used in scatter only

#ifdef STBIR__SEPARATE_ALLOCATIONS
  float ** ring_buffers; // one pointer for each ring buffer
#else
  float * ring_buffer;   // one big buffer that we index into
#endif

  float * vertical_buffer;

  char no_cache_straddle[64];
} stbir__per_split_info;

typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
                                                     stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
typedef void stbir__alpha_unweight_func( float * encode_buffer, int width_times_channels );
typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );

struct stbir__info
{
#ifdef STBIR_PROFILE
  union
  {
    struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
    stbir_uint64 array[7];
  } profile;
  stbir_uint64 * current_zone_excluded_ptr;
#endif
  stbir__sampler horizontal;
  stbir__sampler vertical;

  void const * input_data;
  void * output_data;

  int input_stride_bytes;
  int output_stride_bytes;
  int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
  int ring_buffer_num_entries;  // Total number of entries in the ring buffer.

  stbir_datatype input_type;
  stbir_datatype output_type;

  stbir_input_callback * in_pixels_cb;
  void * user_data;
  stbir_output_callback * out_pixels_cb;

  stbir__extents scanline_extents;

  void * alloced_mem;
  stbir__per_split_info * split_info; // by default 1, but there will be N of these allocated based on the thread init you did

  stbir__decode_pixels_func * decode_pixels;
  stbir__alpha_weight_func * alpha_weight;
  stbir__horizontal_gather_channels_func * horizontal_gather_channels;
  stbir__alpha_unweight_func * alpha_unweight;
  stbir__encode_pixels_func * encode_pixels;

  int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated
  int splits;                        // count of splits

  stbir_internal_pixel_layout input_pixel_layout_internal;
  stbir_internal_pixel_layout output_pixel_layout_internal;

  int input_color_and_type;
  int offset_x, offset_y; // offset within output_data

  int vertical_first;
  int channels;
  int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
  size_t alloced_total;
};

#define stbir__max_uint8_as_float             255.0f
#define stbir__max_uint16_as_float            65535.0f
#define stbir__max_uint8_as_float_inverted    3.9215689e-03f // (1.0f/255.0f)
#define stbir__max_uint16_as_float_inverted   1.5259022e-05f // (1.0f/65535.0f)
#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))

// min/max friendly
#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \
  if ( (x) < (xmin) ) (x) = (xmin);          \
  if ( (x) > (xmax) ) (x) = (xmax);          \
  break;                                     \
}
static stbir__inline int stbir__min(int a, int b)
{
  return a < b ? a : b;
}

static stbir__inline int stbir__max(int a, int b)
{
  return a > b ? a : b;
}

static float stbir__srgb_uchar_to_linear_float[256] = {
  0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
  0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
  0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
  0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
  0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
  0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
  0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
  0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
  0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
  0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
  0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
  0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
  0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
  0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
  0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
  0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
  0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
  0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
  0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
  0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
  0.982251f, 0.991102f, 1.0f
};

typedef union
{
  unsigned int u;
  float f;
} stbir__FP32;

// From https://gist.github.com/rygorous/2203834
static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
  0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
  0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
  0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
  0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
  0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
  0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
  0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
  0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
  0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
  0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
  0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
  0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
  0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
};
static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
{
  static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
  static const stbir__FP32 minval = { (127-13) << 23 };
  stbir_uint32 tab,bias,scale,t;
  stbir__FP32 f;

  // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
  // The tests are carefully written so that NaNs map to 0, same as in the reference
  // implementation.
  if (!(in > minval.f)) // written this way to catch NaNs
    return 0;
  if (in > almostone.f)
    return 255;

  // Do the table lookup and unpack bias, scale
  f.f = in;
  tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
  bias = (tab >> 16) << 9;
  scale = tab & 0xffff;

  // Grab next-highest mantissa bits and perform linear interpolation
  t = (f.u >> 12) & 0xff;
  return (unsigned char) ((bias + scale*t) >> 16);
}
#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
#endif

#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
#endif

#define STBIR_INPUT_CALLBACK_PADDING 3

#ifdef _M_IX86_FP
#if ( _M_IX86_FP >= 1 )
#ifndef STBIR_SSE
#define STBIR_SSE
#endif
#endif
#endif

#ifdef __TINYC__
// tiny c has no intrinsics yet - this can become a version check if they add them
#define STBIR_NO_SIMD
#endif

#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
  #ifndef STBIR_SSE2
    #define STBIR_SSE2
  #endif
  #if defined(__AVX__) || defined(STBIR_AVX2)
    #ifndef STBIR_AVX
      #ifndef STBIR_NO_AVX
        #define STBIR_AVX
      #endif
    #endif
  #endif
  #if defined(__AVX2__) || defined(STBIR_AVX2)
    #ifndef STBIR_NO_AVX2
      #ifndef STBIR_AVX2
        #define STBIR_AVX2
      #endif
      #if defined( _MSC_VER ) && !defined(__clang__)
        #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -mf16c
          #define STBIR_FP16C
        #endif
      #endif
    #endif
  #endif
  #ifdef __F16C__
    #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
      #define STBIR_FP16C
    #endif
  #endif
#endif

#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__)
#ifndef STBIR_NEON
#define STBIR_NEON
#endif
#endif

#if defined(_M_ARM) || defined(__arm__)
#ifdef STBIR_USE_FMA
#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
#endif
#endif

#if defined(__wasm__) && defined(__wasm_simd128__)
#ifndef STBIR_WASM
#define STBIR_WASM
#endif
#endif
// restrict pointers for the output pointers, other loop and unroll control
#if defined( _MSC_VER ) && !defined(__clang__)
  #define STBIR_STREAMOUT_PTR( star ) star __restrict
  #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
  #if _MSC_VER >= 1900
    #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector ))
  #else
    #define STBIR_NO_UNROLL_LOOP_START
  #endif
#elif defined( __clang__ )
  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
  #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) )
    #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)")
  #else
    #define STBIR_NO_UNROLL_LOOP_START
  #endif
#elif defined( __GNUC__ )
  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
  #if __GNUC__ >= 14
    #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector")
  #else
    #define STBIR_NO_UNROLL_LOOP_START
  #endif
  #define STBIR_NO_UNROLL_LOOP_START_INF_FOR
#else
  #define STBIR_STREAMOUT_PTR( star ) star
  #define STBIR_NO_UNROLL( ptr )
  #define STBIR_NO_UNROLL_LOOP_START
#endif

#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR
#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START
#endif
#ifdef STBIR_NO_SIMD // force simd off for whatever reason
// force simd off overrides everything else, so clear it all

#ifdef STBIR_SSE2
#undef STBIR_SSE2
#endif

#ifdef STBIR_AVX
#undef STBIR_AVX
#endif

#ifdef STBIR_NEON
#undef STBIR_NEON
#endif

#ifdef STBIR_AVX2
#undef STBIR_AVX2
#endif

#ifdef STBIR_FP16C
#undef STBIR_FP16C
#endif

#ifdef STBIR_WASM
#undef STBIR_WASM
#endif

#ifdef STBIR_SIMD
#undef STBIR_SIMD
#endif

#else // STBIR_SIMD
#ifdef STBIR_SSE2
#include <emmintrin.h>

#define stbir__simdf __m128
#define stbir__simdi __m128i

#define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
#define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)

#define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
#define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
#define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
#define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values must be zero
#define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
#define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
#define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
#define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))

#define stbir__simdf_zeroP() _mm_setzero_ps()
#define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()

#define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
#define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
#define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
#define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )

#define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
#define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
#define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )

#define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )

#define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
{ \
  stbir__simdi zero = _mm_setzero_si128(); \
  out2 = _mm_unpacklo_epi8( ireg, zero ); \
  out3 = _mm_unpackhi_epi8( ireg, zero ); \
  out0 = _mm_unpacklo_epi16( out2, zero ); \
  out1 = _mm_unpackhi_epi16( out2, zero ); \
  out2 = _mm_unpacklo_epi16( out3, zero ); \
  out3 = _mm_unpackhi_epi16( out3, zero ); \
}

#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
{ \
  stbir__simdi zero = _mm_setzero_si128(); \
  out = _mm_unpacklo_epi8( ireg, zero ); \
  out = _mm_unpacklo_epi16( out, zero ); \
}

#define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
{ \
  stbir__simdi zero = _mm_setzero_si128(); \
  out0 = _mm_unpacklo_epi16( ireg, zero ); \
  out1 = _mm_unpackhi_epi16( ireg, zero ); \
}

#define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
#define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
#define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
#define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))

#define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
#define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
#define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
#define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
#define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
#define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
#define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
#define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )

#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
#include <immintrin.h>
#define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
#define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
#define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
#define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
#else
#define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
#define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
#define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
#define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
#endif

#define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
#define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )

#define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
#define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )

#define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
#define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
#define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
#define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )

#define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
#define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )

static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };

#define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
#define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
#define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
#define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )

#define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )

#define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
#define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
#define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )

#define stbir__simdf_pack_to_8bytes(out,aa,bb) \
{ \
  stbir__simdf af,bf; \
  stbir__simdi a,b; \
  af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
  bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
  af = _mm_max_ps( af, _mm_setzero_ps() ); \
  bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
  a = _mm_cvttps_epi32( af ); \
  b = _mm_cvttps_epi32( bf ); \
  a = _mm_packs_epi32( a, b ); \
  out = _mm_packus_epi16( a, a ); \
}

#define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
  stbir__simdf_load( o0, (ptr) );    \
  stbir__simdf_load( o1, (ptr)+4 );  \
  stbir__simdf_load( o2, (ptr)+8 );  \
  stbir__simdf_load( o3, (ptr)+12 ); \
  { \
    __m128 tmp0, tmp1, tmp2, tmp3; \
    tmp0 = _mm_unpacklo_ps(o0, o1); \
    tmp2 = _mm_unpacklo_ps(o2, o3); \
    tmp1 = _mm_unpackhi_ps(o0, o1); \
    tmp3 = _mm_unpackhi_ps(o2, o3); \
    o0 = _mm_movelh_ps(tmp0, tmp2); \
    o1 = _mm_movehl_ps(tmp2, tmp0); \
    o2 = _mm_movelh_ps(tmp1, tmp3); \
    o3 = _mm_movehl_ps(tmp3, tmp1); \
  }
#define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
  r0 = _mm_packs_epi32( r0, r1 ); \
  r2 = _mm_packs_epi32( r2, r3 ); \
  r1 = _mm_unpacklo_epi16( r0, r2 ); \
  r3 = _mm_unpackhi_epi16( r0, r2 ); \
  r0 = _mm_unpacklo_epi16( r1, r3 ); \
  r2 = _mm_unpackhi_epi16( r1, r3 ); \
  r0 = _mm_packus_epi16( r0, r2 ); \
  stbir__simdi_store( ptr, r0 );

#define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
  1214. #if defined(_MSC_VER) && !defined(__clang__)
  1215. // msvc inits with 8 bytes
  1216. #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
  1217. #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
  1218. #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
  1219. #else
  1220. // everything else inits with long long's
  1221. #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
  1222. #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
  1223. #endif
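// Why the two initializer spellings: MSVC's __m128i is a union whose first
// member is an array of sixteen chars, so braced constants must be spelled
// out byte by byte, while GCC/Clang define __m128i as a vector of two long
// longs, so two 64-bit literals fill the register. Either way, for example,
//   STBIR__SIMDI_CONST(kRep, 0x01020304);
// yields a 128-bit constant with 0x01020304 replicated into all four 32-bit
// lanes (kRep is just an illustrative name).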
#define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
#define STBIR__CONSTF(var) (var)
#define STBIR__CONSTI(var) (var)
#if defined(STBIR_AVX) || defined(__SSE4_1__)
#include <smmintrin.h>
#define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
#else
static STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
#define stbir__simdf_pack_to_8words(out,reg0,reg1) \
{ \
  stbir__simdi tmp0,tmp1; \
  tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
  tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
  tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
  tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
  out = _mm_packs_epi32( tmp0, tmp1 ); \
  out = _mm_sub_epi16( out, stbir__s16_32768 ); \
}
#endif
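// The fallback above exists because _mm_packus_epi32 (32- to 16-bit unsigned
// saturating pack) is SSE4.1-only. The workaround biases the clamped values
// into signed range, packs with the SSE2 signed pack, then removes the bias
// in 16-bit arithmetic (mod 2^16, subtracting 0x8000 equals adding it).
// One lane as scalar arithmetic, illustrative only:
//   int v = 40000;                                  // clamped to [0,65535]
//   short s = (short)(v - 32768);                   // 7232, fits in int16
//   unsigned short r = (unsigned short)(s + 32768); // back to 40000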
#define STBIR_SIMD
// if we detect AVX, set the simd8 defines
#ifdef STBIR_AVX
#include <immintrin.h>
#define STBIR_SIMD8
#define stbir__simdf8 __m256
#define stbir__simdi8 __m256i
#define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
#define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
#define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
#define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
#define stbir__simdi8_store( ptr, reg ) _mm256_storeu_si256( (__m256i*)(ptr), reg )
#define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
#define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
#define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
#define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
#define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
#define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
#define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
#define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
#define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr ) // avx load instruction
#define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
#define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
#define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
#define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
#define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
#ifdef STBIR_AVX2
#define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
{ \
  stbir__simdi8 a, zero =_mm256_setzero_si256();\
  a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
  out0 = _mm256_unpacklo_epi16( a, zero ); \
  out1 = _mm256_unpackhi_epi16( a, zero ); \
}
#define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
{ \
  stbir__simdi8 t; \
  stbir__simdf8 af,bf; \
  stbir__simdi8 a,b; \
  af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
  bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
  af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
  bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
  a = _mm256_cvttps_epi32( af ); \
  b = _mm256_cvttps_epi32( bf ); \
  t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
  out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
}
#define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
#define stbir__simdf8_pack_to_16words(out,aa,bb) \
{ \
  stbir__simdf8 af,bf; \
  stbir__simdi8 a,b; \
  af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
  bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
  af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
  bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
  a = _mm256_cvttps_epi32( af ); \
  b = _mm256_cvttps_epi32( bf ); \
  (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
}
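// A note on the recurring _mm256_permute4x64_epi64( ..., 0,2,1,3 ) fixups
// above: the 256-bit packs operate on each 128-bit lane independently, so
// packing a,b produces quadword order [a_lo][b_lo][a_hi][b_hi]; permuting
// the quadwords with 0,2,1,3 restores [a_lo][a_hi][b_lo][b_hi], i.e. all of
// a followed by all of b.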
#else
#define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
{ \
  stbir__simdi a,zero = _mm_setzero_si128(); \
  a = _mm_unpacklo_epi8( ireg, zero ); \
  out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
  a = _mm_unpackhi_epi8( ireg, zero ); \
  out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
}
#define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
{ \
  stbir__simdi t; \
  stbir__simdf8 af,bf; \
  stbir__simdi8 a,b; \
  af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
  bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
  af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
  bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
  a = _mm256_cvttps_epi32( af ); \
  b = _mm256_cvttps_epi32( bf ); \
  out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
  out = _mm_packus_epi16( out, out ); \
  t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
  t = _mm_packus_epi16( t, t ); \
  out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
}
#define stbir__simdi8_expand_u16_to_u32(out,ireg) \
{ \
  stbir__simdi a,b,zero = _mm_setzero_si128(); \
  a = _mm_unpacklo_epi16( ireg, zero ); \
  b = _mm_unpackhi_epi16( ireg, zero ); \
  out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
}
#define stbir__simdf8_pack_to_16words(out,aa,bb) \
{ \
  stbir__simdi t0,t1; \
  stbir__simdf8 af,bf; \
  stbir__simdi8 a,b; \
  af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
  bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
  af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
  bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
  a = _mm256_cvttps_epi32( af ); \
  b = _mm256_cvttps_epi32( bf ); \
  t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
  t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
  out = _mm256_setr_m128i( t0, t1 ); \
}
#endif
static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
#define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
#define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
#define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
#define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
#define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
#define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8, _mm256_castps128_ps256( b ) )
static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i( 0x80000000, 0x80000000, 0, 0 ) };
#define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
#define stbir__simdf8_0123to00000000( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
#define stbir__simdf8_0123to11111111( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
#define stbir__simdf8_0123to22222222( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
#define stbir__simdf8_0123to33333333( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
#define stbir__simdf8_0123to21032103( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
#define stbir__simdf8_0123to32103210( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
#define stbir__simdf8_0123to12301230( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
#define stbir__simdf8_0123to10321032( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
#define stbir__simdf8_0123to30123012( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
#define stbir__simdf8_0123to11331133( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
#define stbir__simdf8_0123to00220022( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
#define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
#define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
#define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
#define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
#define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd
#define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
#define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
#define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
#else
#define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
#define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
#define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
#endif
#define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
#endif
#ifdef STBIR_FLOORF
#undef STBIR_FLOORF
#endif
#define STBIR_FLOORF stbir_simd_floorf
static stbir__inline float stbir_simd_floorf(float x) // Martins' floorf
{
#if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
  __m128 t = _mm_set_ss(x);
  return _mm_cvtss_f32( _mm_floor_ss(t, t) );
#else
  __m128 f = _mm_set_ss(x);
  __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
  __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
  return _mm_cvtss_f32(r);
#endif
}
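// How the pre-SSE4.1 fallback works: _mm_cvttps_epi32 truncates toward zero,
// which overshoots for negative non-integers, so -1.0 is added exactly when
// the truncated value t exceeds the input:
//   x = -1.5: t = -1.0, x < t, so r = t + (-1.0) = -2.0   (correct floor)
//   x =  1.5: t =  1.0, x >= t, so r = t = 1.0            (correct floor)
// stbir_simd_ceilf below is the mirror image, adding +1.0 when t < x.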
#ifdef STBIR_CEILF
#undef STBIR_CEILF
#endif
#define STBIR_CEILF stbir_simd_ceilf
static stbir__inline float stbir_simd_ceilf(float x) // Martins' ceilf
{
#if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
  __m128 t = _mm_set_ss(x);
  return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
#else
  __m128 f = _mm_set_ss(x);
  __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
  __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
  return _mm_cvtss_f32(r);
#endif
}
#elif defined(STBIR_NEON)
#include <arm_neon.h>
#define stbir__simdf float32x4_t
#define stbir__simdi uint32x4_t
#define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
#define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
#define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
#define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
#define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
#define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
#define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
#define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
#define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values must be zero
#define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
#define stbir__simdf_zeroP() vdupq_n_f32(0)
#define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
#define stbir__simdf_store( ptr, reg ) vst1q_f32( (float*)(ptr), reg )
#define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
#define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
#define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
#define stbir__simdi_store( ptr, reg ) vst1q_u32( (uint32_t*)(ptr), reg )
#define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
#define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
#define stbir__prefetch( ptr )
#define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
{ \
  uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
  uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
  out0 = vmovl_u16( vget_low_u16 ( l ) ); \
  out1 = vmovl_u16( vget_high_u16( l ) ); \
  out2 = vmovl_u16( vget_low_u16 ( h ) ); \
  out3 = vmovl_u16( vget_high_u16( h ) ); \
}
#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
{ \
  uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
  out = vmovl_u16( vget_low_u16( tmp ) ); \
}
#define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
{ \
  uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
  out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
  out1 = vmovl_u16( vget_high_u16( tmp ) ); \
}
#define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
#define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
#define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
#define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
#define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
#define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
#define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
#define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
#define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
#define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
#define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
#define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
#ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
#define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
#define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
#define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
#define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
#else
#define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
#define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
#define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
#define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
#endif
#define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
#define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
#define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
#define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
#define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
#define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
#define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
#define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
#define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
#define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
#define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
#define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
#define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
#define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
#if defined( _MSC_VER ) && !defined(__clang__)
#define stbir_make16(a,b,c,d) vcombine_u8( \
  vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
    ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
  vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
    ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb)
{
  uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) };
  return r;
}
#else
#define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
#define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}}
#endif
#define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
#define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) )
#define stbir__simdi_16madd( out, reg0, reg1 ) \
{ \
  int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
  int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
  int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
  int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
  (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
}
#else
#define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
#define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
#if defined( _MSC_VER ) && !defined(__clang__)
static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
{
  uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
  return r;
}
#define stbir_make8(a,b) vcreate_u8( \
  (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
  ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
#else
#define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
#define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
#endif
#define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
  vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
  vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
#define stbir__simdi_16madd( out, reg0, reg1 ) \
{ \
  int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
  int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
  int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
  int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
  int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
  int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
  (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
}
#endif
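// Both stbir__simdi_16madd variants above emulate SSE2's _mm_madd_epi16:
// multiply eight signed 16-bit lane pairs into 32 bits and sum adjacent
// products into four 32-bit results. NEON has no single equivalent, so it is
// composed from widening multiplies (vmull_s16) plus pairwise adds
// (vpaddq_s32 on AArch64; vpadd_s32 on the halves for 32-bit ARM). For
// example, lanes {1,2,...} and {10,20,...} give out[0] = 1*10 + 2*20 = 50.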
#define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
#define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
#define stbir__simdf_pack_to_8bytes(out,aa,bb) \
{ \
  float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
  float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
  int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
  int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
  uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
  out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
}
#define stbir__simdf_pack_to_8words(out,aa,bb) \
{ \
  float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
  float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
  int32x4_t ai = vcvtq_s32_f32( af ); \
  int32x4_t bi = vcvtq_s32_f32( bf ); \
  out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
}
#define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
{ \
  int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
  int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
  uint8x8x2_t out = \
  { { \
    vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
    vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
  } }; \
  vst2_u8(ptr, out); \
}
#define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
{ \
  float32x4x4_t tmp = vld4q_f32(ptr); \
  o0 = tmp.val[0]; \
  o1 = tmp.val[1]; \
  o2 = tmp.val[2]; \
  o3 = tmp.val[3]; \
}
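// Note that NEON's vld4q_f32 deinterleaves as part of the load itself, so
// the AoS-to-SoA transpose that costs a tree of shuffles on SSE is a single
// structure load here.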
#define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
#if defined( _MSC_VER ) && !defined(__clang__)
#define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
#define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
#define STBIR__CONSTF(var) (*(const float32x4_t*)var)
#define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
#else
#define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
#define STBIR__CONSTF(var) (var)
#define STBIR__CONSTI(var) (var)
#endif
#ifdef STBIR_FLOORF
#undef STBIR_FLOORF
#endif
#define STBIR_FLOORF stbir_simd_floorf
static stbir__inline float stbir_simd_floorf(float x)
{
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
  return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
#else
  float32x2_t f = vdup_n_f32(x);
  float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
  uint32x2_t a = vclt_f32(f, t);
  uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
  float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
  return vget_lane_f32(r, 0);
#endif
}
#ifdef STBIR_CEILF
#undef STBIR_CEILF
#endif
#define STBIR_CEILF stbir_simd_ceilf
static stbir__inline float stbir_simd_ceilf(float x)
{
#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
  return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
#else
  float32x2_t f = vdup_n_f32(x);
  float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
  uint32x2_t a = vclt_f32(t, f);
  uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
  float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
  return vget_lane_f32(r, 0);
#endif
}
#define STBIR_SIMD
#elif defined(STBIR_WASM)
#include <wasm_simd128.h>
#define stbir__simdf v128_t
#define stbir__simdi v128_t
#define stbir_simdi_castf( reg ) (reg)
#define stbir_simdf_casti( reg ) (reg)
#define stbir__simdf_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
#define stbir__simdi_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) )
#define stbir__simdf_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdi_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) )
#define stbir__simdf_load1z( out, ptr ) (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
#define stbir__simdf_frep4( fvar ) wasm_f32x4_splat( fvar )
#define stbir__simdf_load1frep4( out, fvar ) (out) = wasm_f32x4_splat( fvar )
#define stbir__simdf_load2( out, ptr ) (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
#define stbir__simdf_load2z( out, ptr ) (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
#define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
#define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
#define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
#define stbir__simdf_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
#define stbir__simdf_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
#define stbir__simdf_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
#define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
#define stbir__simdi_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg )
#define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
#define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
#define stbir__prefetch( ptr )
#define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
{ \
  v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
  v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
  out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
  out1 = wasm_u32x4_extend_high_u16x8( l ); \
  out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
  out3 = wasm_u32x4_extend_high_u16x8( h ); \
}
#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
{ \
  v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
  out = wasm_u32x4_extend_low_u16x8(tmp); \
}
#define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
{ \
  out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
  out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
}
#define stbir__simdf_convert_float_to_i32( i, f ) (i) = wasm_i32x4_trunc_sat_f32x4(f)
#define stbir__simdf_convert_float_to_int( f ) wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
#define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0)
#define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
#define stbir__simdf_convert_float_to_short( f ) ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
#define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
#define stbir__simdf_add( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
#define stbir__simdf_mult( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
#define stbir__simdf_mult_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
#define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
#define stbir__simdf_add_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
#define stbir__simdf_add1_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
#define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
#define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
#define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
#define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
#define stbir__simdf_add1( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 )
#define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
#define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
#define stbir__simdf_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
#define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
#define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
#define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
#define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
#define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
#define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
#define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
#define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
#define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
#define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
#define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
#define stbir__simdi_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
#define stbir__simdi_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 )
#define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
#define stbir__simdf_pack_to_8bytes(out,aa,bb) \
{ \
  v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
  v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
  v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
  v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
  v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
  out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
}
#define stbir__simdf_pack_to_8words(out,aa,bb) \
{ \
  v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
  v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
  v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
  v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
  out = wasm_u16x8_narrow_i32x4( ai, bi ); \
}
#define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
{ \
  v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
  v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
  v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
  tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
  wasm_v128_store( (void*)(ptr), tmp); \
}
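// On WASM the final interleave is a single wasm_i8x16_shuffle: after the two
// narrowing steps the 16 bytes sit as r0[0..3] r1[0..3] r2[0..3] r3[0..3],
// and the index pattern 0,4,8,12,1,5,9,13,... picks one byte from each group
// in rotation, producing the interleaved output order in one operation.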
#define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
{ \
  v128_t t0 = wasm_v128_load( ptr ); \
  v128_t t1 = wasm_v128_load( ptr+4 ); \
  v128_t t2 = wasm_v128_load( ptr+8 ); \
  v128_t t3 = wasm_v128_load( ptr+12 ); \
  v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
  v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
  v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
  v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
  o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
  o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
  o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
  o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
}
#define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
#define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
#define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
#define STBIR__CONSTF(var) (var)
#define STBIR__CONSTI(var) (var)
#ifdef STBIR_FLOORF
#undef STBIR_FLOORF
#endif
#define STBIR_FLOORF stbir_simd_floorf
static stbir__inline float stbir_simd_floorf(float x)
{
  return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
}
#ifdef STBIR_CEILF
#undef STBIR_CEILF
#endif
#define STBIR_CEILF stbir_simd_ceilf
static stbir__inline float stbir_simd_ceilf(float x)
{
  return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
}
#define STBIR_SIMD
#endif // SSE2/NEON/WASM
#endif // NO SIMD
#ifdef STBIR_SIMD8
#define stbir__simdfX stbir__simdf8
#define stbir__simdiX stbir__simdi8
#define stbir__simdfX_load stbir__simdf8_load
#define stbir__simdiX_load stbir__simdi8_load
#define stbir__simdfX_mult stbir__simdf8_mult
#define stbir__simdfX_add_mem stbir__simdf8_add_mem
#define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
#define stbir__simdfX_store stbir__simdf8_store
#define stbir__simdiX_store stbir__simdi8_store
#define stbir__simdf_frepX stbir__simdf8_frep8
#define stbir__simdfX_madd stbir__simdf8_madd
#define stbir__simdfX_min stbir__simdf8_min
#define stbir__simdfX_max stbir__simdf8_max
#define stbir__simdfX_aaa1 stbir__simdf8_aaa1
#define stbir__simdfX_1aaa stbir__simdf8_1aaa
#define stbir__simdfX_a1a1 stbir__simdf8_a1a1
#define stbir__simdfX_1a1a stbir__simdf8_1a1a
#define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
#define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
#define stbir__simdfX_zero stbir__simdf8_zero
#define STBIR_onesX STBIR_ones8
#define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
#define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
#define STBIR_simd_point5X STBIR_simd_point58
#define stbir__simdfX_float_count 8
#define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
#define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
#else
#define stbir__simdfX stbir__simdf
#define stbir__simdiX stbir__simdi
#define stbir__simdfX_load stbir__simdf_load
#define stbir__simdiX_load stbir__simdi_load
#define stbir__simdfX_mult stbir__simdf_mult
#define stbir__simdfX_add_mem stbir__simdf_add_mem
#define stbir__simdfX_madd_mem stbir__simdf_madd_mem
#define stbir__simdfX_store stbir__simdf_store
#define stbir__simdiX_store stbir__simdi_store
#define stbir__simdf_frepX stbir__simdf_frep4
#define stbir__simdfX_madd stbir__simdf_madd
#define stbir__simdfX_min stbir__simdf_min
#define stbir__simdfX_max stbir__simdf_max
#define stbir__simdfX_aaa1 stbir__simdf_aaa1
#define stbir__simdfX_1aaa stbir__simdf_1aaa
#define stbir__simdfX_a1a1 stbir__simdf_a1a1
#define stbir__simdfX_1a1a stbir__simdf_1a1a
#define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
#define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
#define stbir__simdfX_zero stbir__simdf_zero
#define STBIR_onesX STBIR__CONSTF(STBIR_ones)
#define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
#define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
#define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
#define stbir__simdfX_float_count 4
#define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
#define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
#define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
#endif
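// The stbir__simdfX_* aliases above form the width-generic layer: kernels
// are written once against these names and compile to the 8-wide AVX ops
// when STBIR_SIMD8 is set, or to the 4-wide SSE/NEON/WASM ops otherwise,
// with stbir__simdfX_float_count telling callers how many floats each
// X-wide operation consumes.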
#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__)
#if defined( _MSC_VER ) && !defined(__clang__)
typedef __int16 stbir__FP16;
#else
typedef float16_t stbir__FP16;
#endif
#else // no NEON, or 32-bit ARM
typedef union stbir__FP16
{
  unsigned short u;
} stbir__FP16;
#endif
#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__))
// Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
static stbir__inline float stbir__half_to_float( stbir__FP16 h )
{
  static const stbir__FP32 magic = { (254 - 15) << 23 };
  static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
  stbir__FP32 o;
  o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
  o.f *= magic.f; // exponent adjust
  if (o.f >= was_infnan.f) // make sure Inf/NaN survive
    o.u |= 255 << 23;
  o.u |= (h.u & 0x8000) << 16; // sign bit
  return o.f;
}
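// Walk-through of the magic multiply (illustrative): shifting the half bits
// left by 13 drops the half exponent into the float exponent field, leaving
// the value a factor of 2^112 too small (float bias 127 vs half bias 15, at
// bit 23 vs bit 10). Multiplying by magic (2^112) fixes the scale and also
// normalizes half denormals for free. E.g. h = 0x3C00 (half 1.0):
//   o.u = 0x3C00 << 13 = 0x07800000, i.e. 2^-112; * 2^112 -> 1.0f.
// Any result >= 2^16 (was_infnan) can only have come from a half Inf/NaN,
// so its exponent is forced to all ones, preserving NaN mantissa bits.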
static stbir__inline stbir__FP16 stbir__float_to_half(float val)
{
  stbir__FP32 f32infty = { 255 << 23 };
  stbir__FP32 f16max = { (127 + 16) << 23 };
  stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
  stbir__FP16 o = { 0 };
  stbir__FP32 f;
  unsigned int sign;
  f.f = val;
  sign = f.u & sign_mask;
  f.u ^= sign;
  if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
    o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
  else // (De)normalized number or zero
  {
    if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
    {
      // use a magic value to align our 10 mantissa bits at the bottom of
      // the float. as long as FP addition is round-to-nearest-even this
      // just works.
      f.f += denorm_magic.f;
      // and one integer subtract of the bias later, we have our final float!
      o.u = (unsigned short) ( f.u - denorm_magic.u );
    }
    else
    {
      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
      // update exponent, rounding bias part 1
      f.u = f.u + ((15u - 127) << 23) + 0xfff;
      // rounding bias part 2
      f.u += mant_odd;
      // take the bits!
      o.u = (unsigned short) ( f.u >> 13 );
    }
  }
  o.u |= sign >> 16;
  return o;
}
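// The normal path above implements round-to-nearest-even: 13 mantissa bits
// are dropped, so adding 0xfff rounds up whenever the dropped bits exceed
// the halfway point (0x1000), and adding mant_odd (the would-be FP16
// mantissa LSB) breaks exact ties upward only when the result would
// otherwise be odd. E.g. with dropped bits exactly 0x1000: an even mantissa
// stays put, an odd one carries up to the next even value.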
  1922. #endif
  1923. #if defined(STBIR_FP16C)
  1924. #include <immintrin.h>
  1925. static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
  1926. {
  1927. _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
  1928. }
  1929. static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
  1930. {
  1931. _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
  1932. }
  1933. static stbir__inline float stbir__half_to_float( stbir__FP16 h )
  1934. {
  1935. return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
  1936. }
  1937. static stbir__inline stbir__FP16 stbir__float_to_half( float f )
  1938. {
  1939. stbir__FP16 h;
  1940. h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
  1941. return h;
  1942. }
  1943. #elif defined(STBIR_SSE2)
  1944. // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
  1945. stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
  1946. {
  1947. static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff);
  1948. static const STBIR__SIMDI_CONST(smallest_normal, 0x0400);
  1949. static const STBIR__SIMDI_CONST(infinity, 0x7c00);
  1950. static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
  1951. static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23);
  1952. __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
  1953. __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
  1954. __m128i mnosign = STBIR__CONSTI(mask_nosign);
  1955. __m128i eadjust = STBIR__CONSTI(expadjust_normal);
  1956. __m128i smallest = STBIR__CONSTI(smallest_normal);
  1957. __m128i infty = STBIR__CONSTI(infinity);
  1958. __m128i expmant = _mm_and_si128(mnosign, h);
  1959. __m128i justsign = _mm_xor_si128(h, expmant);
  1960. __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
  1961. __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
  1962. __m128i shifted = _mm_slli_epi32(expmant, 13);
  1963. __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
  1964. __m128i adjusted = _mm_add_epi32(eadjust, shifted);
  1965. __m128i den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
  1966. __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
  1967. __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
  1968. __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
  1969. __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
  1970. __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
  1971. __m128i sign = _mm_slli_epi32(justsign, 16);
  1972. __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
  1973. stbir__simdf_store( output + 0, final );
  1974. h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
  1975. expmant = _mm_and_si128(mnosign, h);
  1976. justsign = _mm_xor_si128(h, expmant);
  1977. b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
  1978. b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
  1979. shifted = _mm_slli_epi32(expmant, 13);
  1980. adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
  1981. adjusted = _mm_add_epi32(eadjust, shifted);
  1982. den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
  1983. adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
  1984. den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
  1985. adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
  1986. adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
  1987. adjusted5 = _mm_or_ps(adjusted3, adjusted4);
  1988. sign = _mm_slli_epi32(justsign, 16);
  1989. final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
  1990. stbir__simdf_store( output + 4, final );
  1991. // ~38 SSE2 ops for 8 values
  1992. }
  1993. // Fabian's round-to-nearest-even float to half
  1994. // ~48 SSE2 ops for 8 output
  1995. stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
  1996. {
  1997. static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u);
  1998. static const STBIR__SIMDI_CONST(c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf
  1999. static const STBIR__SIMDI_CONST(c_nanbit, 0x200);
  2000. static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
  2001. static const STBIR__SIMDI_CONST(c_min_normal, (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
  2002. static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
  2003. static const STBIR__SIMDI_CONST(c_normal_bias, 0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
  2004. __m128 f = _mm_loadu_ps(input);
  2005. __m128 msign = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
  2006. __m128 justsign = _mm_and_ps(msign, f);
  2007. __m128 absf = _mm_xor_ps(f, justsign);
  2008. __m128i absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
  2009. __m128i f16max = STBIR__CONSTI(c_f16max);
  2010. __m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
  2011. __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
  2012. __m128i nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
  2013. __m128i inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
  2014. __m128i min_normal = STBIR__CONSTI(c_min_normal);
  2015. __m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
  2016. // "result is subnormal" path
  2017. __m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
  2018. __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
  2019. // "result is normal" path
  2020. __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
  2021. __m128i mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
  2022. __m128i round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
  2023. __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
  2024. __m128i normal = _mm_srli_epi32(round2, 13); // rounded result
  2025. // combine the two non-specials
  2026. __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
  2027. // merge in specials as well
  2028. __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
  2029. __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
  2030. __m128i final2, final= _mm_or_si128(joined, sign_shift);
  2031. f = _mm_loadu_ps(input+4);
  2032. justsign = _mm_and_ps(msign, f);
  2033. absf = _mm_xor_ps(f, justsign);
  2034. absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
  2035. b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN?
  2036. b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
  2037. nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
  2038. inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
  2039. b_issub = _mm_cmpgt_epi32(min_normal, absf_int);
  2040. // "result is subnormal" path
  2041. subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
  2042. subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
  2043. // "result is normal" path
  2044. mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
  2045. mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
  2046. round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
  2047. round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
  2048. normal = _mm_srli_epi32(round2, 13); // rounded result
  2049. // combine the two non-specials
  2050. nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
  2051. // merge in specials as well
  2052. joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
  2053. sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
  2054. final2 = _mm_or_si128(joined, sign_shift);
  2055. final = _mm_packs_epi32(final, final2);
  2056. stbir__simdi_store( output,final );
  2057. }
#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)

static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
{
  float16x4_t in0 = vld1_f16(input + 0);
  float16x4_t in1 = vld1_f16(input + 4);
  vst1q_f32(output + 0, vcvt_f32_f16(in0));
  vst1q_f32(output + 4, vcvt_f32_f16(in1));
}

static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
{
  float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
  float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
  vst1_f16(output + 0, out0);
  vst1_f16(output + 4, out1);
}

static stbir__inline float stbir__half_to_float( stbir__FP16 h )
{
  return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
}

static stbir__inline stbir__FP16 stbir__float_to_half( float f )
{
  return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
}

#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM

static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
{
  float16x8_t in = vld1q_f16(input);
  vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
  vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
}

static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
{
  float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
  float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
  vst1q_f16(output, vcombine_f16(out0, out1));
}

static stbir__inline float stbir__half_to_float( stbir__FP16 h )
{
  return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
}

static stbir__inline stbir__FP16 stbir__float_to_half( float f )
{
  return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
}

#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang

static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
{
  for (int i=0; i<8; i++)
  {
    output[i] = stbir__half_to_float(input[i]);
  }
}

static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
{
  for (int i=0; i<8; i++)
  {
    output[i] = stbir__float_to_half(input[i]);
  }
}

#endif
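// Editor's note: an illustrative scalar sketch (not part of the library; the
//   helper name is hypothetical) of the round-to-nearest-even float->half
//   conversion that the SSE2 path above vectorizes, one value at a time.
#if 0
static stbir_uint16 stbir__example_float_to_half_rtne( float f )
{
  union { float f; stbir_uint32 u; } fu, mg;
  stbir_uint32 sign, em, h;
  fu.f = f;
  sign = ( fu.u >> 16 ) & 0x8000;                        // half sign bit
  em = fu.u & 0x7fffffff;                                // exponent+mantissa of |f|
  if ( em >= ( (stbir_uint32)( 127 + 16 ) << 23 ) )      // rounds to inf, or is inf/NaN
    h = ( em > 0x7f800000 ) ? 0x7e00 : 0x7c00;           // quiet NaN : infinity
  else if ( em < ( (stbir_uint32)( 127 - 14 ) << 23 ) )  // too small for a normal half
  {
    mg.u = em;
    mg.f += 0.5f;                                        // FPU rounds the subnormal (0.5f is the subnorm magic)
    h = mg.u - 0x3f000000;                               // subtract the magic bias back out
  }
  else                                                   // normal half
  {
    em += 0xfff - ( (stbir_uint32)( 127 - 15 ) << 23 );  // rebias exponent, prime mantissa rounding
    em += ( fu.u >> 13 ) & 1;                            // ties round to an even mantissa
    h = em >> 13;
  }
  return (stbir_uint16)( h | sign );
}
#endif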
#ifdef STBIR_SIMD

#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )

typedef union stbir__simdi_u32
{
  stbir_uint32 m128i_u32[4];
  int m128i_i32[4];
  stbir__simdi m128i_i128;
} stbir__simdi_u32;
static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };

static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float, stbir__max_uint8_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float, stbir__max_uint16_as_float);
static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted);
static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);

static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f);
static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f);
static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23);
static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff);
static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000);
// Basically, in simd mode, we unroll the proper amount, and we don't want the
//   non-simd remnant loops to be unrolled, because they only run a few times.
//   Adding this switch saves about 5K on clang, which is Captain Unroll the 3rd.
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR

#ifdef STBIR_MEMCPY
#undef STBIR_MEMCPY
#endif
#define STBIR_MEMCPY stbir_simd_memcpy

// override normal use of memcpy with a much simpler copy (faster and smaller with our sized copies)
static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR( * ) d = (char*) dest;
  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;

  // check overlaps
  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );

  if ( bytes < (16*stbir__simdfX_float_count) )
  {
    if ( bytes < 16 )
    {
      if ( bytes )
      {
        STBIR_SIMD_NO_UNROLL_LOOP_START
        do
        {
          STBIR_SIMD_NO_UNROLL(d);
          d[ 0 ] = d[ ofs_to_src ];
          ++d;
        } while ( d < d_end );
      }
    }
    else
    {
      stbir__simdf x;

      // do one unaligned to get us aligned for the stream out below
      stbir__simdf_load( x, ( d + ofs_to_src ) );
      stbir__simdf_store( d, x );
      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );

      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
      for(;;)
      {
        STBIR_SIMD_NO_UNROLL(d);

        if ( d > ( d_end - 16 ) )
        {
          if ( d == d_end )
            return;
          d = d_end - 16;
        }

        stbir__simdf_load( x, ( d + ofs_to_src ) );
        stbir__simdf_store( d, x );
        d += 16;
      }
    }
  }
  else
  {
    stbir__simdfX x0,x1,x2,x3;

    // do one unaligned to get us aligned for the stream out below
    stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
    stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
    stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
    stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
    stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
    stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );

    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      STBIR_SIMD_NO_UNROLL(d);

      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
      {
        if ( d == d_end )
          return;
        d = d_end - (16*stbir__simdfX_float_count);
      }

      stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count );
      stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count );
      stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count );
      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
      stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 );
      stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 );
      stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 );
      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
      d += (16*stbir__simdfX_float_count);
    }
  }
}
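// Editor's note (illustrative trace; addresses hypothetical): for dest=0x1007 and
//   bytes=40 with a 16-byte vector, the small path above does one unaligned copy at
//   0x1007, snaps d up to the aligned 0x1010, streams 16 bytes there, then backs up
//   to d_end-16 = 0x101f for the tail. Stores overlap each other, which is fine
//   because the assert guarantees src and dest themselves never overlap.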
// memcpy that is specifically intentionally overlapping (src is smaller than dest, so it can be
//   a normal forward copy; bytes is divisible by 4 and bytes is greater than or equal to
//   the diff between dest and src)
static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR( * ) sd = (char*) src;
  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;

  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
  {
    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
    STBIR_SIMD_NO_UNROLL_LOOP_START
    do
    {
      stbir__simdf x;
      STBIR_SIMD_NO_UNROLL(sd);
      stbir__simdf_load( x, sd );
      stbir__simdf_store( ( sd + ofs_to_dest ), x );
      sd += 16;
    } while ( sd < s_end16 );

    if ( sd == s_end )
      return;
  }

  do
  {
    STBIR_SIMD_NO_UNROLL(sd);
    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
    sd += 4;
  } while ( sd < s_end );
}
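// Editor's note (illustrative): because this copy runs strictly forward, when bytes
//   exceeds the dest-src distance the first (dest-src) bytes repeat as a pattern --
//   which is exactly what the polyphase coefficient replication later in this file
//   relies on. The 16-byte path is only taken when that distance is at least 16, so
//   every byte a vector load reads has already reached its final value.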
#else // no SSE2

// when in scalar mode, we let unrolling happen, so this macro just does the __restrict
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
#define STBIR_SIMD_NO_UNROLL(ptr)
#define STBIR_SIMD_NO_UNROLL_LOOP_START
#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR

#endif // SSE2
#ifdef STBIR_PROFILE

#ifndef STBIR_PROFILE_FUNC

#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )

#ifdef _MSC_VER

STBIRDEF stbir_uint64 __rdtsc();
#define STBIR_PROFILE_FUNC() __rdtsc()

#else // non msvc

static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
{
  stbir_uint32 lo, hi;
  asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
  return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
}

#endif // msvc

#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)

#if defined( _MSC_VER ) && !defined(__clang__)

#define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)

#else

static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
{
  stbir_uint64 tsc;
  asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
  return tsc;
}

#endif

#else // x64, arm

#error Unknown platform for profiling.

#endif // x64, arm

#endif // STBIR_PROFILE_FUNC

#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info

#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info

// super light-weight micro profiler
#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }

// for thread data
#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )

// for build data
#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }

#else // no profile

#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO
#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
#define STBIR_ONLY_PROFILE_BUILD_SET_INFO

#define STBIR_PROFILE_START( wh )
#define STBIR_PROFILE_END( wh )
#define STBIR_PROFILE_FIRST_START( wh )
#define STBIR_PROFILE_CLEAR_EXTRAS( )
#define STBIR_PROFILE_BUILD_START( wh )
#define STBIR_PROFILE_BUILD_END( wh )
#define STBIR_PROFILE_BUILD_FIRST_START( wh )
#define STBIR_PROFILE_BUILD_CLEAR( info )

#endif // stbir_profile
#ifndef STBIR_CEILF
#include <math.h>
#if _MSC_VER <= 1200 // support VC6 for Sean
#define STBIR_CEILF(x) ((float)ceil((float)(x)))
#define STBIR_FLOORF(x) ((float)floor((float)(x)))
#else
#define STBIR_CEILF(x) ceilf(x)
#define STBIR_FLOORF(x) floorf(x)
#endif
#endif

#ifndef STBIR_MEMCPY
// For memcpy
#include <string.h>
#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
#endif
#ifndef STBIR_SIMD

// memcpy that is specifically intentionally overlapping (src is smaller than dest, so it can be
//   a normal forward copy; bytes is divisible by 4 and bytes is greater than or equal to
//   the diff between dest and src)
static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
{
  char STBIR_SIMD_STREAMOUT_PTR( * ) sd = (char*) src;
  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;

  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
  {
    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
    STBIR_NO_UNROLL_LOOP_START
    do
    {
      STBIR_NO_UNROLL(sd);
      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
      sd += 8;
    } while ( sd < s_end8 );

    if ( sd == s_end )
      return;
  }

  STBIR_NO_UNROLL_LOOP_START
  do
  {
    STBIR_NO_UNROLL(sd);
    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
    sd += 4;
  } while ( sd < s_end );
}

#endif
static float stbir__filter_trapezoid(float x, float scale, void * user_data)
{
  float halfscale = scale / 2;
  float t = 0.5f + halfscale;
  STBIR_ASSERT(scale <= 1);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x >= t)
    return 0.0f;
  else
  {
    float r = 0.5f - halfscale;
    if (x <= r)
      return 1.0f;
    else
      return (t - x) / scale;
  }
}

static float stbir__support_trapezoid(float scale, void * user_data)
{
  STBIR__UNUSED(user_data);
  return 0.5f + scale / 2.0f;
}
static float stbir__filter_triangle(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x <= 1.0f)
    return 1.0f - x;
  else
    return 0.0f;
}

static float stbir__filter_point(float x, float s, void * user_data)
{
  STBIR__UNUSED(x);
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  return 1.0f;
}

static float stbir__filter_cubic(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x < 1.0f)
    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;
  else if (x < 2.0f)
    return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;

  return (0.0f);
}

static float stbir__filter_catmullrom(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x < 1.0f)
    return 1.0f - x*x*(2.5f - 1.5f*x);
  else if (x < 2.0f)
    return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));

  return (0.0f);
}

static float stbir__filter_mitchell(float x, float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);

  if ( x < 0.0f ) x = -x;

  if (x < 1.0f)
    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;
  else if (x < 2.0f)
    return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;

  return (0.0f);
}
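// Editor's note: the three cubics above are the B=1,C=0 (cubic B-spline), B=0,C=0.5
//   (Catmull-Rom), and B=C=1/3 (Mitchell) members of the Mitchell-Netravali family:
//     |x| < 1: ( (12-9B-6C)|x|^3 + (-18+12B+6C)|x|^2 + (6-2B) ) / 6
//     |x| < 2: ( (-B-6C)|x|^3 + (6B+30C)|x|^2 + (-12B-48C)|x| + (8B+24C) ) / 6
//   Plugging those constants in reproduces the polynomials above exactly.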
static float stbir__support_zeropoint5(float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);
  return 0.5f;
}

static float stbir__support_one(float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);
  return 1;
}

static float stbir__support_two(float s, void * user_data)
{
  STBIR__UNUSED(s);
  STBIR__UNUSED(user_data);
  return 2;
}
// This is the maximum number of input samples that can affect an output sample
//   with the given filter, from the output pixel's perspective
static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
{
  STBIR_ASSERT(support != 0);

  if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
    return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
  else
    return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
}
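// Editor's note (worked example): downsampling by 2x (scale=0.5) with a support-2
//   kernel like Mitchell gives ceil(2*2/0.5) = 8 input pixels per output pixel,
//   while any upsample with the built-in (constant-support) kernels stays at
//   ceil(support*2), e.g. 4 for Mitchell.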
// this is how many coefficients per run of the filter (which is different
//   from the filter_pixel_width depending on if we are scattering or gathering)
static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
{
  float scale = samp->scale_info.scale;
  stbir__support_callback * support = samp->filter_support;

  switch( is_gather )
  {
    case 1:
      return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
    case 2:
      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
    case 0:
      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
    default:
      STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
      return 0;
  }
}
static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
{
  if (is_gather)
    return samp->scale_info.output_sub_size;
  else
    return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
}
static int stbir__edge_zero_full( int n, int max )
{
  STBIR__UNUSED(n);
  STBIR__UNUSED(max);
  return 0; // NOTREACHED
}

static int stbir__edge_clamp_full( int n, int max )
{
  if (n < 0)
    return 0;

  if (n >= max)
    return max - 1;

  return n; // NOTREACHED
}

static int stbir__edge_reflect_full( int n, int max )
{
  if (n < 0)
  {
    if (n > -max)
      return -n;
    else
      return max - 1;
  }

  if (n >= max)
  {
    int max2 = max * 2;
    if (n >= max2)
      return 0;
    else
      return max2 - n - 1;
  }

  return n; // NOTREACHED
}

static int stbir__edge_wrap_full( int n, int max )
{
  if (n >= 0)
    return (n % max);
  else
  {
    int m = (-n) % max;

    if (m != 0)
      m = max - m;

    return (m);
  }
}

typedef int stbir__edge_wrap_func( int n, int max );
static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
{
  stbir__edge_clamp_full,   // STBIR_EDGE_CLAMP
  stbir__edge_reflect_full, // STBIR_EDGE_REFLECT
  stbir__edge_wrap_full,    // STBIR_EDGE_WRAP
  stbir__edge_zero_full,    // STBIR_EDGE_ZERO
};

stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
{
  // avoid per-pixel switch
  if (n >= 0 && n < max)
    return n;
  return stbir__edge_wrap_slow[edge]( n, max );
}
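// Editor's note (illustrative): slow-path mappings for max=4:
//   n:        -2  -1   4   5
//   CLAMP:     0   0   3   3
//   REFLECT:   2   1   3   2
//   WRAP:      2   3   0   1
//   ZERO:      0   0   0   0   (the weights for these taps are removed elsewhere)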
#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16

// get information on the extents of a sampler
static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
{
  int j, stop;
  int left_margin, right_margin;
  int min_n = 0x7fffffff, max_n = -0x7fffffff;
  int min_left = 0x7fffffff, max_left = -0x7fffffff;
  int min_right = 0x7fffffff, max_right = -0x7fffffff;
  stbir_edge edge = samp->edge;
  stbir__contributors* contributors = samp->contributors;
  int output_sub_size = samp->scale_info.output_sub_size;
  int input_full_size = samp->scale_info.input_full_size;
  int filter_pixel_margin = samp->filter_pixel_margin;

  STBIR_ASSERT( samp->is_gather );

  stop = output_sub_size;
  for (j = 0; j < stop; j++ )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n0 < min_n )
    {
      min_n = contributors[j].n0;
      stop = j + filter_pixel_margin; // if we find a new min, only scan another filter width
      if ( stop > output_sub_size ) stop = output_sub_size;
    }
  }

  stop = 0;
  for (j = output_sub_size - 1; j >= stop; j-- )
  {
    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
    if ( contributors[j].n1 > max_n )
    {
      max_n = contributors[j].n1;
      stop = j - filter_pixel_margin; // if we find a new max, only scan another filter width
      if (stop<0) stop = 0;
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // now calculate how far into the margins we really read
  left_margin = 0;
  if ( min_n < 0 )
  {
    left_margin = -min_n;
    min_n = 0;
  }

  right_margin = 0;
  if ( max_n >= input_full_size )
  {
    right_margin = max_n - input_full_size + 1;
    max_n = input_full_size - 1;
  }

  // edge_sizes holds the margin pixel extents (how many pixels we hang over each edge)
  scanline_extents->edge_sizes[0] = left_margin;
  scanline_extents->edge_sizes[1] = right_margin;

  // spans[0] holds the pixels read from the input
  scanline_extents->spans[0].n0 = min_n;
  scanline_extents->spans[0].n1 = max_n;
  scanline_extents->spans[0].pixel_offset_for_input = min_n;

  // default to no other input range
  scanline_extents->spans[1].n0 = 0;
  scanline_extents->spans[1].n1 = -1;
  scanline_extents->spans[1].pixel_offset_for_input = 0;

  // don't have to do the edge calculations for STBIR_EDGE_ZERO
  if ( edge == STBIR_EDGE_ZERO )
    return;

  // convert margin pixels to the pixels within the input (min and max)
  for( j = -left_margin ; j < 0 ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_left )
      min_left = p;
    if ( p > max_left )
      max_left = p;
  }

  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
  {
    int p = stbir__edge_wrap( edge, j, input_full_size );
    if ( p < min_right )
      min_right = p;
    if ( p > max_right )
      max_right = p;
  }

  // merge the left margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of the main pixel region
  if ( min_left != 0x7fffffff )
  {
    if ( ( ( min_left <= min_n ) && ( ( max_left + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_left ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      left_margin = 0;
    }
  }

  // merge the right margin pixel region if it connects within STBIR__MERGE_RUNS_PIXEL_THRESHOLD pixels of the main pixel region
  if ( min_right != 0x7fffffff )
  {
    if ( ( ( min_right <= min_n ) && ( ( max_right + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
         ( ( min_n <= min_right ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
    {
      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
      scanline_extents->spans[0].pixel_offset_for_input = min_n;
      right_margin = 0;
    }
  }

  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );

  // you get two ranges when you have the WRAP edge mode and you are doing just a piece of the resize,
  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
  //   wouldn't need except for WRAP)

  // if we can't merge the min_left range, add it as a second range
  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    STBIR_ASSERT( right_margin == 0 );
    if ( min_left < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_left;
    newspan->n0 = -left_margin;
    newspan->n1 = ( max_left - min_left ) - left_margin;
    scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin
  }

  // if we can't merge the min_right range, add it as a second range
  else if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
  {
    stbir__span * newspan = scanline_extents->spans + 1;
    if ( min_right < scanline_extents->spans[0].n0 )
    {
      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
      --newspan;
    }
    newspan->pixel_offset_for_input = min_right;
    newspan->n0 = scanline_extents->spans[1].n1 + 1;
    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
    scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin
  }

  // sort the spans into write output order
  if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) )
  {
    stbir__span tspan = scanline_extents->spans[0];
    scanline_extents->spans[0] = scanline_extents->spans[1];
    scanline_extents->spans[1] = tspan;
  }
}
static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
{
  int first, last;
  float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
  float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;

  first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
  last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
  if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross

  if ( edge == STBIR_EDGE_WRAP )
  {
    if ( first < -input_size )
      first = -input_size;
    if ( last >= (input_size*2))
      last = (input_size*2) - 1;
  }

  *first_pixel = first;
  *last_pixel = last;
}
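// Editor's note (worked example): for a 2x downsample (inv_scale=2, out_shift=0)
//   with out_filter_radius=1, output pixel 0 has center 0.5, so the influence span
//   is (0.5 +/- 1) * 2 = [-1,3]; first = floor(-1+0.5) = -1 and last = floor(3-0.5)
//   = 2, i.e. input pixels -1..2 contribute (the -1 is resolved by the edge mode).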
static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
{
  int n, end;
  float inv_scale = scale_info->inv_scale;
  float out_shift = scale_info->pixel_shift;
  int input_size = scale_info->input_full_size;
  int numerator = scale_info->scale_numerator;
  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );

  // loop through the output pixels
  end = num_contributors; if ( polyphase ) end = numerator;
  for (n = 0; n < end; n++)
  {
    int i;
    int last_non_zero;
    float out_pixel_center = (float)n + 0.5f;
    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
    int in_first_pixel, in_last_pixel;

    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );

    // make sure we never generate a range larger than our precalculated coeff width
    //   this only happens in point sample mode, but it's a good safe thing to do anyway
    if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width )
      in_last_pixel = in_first_pixel + coefficient_width - 1;

    last_non_zero = -1;
    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
    {
      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);

      // kill denormals
      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
      {
        if ( i == 0 ) // if we're at the front, just eat zero contributors
        {
          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
          ++in_first_pixel;
          i--;
          continue;
        }
        coeff = 0; // make sure it is fully zero (should keep denormals away)
      }
      else
        last_non_zero = i;

      coefficient_group[i] = coeff;
    }

    in_last_pixel = last_non_zero + in_first_pixel; // kills trailing zeros
    contributors->n0 = in_first_pixel;
    contributors->n1 = in_last_pixel;

    STBIR_ASSERT(contributors->n1 >= contributors->n0);

    ++contributors;
    coefficient_group += coefficient_width;
  }
}
static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width )
{
  if ( new_pixel <= contribs->n1 ) // before the end
  {
    if ( new_pixel < contribs->n0 ) // before the front?
    {
      if ( ( contribs->n1 - new_pixel + 1 ) <= max_width )
      {
        int j, o = contribs->n0 - new_pixel;
        // slide the existing coeffs up, back to front, so we don't clobber them
        for ( j = contribs->n1 - contribs->n0 ; j >= 0 ; j-- )
          coeffs[ j + o ] = coeffs[ j ];
        // zero the gap between the new front coeff and the old ones
        for ( j = 1 ; j < o ; j++ )
          coeffs[ j ] = 0;
        coeffs[ 0 ] = new_coeff;
        contribs->n0 = new_pixel;
      }
    }
    else
    {
      coeffs[ new_pixel - contribs->n0 ] += new_coeff;
    }
  }
  else
  {
    if ( ( new_pixel - contribs->n0 + 1 ) <= max_width )
    {
      int j, e = new_pixel - contribs->n0;
      for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-between coeffs if there are any
        coeffs[j] = 0;
      coeffs[ e ] = new_coeff;
      contribs->n1 = new_pixel;
    }
  }
}
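// Editor's note (worked example): with n0=3, n1=5, coeffs=[a,b,c] and a large
//   enough max_width, inserting pixel 1 shifts everything: coeffs=[new,0,a,b,c],
//   n0=1. Inserting pixel 4 instead just accumulates in place: coeffs=[a,b+new,c].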
static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
{
  float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
  float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
  float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale - out_shift;
  float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale - out_shift;
  int out_first_pixel = (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f));
  int out_last_pixel = (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f));

  if ( out_first_pixel < 0 )
    out_first_pixel = 0;
  if ( out_last_pixel >= out_size )
    out_last_pixel = out_size - 1;

  *first_pixel = out_first_pixel;
  *last_pixel = out_last_pixel;
}
static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
{
  int in_pixel;
  int i;
  int first_out_inited = -1;
  float scale = scale_info->scale;
  float out_shift = scale_info->pixel_shift;
  int out_size = scale_info->output_sub_size;
  int numerator = scale_info->scale_numerator;
  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );

  STBIR__UNUSED(num_contributors);

  // loop through the input pixels
  for (in_pixel = start; in_pixel < end; in_pixel++)
  {
    float in_pixel_center = (float)in_pixel + 0.5f;
    float out_center_of_in = in_pixel_center * scale - out_shift;
    int out_first_pixel, out_last_pixel;

    stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );

    if ( out_first_pixel > out_last_pixel )
      continue;

    // clamp or exit if we are using polyphase filtering, and the limit is up
    if ( polyphase )
    {
      // when polyphase, you only have to do coeffs up to the numerator count
      if ( out_first_pixel == numerator )
        break;

      // don't do any extra work, clamp the last pixel at the numerator too
      if ( out_last_pixel >= numerator )
        out_last_pixel = numerator - 1;
    }

    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
    {
      float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
      float x = out_pixel_center - out_center_of_in;
      float coeff = kernel(x, scale, user_data) * scale;

      // kill the coeff if it's too small (avoid denormals)
      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
        coeff = 0.0f;

      {
        int out = i + out_first_pixel;
        float * coeffs = coefficient_group + out * coefficient_width;
        stbir__contributors * contribs = contributors + out;

        // is this the first time this output pixel has been seen? Init it.
        if ( out > first_out_inited )
        {
          STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at a time
          first_out_inited = out;
          contribs->n0 = in_pixel;
          contribs->n1 = in_pixel;
          coeffs[0] = coeff;
        }
        else
        {
          // insert on the end (always in order)
          if ( coeffs[0] == 0.0f ) // if the first coefficient is zero, then zap it for these coeffs
          {
            STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
            contribs->n0 = in_pixel;
          }
          contribs->n1 = in_pixel;
          STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
          coeffs[in_pixel - contribs->n0] = coeff;
        }
      }
    }
  }
}
#ifdef STBIR_RENORMALIZE_IN_FLOAT
#define STBIR_RENORM_TYPE float
#else
#define STBIR_RENORM_TYPE double
#endif
static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
{
  int input_size = scale_info->input_full_size;
  int input_last_n1 = input_size - 1;
  int n, end;
  int lowest = 0x7fffffff;
  int highest = -0x7fffffff;
  int widest = -1;
  int numerator = scale_info->scale_numerator;
  int denominator = scale_info->scale_denominator;
  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
  float * coeffs;
  stbir__contributors * contribs;

  // weight all the coeffs for each sample
  coeffs = coefficient_group;
  contribs = contributors;
  end = num_contributors; if ( polyphase ) end = numerator;
  for (n = 0; n < end; n++)
  {
    int i;
    STBIR_RENORM_TYPE filter_scale, total_filter = 0;
    int e;

    // add all contribs
    e = contribs->n1 - contribs->n0;
    for( i = 0 ; i <= e ; i++ )
    {
      total_filter += (STBIR_RENORM_TYPE) coeffs[i];
      STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights
    }

    // rescale
    if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
    {
      // all coeffs are extremely small, just zero it
      contribs->n1 = contribs->n0;
      coeffs[0] = 0.0f;
    }
    else
    {
      // if the total isn't 1.0, rescale everything
      if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
      {
        filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;

        // scale them all
        for (i = 0; i <= e; i++)
          coeffs[i] = (float) ( coeffs[i] * filter_scale );
      }
    }

    ++contribs;
    coeffs += coefficient_width;
  }

  // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
  //   most of the coefficients, so we copy them here
  if ( polyphase )
  {
    stbir__contributors * prev_contribs = contributors;
    stbir__contributors * cur_contribs = contributors + numerator;

    for( n = numerator ; n < num_contributors ; n++ )
    {
      cur_contribs->n0 = prev_contribs->n0 + denominator;
      cur_contribs->n1 = prev_contribs->n1 + denominator;
      ++cur_contribs;
      ++prev_contribs;
    }
    stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
  }

  coeffs = coefficient_group;
  contribs = contributors;

  for (n = 0; n < num_contributors; n++)
  {
    int i;

    // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
    if ( edge == STBIR_EDGE_ZERO )
    {
      // shrink the right side if necessary
      if ( contribs->n1 > input_last_n1 )
        contribs->n1 = input_last_n1;

      // shrink the left side
      if ( contribs->n0 < 0 )
      {
        int j, left, skips = 0;

        skips = -contribs->n0;
        contribs->n0 = 0;

        // now move down the weights
        left = contribs->n1 - contribs->n0 + 1;
        if ( left > 0 )
        {
          for( j = 0 ; j < left ; j++ )
            coeffs[ j ] = coeffs[ j + skips ];
        }
      }
    }
    else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
    {
      // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight

      // right hand side first
      if ( contribs->n1 > input_last_n1 )
      {
        int start = contribs->n0;
        int endi = contribs->n1;
        contribs->n1 = input_last_n1;
        for( i = input_size; i <= endi; i++ )
          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width );
      }

      // now check the left hand edge
      if ( contribs->n0 < 0 )
      {
        int save_n0;
        float save_n0_coeff;
        float * c = coeffs - ( contribs->n0 + 1 );

        // reinsert the coeffs with the index reflected or clamped (insert accumulates, if the coeffs exist)
        for( i = -1 ; i > contribs->n0 ; i-- )
          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width );

        save_n0 = contribs->n0;
        save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!

        // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
        contribs->n0 = 0;
        for(i = 0 ; i <= contribs->n1 ; i++ )
          coeffs[i] = coeffs[i-save_n0];

        // now that we have shrunk down the contribs, we insert the first one safely
        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width );
      }
    }

    if ( contribs->n0 <= contribs->n1 )
    {
      int diff = contribs->n1 - contribs->n0 + 1;
      while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
        --diff;

      contribs->n1 = contribs->n0 + diff - 1;

      if ( contribs->n0 <= contribs->n1 )
      {
        if ( contribs->n0 < lowest )
          lowest = contribs->n0;
        if ( contribs->n1 > highest )
          highest = contribs->n1;
        if ( diff > widest )
          widest = diff;
      }

      // re-zero out unused coefficients (if any)
      for( i = diff ; i < coefficient_width ; i++ )
        coeffs[i] = 0.0f;
    }

    ++contribs;
    coeffs += coefficient_width;
  }

  filter_info->lowest = lowest;
  filter_info->highest = highest;
  filter_info->widest = widest;
}

#undef STBIR_RENORM_TYPE
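// Editor's note (worked example): if sampling the kernel leaves a contributor's
//   coeffs summing to, say, 0.98, the renormalization above multiplies each one by
//   1/0.98 so every output pixel's weights sum to 1 and overall brightness is
//   preserved.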
static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficients, int coefficient_width, int widest, int row0, int row1 )
{
#define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
#define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
#ifdef STBIR_SIMD
#define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
#else
#define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
#endif

  int row_end = row1 + 1;
  STBIR__UNUSED( row0 ); // only used in an assert

  if ( coefficient_width != widest )
  {
    float * pc = coefficients;
    float * coeffs = coefficients;
    float * pc_end = coefficients + num_contributors * widest;
    switch( widest )
    {
      case 1:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_1( pc, coeffs );
          ++pc;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 2:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_2( pc, coeffs );
          pc += 2;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 3:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_2( pc, coeffs );
          STBIR_MOVE_1( pc+2, coeffs+2 );
          pc += 3;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 4:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          pc += 4;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 5:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_1( pc+4, coeffs+4 );
          pc += 5;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 6:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_2( pc+4, coeffs+4 );
          pc += 6;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 7:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_2( pc+4, coeffs+4 );
          STBIR_MOVE_1( pc+6, coeffs+6 );
          pc += 7;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 8:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_4( pc+4, coeffs+4 );
          pc += 8;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 9:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_4( pc+4, coeffs+4 );
          STBIR_MOVE_1( pc+8, coeffs+8 );
          pc += 9;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 10:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_4( pc+4, coeffs+4 );
          STBIR_MOVE_2( pc+8, coeffs+8 );
          pc += 10;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 11:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_4( pc+4, coeffs+4 );
          STBIR_MOVE_2( pc+8, coeffs+8 );
          STBIR_MOVE_1( pc+10, coeffs+10 );
          pc += 11;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      case 12:
        STBIR_NO_UNROLL_LOOP_START
        do {
          STBIR_MOVE_4( pc, coeffs );
          STBIR_MOVE_4( pc+4, coeffs+4 );
          STBIR_MOVE_4( pc+8, coeffs+8 );
          pc += 12;
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
      default:
        STBIR_NO_UNROLL_LOOP_START
        do {
          float * copy_end = pc + widest - 4;
          float * c = coeffs;
          do {
            STBIR_NO_UNROLL( pc );
            STBIR_MOVE_4( pc, c );
            pc += 4;
            c += 4;
          } while ( pc <= copy_end );
          copy_end += 4;
          STBIR_NO_UNROLL_LOOP_START
          while ( pc < copy_end )
          {
            STBIR_MOVE_1( pc, c );
            ++pc; ++c;
          }
          coeffs += coefficient_width;
        } while ( pc < pc_end );
        break;
    }
  }

  // some horizontal routines read one float off the end (which is then masked off), so put in a sentinel so we don't read an sNaN or denormal
  coefficients[ widest * num_contributors ] = 8888.0f;

  // the minimum we might read for unrolled filter widths is 12. So, we need to
  //   make sure we never read outside the decode buffer, by possibly moving
  //   the sample area back into the scanline, and putting zero weights first.
  // we start on the right edge and check until we're well past the possible
  //   clip area (2*widest).
  {
    stbir__contributors * contribs = contributors + num_contributors - 1;
    float * coeffs = coefficients + widest * ( num_contributors - 1 );

    // go until no chance of clipping (this is usually less than 8 loops)
    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
    {
      // might we clip??
      if ( ( contribs->n0 + widest ) > row_end )
      {
        int stop_range = widest;

        // if the range is larger than 12, it will be handled by generic loops that can terminate on the exact length
        //   of this contrib's n1, instead of a fixed widest amount - so calculate this
        if ( widest > 12 )
        {
          int mod;

          // how far will be read in the n_coeff loop (which depends on the widest count mod 4)
          mod = widest & 3;
          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;

          // the n_coeff loops do a minimum amount of coeffs, so factor that in!
          if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
        }

        // now see if we still clip with the refined range
        if ( ( contribs->n0 + stop_range ) > row_end )
        {
          int new_n0 = row_end - stop_range;
          int num = contribs->n1 - contribs->n0 + 1;
          int backup = contribs->n0 - new_n0;
          float * from_co = coeffs + num - 1;
          float * to_co = from_co + backup;

          STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );

          // move the coeffs over
          while( num )
          {
            *to_co-- = *from_co--;
            --num;
          }
          // zero the new positions
          while ( to_co >= coeffs )
            *to_co-- = 0;
          // set the new start point
          contribs->n0 = new_n0;
          if ( widest > 12 )
          {
            int mod;

            // how far will be read in the n_coeff loop (which depends on the widest count mod 4)
            mod = widest & 3;
            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;

            // the n_coeff loops do a minimum amount of coeffs, so factor that in!
            if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
          }
        }
      }
      --contribs;
      coeffs -= widest;
    }
  }

  return widest;

#undef STBIR_MOVE_1
#undef STBIR_MOVE_2
#undef STBIR_MOVE_4
}
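// Editor's note (illustrative): if coefficient_width is 8 but widest is 3, the
//   packing above rewrites [c0 c1 c2 0 0 0 0 0][d0 d1 d2 0 ...] into the tight
//   [c0 c1 c2][d0 d1 d2]... layout, then writes one 8888.0f sentinel float after
//   the last group so over-reading SIMD loops never touch an sNaN or denormal.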
static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
{
  int n;
  float scale = samp->scale_info.scale;
  stbir__kernel_callback * kernel = samp->filter_kernel;
  stbir__support_callback * support = samp->filter_support;
  float inv_scale = samp->scale_info.inv_scale;
  int input_full_size = samp->scale_info.input_full_size;
  int gather_num_contributors = samp->num_contributors;
  stbir__contributors* gather_contributors = samp->contributors;
  float * gather_coeffs = samp->coefficients;
  int gather_coefficient_width = samp->coefficient_width;

  switch ( samp->is_gather )
  {
    case 1: // gather upsample
    {
      float out_pixels_radius = support(inv_scale,user_data) * scale;

      stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );

      STBIR_PROFILE_BUILD_START( cleanup );
      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
      STBIR_PROFILE_BUILD_END( cleanup );
    }
    break;

    case 0: // scatter downsample (only on vertical)
    case 2: // gather downsample
    {
      float in_pixels_radius = support(scale,user_data) * inv_scale;
      int filter_pixel_margin = samp->filter_pixel_margin;
      int input_end = input_full_size + filter_pixel_margin;

      // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
      if ( !samp->is_gather )
      {
        // check if we are using the same gather downsample on the horizontal as this vertical;
        //   if so, we don't have to generate the coeffs, we can just pivot from the horizontal.
        if ( other_axis_for_pivot )
        {
          gather_contributors = other_axis_for_pivot->contributors;
          gather_coeffs = other_axis_for_pivot->coefficients;
          gather_coefficient_width = other_axis_for_pivot->coefficient_width;
          gather_num_contributors = other_axis_for_pivot->num_contributors;
          samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
          samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
          samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
          goto jump_right_to_pivot;
        }

        gather_contributors = samp->gather_prescatter_contributors;
        gather_coeffs = samp->gather_prescatter_coefficients;
        gather_coefficient_width = samp->gather_prescatter_coefficient_width;
        gather_num_contributors = samp->gather_prescatter_num_contributors;
      }

      stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );

      STBIR_PROFILE_BUILD_START( cleanup );
      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
      STBIR_PROFILE_BUILD_END( cleanup );

      if ( !samp->is_gather )
      {
        // if this is a scatter (vertical only), then we need to pivot the coeffs
        stbir__contributors * scatter_contributors;
        int highest_set;

       jump_right_to_pivot:

        STBIR_PROFILE_BUILD_START( pivot );

        highest_set = (-filter_pixel_margin) - 1;

        for (n = 0; n < gather_num_contributors; n++)
        {
          int k;
          int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
          int scatter_coefficient_width = samp->coefficient_width;
          float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
          float * g_coeffs = gather_coeffs;
          scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );

          for (k = gn0 ; k <= gn1 ; k++ )
          {
            float gc = *g_coeffs++;

            // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
            //   (which happens when pivoting from horizontal, which might have dummy zeros)
            if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
            {
              if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
              {
                {
                  // if we are skipping over several contributors, we need to clear the skipped ones
                  stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
                  while ( clear_contributors < scatter_contributors )
                  {
                    clear_contributors->n0 = 0;
                    clear_contributors->n1 = -1;
                    ++clear_contributors;
                  }
                }
                scatter_contributors->n0 = n;
                scatter_contributors->n1 = n;
                scatter_coeffs[0] = gc;
                highest_set = k;
              }
              else
              {
                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width );
              }
              STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
            }
            ++scatter_contributors;
            scatter_coeffs += scatter_coefficient_width;
          }

          ++gather_contributors;
          gather_coeffs += gather_coefficient_width;
        }

        // now clear any unset contribs
        {
          stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
          stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
          while ( clear_contributors < end_contributors )
          {
            clear_contributors->n0 = 0;
            clear_contributors->n1 = -1;
            ++clear_contributors;
          }
        }

        STBIR_PROFILE_BUILD_END( pivot );
      }
    }
    break;
  }
}
  3418. //========================================================================================================
  3419. // scanline decoders and encoders
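// (note, added for clarity: the blocks below use this header's self-include trick --
// set some configuration #defines, then #include this same file again;
// STB_IMAGE_RESIZE_DO_CODERS routes the nested include to the coder template, which
// stamps out one decode/encode pair per configuration and cleans up the parameter
// #defines so the next block can redefine them.)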
#define stbir__coder_min_num 1
#define STB_IMAGE_RESIZE_DO_CODERS
#include STBIR__HEADER_FILENAME
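// (note, added for clarity: stbir__decode_orderN names which input channel feeds
// decoded float channel N, and stbir__encode_orderN is the inverse mapping used when
// writing back out. for BGRA below, decode_order0 == 2 means decoded channel 0 (red)
// is read from input channel 2.)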
#define stbir__decode_suffix BGRA
#define stbir__decode_swizzle
#define stbir__decode_order0 2
#define stbir__decode_order1 1
#define stbir__decode_order2 0
#define stbir__decode_order3 3
#define stbir__encode_order0 2
#define stbir__encode_order1 1
#define stbir__encode_order2 0
#define stbir__encode_order3 3
#define stbir__coder_min_num 4
#define STB_IMAGE_RESIZE_DO_CODERS
#include STBIR__HEADER_FILENAME

#define stbir__decode_suffix ARGB
#define stbir__decode_swizzle
#define stbir__decode_order0 1
#define stbir__decode_order1 2
#define stbir__decode_order2 3
#define stbir__decode_order3 0
#define stbir__encode_order0 3
#define stbir__encode_order1 0
#define stbir__encode_order2 1
#define stbir__encode_order3 2
#define stbir__coder_min_num 4
#define STB_IMAGE_RESIZE_DO_CODERS
#include STBIR__HEADER_FILENAME

#define stbir__decode_suffix ABGR
#define stbir__decode_swizzle
#define stbir__decode_order0 3
#define stbir__decode_order1 2
#define stbir__decode_order2 1
#define stbir__decode_order3 0
#define stbir__encode_order0 3
#define stbir__encode_order1 2
#define stbir__encode_order2 1
#define stbir__encode_order3 0
#define stbir__coder_min_num 4
#define STB_IMAGE_RESIZE_DO_CODERS
#include STBIR__HEADER_FILENAME

#define stbir__decode_suffix AR
#define stbir__decode_swizzle
#define stbir__decode_order0 1
#define stbir__decode_order1 0
#define stbir__decode_order2 3
#define stbir__decode_order3 2
#define stbir__encode_order0 1
#define stbir__encode_order1 0
#define stbir__encode_order2 3
#define stbir__encode_order3 2
#define stbir__coder_min_num 2
#define STB_IMAGE_RESIZE_DO_CODERS
#include STBIR__HEADER_FILENAME

// fancy alpha means we expand to keep both premultiplied and non-premultiplied color channels
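// (worked example, added for clarity: an RGBA pixel {0.5, 1, 0, 0.25} expands to the
// seven floats {0.5, 1, 0, 0.25, 0.125, 0.25, 0} -- the unweighted channels plus
// alpha-premultiplied copies. the filters then run over all seven, and
// stbir__fancy_alpha_unweight_4ch later divides the premultiplied copies by the
// filtered alpha.)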
static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
  float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;

  // fancy alpha is stored internally as R G B A Rpm Gpm Bpm

  #ifdef STBIR_SIMD

  #ifdef STBIR_SIMD8
  decode += 16;
  STBIR_NO_UNROLL_LOOP_START
  while ( decode <= end_decode )
  {
    stbir__simdf8 d0,d1,a0,a1,p0,p1;
    STBIR_NO_UNROLL(decode);
    stbir__simdf8_load( d0, decode-16 );
    stbir__simdf8_load( d1, decode-16+8 );
    stbir__simdf8_0123to33333333( a0, d0 );
    stbir__simdf8_0123to33333333( a1, d1 );
    stbir__simdf8_mult( p0, a0, d0 );
    stbir__simdf8_mult( p1, a1, d1 );
    stbir__simdf8_bot4s( a0, d0, p0 );
    stbir__simdf8_bot4s( a1, d1, p1 );
    stbir__simdf8_top4s( d0, d0, p0 );
    stbir__simdf8_top4s( d1, d1, p1 );
    stbir__simdf8_store ( out, a0 );
    stbir__simdf8_store ( out+7, d0 );
    stbir__simdf8_store ( out+14, a1 );
    stbir__simdf8_store ( out+21, d1 );
    decode += 16;
    out += 28;
  }
  decode -= 16;
  #else
  decode += 8;
  STBIR_NO_UNROLL_LOOP_START
  while ( decode <= end_decode )
  {
    stbir__simdf d0,a0,d1,a1,p0,p1;
    STBIR_NO_UNROLL(decode);
    stbir__simdf_load( d0, decode-8 );
    stbir__simdf_load( d1, decode-8+4 );
    stbir__simdf_0123to3333( a0, d0 );
    stbir__simdf_0123to3333( a1, d1 );
    stbir__simdf_mult( p0, a0, d0 );
    stbir__simdf_mult( p1, a1, d1 );
    stbir__simdf_store ( out, d0 );
    stbir__simdf_store ( out+4, p0 );
    stbir__simdf_store ( out+7, d1 );
    stbir__simdf_store ( out+7+4, p1 );
    decode += 8;
    out += 14;
  }
  decode -= 8;
  #endif

  // might be one last odd pixel
  #ifdef STBIR_SIMD8
  STBIR_NO_UNROLL_LOOP_START
  while ( decode < end_decode )
  #else
  if ( decode < end_decode )
  #endif
  {
    stbir__simdf d,a,p;
    STBIR_NO_UNROLL(decode);
    stbir__simdf_load( d, decode );
    stbir__simdf_0123to3333( a, d );
    stbir__simdf_mult( p, a, d );
    stbir__simdf_store ( out, d );
    stbir__simdf_store ( out+4, p );
    decode += 4;
    out += 7;
  }

  #else

  while( decode < end_decode )
  {
    float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
    out[0] = r;
    out[1] = g;
    out[2] = b;
    out[3] = alpha;
    out[4] = r * alpha;
    out[5] = g * alpha;
    out[6] = b * alpha;
    out += 7;
    decode += 4;
  }

  #endif
}

static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
  float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;

  // for fancy alpha, turns into: [X A Xpm][X A Xpm], etc
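  // (example, added for clarity: a two-channel pixel {x=0.5, a=0.5} becomes the
  // triple {0.5, 0.5, 0.25} -- three floats per pixel, which is the
  // ( width_times_channels / 2 ) * 3 sizing above.)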
  #ifdef STBIR_SIMD
  decode += 8;
  if ( decode <= end_decode )
  {
    STBIR_NO_UNROLL_LOOP_START
    do {
      #ifdef STBIR_SIMD8
      stbir__simdf8 d0,a0,p0;
      STBIR_NO_UNROLL(decode);
      stbir__simdf8_load( d0, decode-8 );
      stbir__simdf8_0123to11331133( p0, d0 );
      stbir__simdf8_0123to00220022( a0, d0 );
      stbir__simdf8_mult( p0, p0, a0 );
      stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
      stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
      stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
      stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
      stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
      stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
      #else
      stbir__simdf d0,a0,d1,a1,p0,p1;
      STBIR_NO_UNROLL(decode);
      stbir__simdf_load( d0, decode-8 );
      stbir__simdf_load( d1, decode-8+4 );
      stbir__simdf_0123to1133( p0, d0 );
      stbir__simdf_0123to1133( p1, d1 );
      stbir__simdf_0123to0022( a0, d0 );
      stbir__simdf_0123to0022( a1, d1 );
      stbir__simdf_mult( p0, p0, a0 );
      stbir__simdf_mult( p1, p1, a1 );
      stbir__simdf_store2( out, d0 );
      stbir__simdf_store( out+2, p0 );
      stbir__simdf_store2h( out+3, d0 );
      stbir__simdf_store2( out+6, d1 );
      stbir__simdf_store( out+8, p1 );
      stbir__simdf_store2h( out+9, d1 );
      #endif
      decode += 8;
      out += 12;
    } while ( decode <= end_decode );
  }
  decode -= 8;
  #endif

  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode < end_decode )
  {
    float x = decode[0], y = decode[1];
    STBIR_SIMD_NO_UNROLL(decode);
    out[0] = x;
    out[1] = y;
    out[2] = x * y;
    out += 3;
    decode += 2;
  }
}

static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
{
  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
  float const * end_output = encode_buffer + width_times_channels;

  // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
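  // (sketch of the recovery math, added for clarity: color comes back as Cpm / A
  // when the filtered alpha is non-negligible; when A is ~0 the unweighted R G B
  // are kept instead of dividing, which avoids inf/NaN and preserves color in
  // fully transparent regions.)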
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float alpha = input[3];
    #ifdef STBIR_SIMD
    stbir__simdf i,ia;
    STBIR_SIMD_NO_UNROLL(encode);
    if ( alpha < stbir__small_float )
    {
      stbir__simdf_load( i, input );
      stbir__simdf_store( encode, i );
    }
    else
    {
      stbir__simdf_load1frep4( ia, 1.0f / alpha );
      stbir__simdf_load( i, input+4 );
      stbir__simdf_mult( i, i, ia );
      stbir__simdf_store( encode, i );
      encode[3] = alpha;
    }
    #else
    if ( alpha < stbir__small_float )
    {
      encode[0] = input[0];
      encode[1] = input[1];
      encode[2] = input[2];
    }
    else
    {
      float ialpha = 1.0f / alpha;
      encode[0] = input[4] * ialpha;
      encode[1] = input[5] * ialpha;
      encode[2] = input[6] * ialpha;
    }
    encode[3] = alpha;
    #endif
    input += 7;
    encode += 4;
  } while ( encode < end_output );
}

// format: [X A Xpm][X A Xpm] etc
static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
{
  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
  float const * end_output = encode_buffer + width_times_channels;

  do {
    float alpha = input[1];
    encode[0] = input[0];
    if ( alpha >= stbir__small_float )
      encode[0] = input[2] / alpha;
    encode[1] = alpha;
    input += 3;
    encode += 2;
  } while ( encode < end_output );
}

static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

  #ifdef STBIR_SIMD
  {
    decode += 2 * stbir__simdfX_float_count;
    STBIR_NO_UNROLL_LOOP_START
    while ( decode <= end_decode )
    {
      stbir__simdfX d0,a0,d1,a1;
      STBIR_NO_UNROLL(decode);
      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
      stbir__simdfX_mult( d0, d0, a0 );
      stbir__simdfX_mult( d1, d1, a1 );
      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
      decode += 2 * stbir__simdfX_float_count;
    }
    decode -= 2 * stbir__simdfX_float_count;

    // handle the last few pixel remnants
    #ifdef STBIR_SIMD8
    STBIR_NO_UNROLL_LOOP_START
    while ( decode < end_decode )
    #else
    if ( decode < end_decode )
    #endif
    {
      stbir__simdf d,a;
      stbir__simdf_load( d, decode );
      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
      stbir__simdf_mult( d, d, a );
      stbir__simdf_store ( decode, d );
      decode += 4;
    }
  }
  #else
  while( decode < end_decode )
  {
    float alpha = decode[3];
    decode[0] *= alpha;
    decode[1] *= alpha;
    decode[2] *= alpha;
    decode += 4;
  }
  #endif
}

static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

  #ifdef STBIR_SIMD
  decode += 2 * stbir__simdfX_float_count;
  STBIR_NO_UNROLL_LOOP_START
  while ( decode <= end_decode )
  {
    stbir__simdfX d0,a0,d1,a1;
    STBIR_NO_UNROLL(decode);
    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
    stbir__simdfX_mult( d0, d0, a0 );
    stbir__simdfX_mult( d1, d1, a1 );
    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
    decode += 2 * stbir__simdfX_float_count;
  }
  decode -= 2 * stbir__simdfX_float_count;
  #endif

  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode < end_decode )
  {
    float alpha = decode[1];
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0] *= alpha;
    decode += 2;
  }
}

static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
{
  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
  float const * end_output = encode_buffer + width_times_channels;

  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float alpha = encode[3];
    #ifdef STBIR_SIMD
    stbir__simdf i,ia;
    STBIR_SIMD_NO_UNROLL(encode);
    if ( alpha >= stbir__small_float )
    {
      stbir__simdf_load1frep4( ia, 1.0f / alpha );
      stbir__simdf_load( i, encode );
      stbir__simdf_mult( i, i, ia );
      stbir__simdf_store( encode, i );
      encode[3] = alpha;
    }
    #else
    if ( alpha >= stbir__small_float )
    {
      float ialpha = 1.0f / alpha;
      encode[0] *= ialpha;
      encode[1] *= ialpha;
      encode[2] *= ialpha;
    }
    #endif
    encode += 4;
  } while ( encode < end_output );
}

static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
{
  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
  float const * end_output = encode_buffer + width_times_channels;

  do {
    float alpha = encode[1];
    if ( alpha >= stbir__small_float )
      encode[0] /= alpha;
    encode += 2;
  } while ( encode < end_output );
}

// only used in RGB->BGR or BGR->RGB
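// (example, added for clarity: the three-channel flip turns {R0 G0 B0 R1 G1 B1}
// into {B0 G0 R0 B1 G1 R1} -- channels 0 and 2 swap within each pixel and
// channel 1 is untouched.)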
static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
{
  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
  float const * end_decode = decode_buffer + width_times_channels;

  #ifdef STBIR_SIMD
  #ifdef stbir__simdf_swiz2 // do we have two argument swizzles?
  end_decode -= 12;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // on arm64 8 instructions, no overlapping stores
    stbir__simdf a,b,c,na,nb;
    STBIR_SIMD_NO_UNROLL(decode);
    stbir__simdf_load( a, decode );
    stbir__simdf_load( b, decode+4 );
    stbir__simdf_load( c, decode+8 );
    na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 );
    b  = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 );
    nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 );
    c  = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 );
    stbir__simdf_store( decode, na );
    stbir__simdf_store( decode+4, nb );
    stbir__simdf_store( decode+8, c );
    decode += 12;
  }
  end_decode += 12;
  #else
  end_decode -= 24;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // 26 instructions on x64
    stbir__simdf a,b,c,d,e,f,g;
    float i21, i23;
    STBIR_SIMD_NO_UNROLL(decode);
    stbir__simdf_load( a, decode );
    stbir__simdf_load( b, decode+3 );
    stbir__simdf_load( c, decode+6 );
    stbir__simdf_load( d, decode+9 );
    stbir__simdf_load( e, decode+12 );
    stbir__simdf_load( f, decode+15 );
    stbir__simdf_load( g, decode+18 );
    a = stbir__simdf_swiz( a, 2, 1, 0, 3 );
    b = stbir__simdf_swiz( b, 2, 1, 0, 3 );
    c = stbir__simdf_swiz( c, 2, 1, 0, 3 );
    d = stbir__simdf_swiz( d, 2, 1, 0, 3 );
    e = stbir__simdf_swiz( e, 2, 1, 0, 3 );
    f = stbir__simdf_swiz( f, 2, 1, 0, 3 );
    g = stbir__simdf_swiz( g, 2, 1, 0, 3 );
    // the stores overlap, so they need to be done in order
    stbir__simdf_store( decode, a );
    i21 = decode[21];
    stbir__simdf_store( decode+3, b );
    i23 = decode[23];
    stbir__simdf_store( decode+6, c );
    stbir__simdf_store( decode+9, d );
    stbir__simdf_store( decode+12, e );
    stbir__simdf_store( decode+15, f );
    stbir__simdf_store( decode+18, g );
    decode[21] = i23;
    decode[23] = i21;
    decode += 24;
  }
  end_decode += 24;
  #endif
  #else
  end_decode -= 12;
  STBIR_NO_UNROLL_LOOP_START
  while( decode <= end_decode )
  {
    // 16 instructions
    float t0,t1,t2,t3;
    STBIR_NO_UNROLL(decode);
    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
    decode += 12;
  }
  end_decode += 12;
  #endif

  STBIR_NO_UNROLL_LOOP_START
  while( decode < end_decode )
  {
    float t = decode[0];
    STBIR_NO_UNROLL(decode);
    decode[0] = decode[2];
    decode[2] = t;
    decode += 3;
  }
}

static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
{
  int channels = stbir_info->channels;
  int effective_channels = stbir_info->effective_channels;
  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
  stbir_edge edge_vertical = stbir_info->vertical.edge;
  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
  stbir__span const * spans = stbir_info->scanline_extents.spans;
  float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
  float * last_decoded = 0;

  // if we are on edge_zero, and we get in here with an out of bounds n, then the filter calculation has failed
  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );

  do
  {
    float * decode_buffer;
    void const * input_data;
    float * end_decode;
    int width_times_channels;
    int width;

    if ( spans->n1 < spans->n0 )
      break;

    width = spans->n1 + 1 - spans->n0;
    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
    width_times_channels = width * channels;

    // read directly out of input plane by default
    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;

    // if we have an input callback, call it to get the input data
    if ( stbir_info->in_pixels_cb )
    {
      // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself
      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? ( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
    }

    STBIR_PROFILE_START( decode );
    // convert the pixels into the float decode_buffer (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
    last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
    STBIR_PROFILE_END( decode );

    if (stbir_info->alpha_weight)
    {
      STBIR_PROFILE_START( alpha );
      stbir_info->alpha_weight( decode_buffer, width_times_channels );
      STBIR_PROFILE_END( alpha );
    }

    ++spans;
  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );

  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage).
  // basically, the idea here is that if we have the whole scanline in memory, we don't re-decode the
  // wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
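  // (example, added for clarity: with input_full_size == 8 and a left margin of 2,
  // decoded x == -2,-1 are filled by copying from wrapped x == 6,7 -- one memcpy
  // per margin instead of re-running the pixel decoder.)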
  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
  {
    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
    int e, start_x[2];
    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;

    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
    start_x[1] = input_full_size;                              // right edge

    for( e = 0; e < 2 ; e++ )
    {
      // do each margin
      int margin = stbir_info->scanline_extents.edge_sizes[e];
      if ( margin )
      {
        int x = start_x[e];
        float * marg = full_decode_buffer + x * effective_channels;
        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
        if ( e == 1 ) last_decoded = marg + margin * effective_channels;
      }
    }
  }

  // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in
  //   (we can't pre-zero it, because the input callback can use that area as padding)
  last_decoded[0] = 0.0f;
  // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width.
  //   when this happens, we do read that pixel from the input, so it too could be NaN, so just zero an extra one.
  //   this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING)
  last_decoded[1] = 0.0f;
}

//=================
// Do 1 channel horizontal routines
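// (note, added for clarity: each channel count below defines the same macro
// "protocol" -- stbir__1/2/3_coeff_only for tiny kernels, stbir__4_coeff_start plus
// stbir__4_coeff_continue_from_4 for the unrolled body, stbir__N_coeff_remnant for
// the tail, and stbir__store_output* to emit a pixel -- then includes the horizontal
// template, which expands them into the actual gather loops.)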
#ifdef STBIR_SIMD

#define stbir__1_coeff_only() \
  stbir__simdf tot,c; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1( c, hc ); \
  stbir__simdf_mult1_mem( tot, c, decode );

#define stbir__2_coeff_only() \
  stbir__simdf tot,c,d; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load2z( c, hc ); \
  stbir__simdf_load2( d, decode ); \
  stbir__simdf_mult( tot, c, d ); \
  stbir__simdf_0123to1230( c, tot ); \
  stbir__simdf_add1( tot, tot, c );

#define stbir__3_coeff_only() \
  stbir__simdf tot,c,t; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( c, hc ); \
  stbir__simdf_mult_mem( tot, c, decode ); \
  stbir__simdf_0123to1230( c, tot ); \
  stbir__simdf_0123to2301( t, tot ); \
  stbir__simdf_add1( tot, tot, c ); \
  stbir__simdf_add1( tot, tot, t );

#define stbir__store_output_tiny() \
  stbir__simdf_store1( output, tot ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 1;

#define stbir__4_coeff_start() \
  stbir__simdf tot,c; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( c, hc ); \
  stbir__simdf_mult_mem( tot, c, decode );

#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( c, hc + (ofs) ); \
  stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );

#define stbir__1_coeff_remnant( ofs ) \
  { stbir__simdf d; \
    stbir__simdf_load1z( c, hc + (ofs) ); \
    stbir__simdf_load1( d, decode + (ofs) ); \
    stbir__simdf_madd( tot, tot, d, c ); }

#define stbir__2_coeff_remnant( ofs ) \
  { stbir__simdf d; \
    stbir__simdf_load2z( c, hc+(ofs) ); \
    stbir__simdf_load2( d, decode+(ofs) ); \
    stbir__simdf_madd( tot, tot, d, c ); }

#define stbir__3_coeff_setup() \
  stbir__simdf mask; \
  stbir__simdf_load( mask, STBIR_mask + 3 );

#define stbir__3_coeff_remnant( ofs ) \
  stbir__simdf_load( c, hc+(ofs) ); \
  stbir__simdf_and( c, c, mask ); \
  stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );

#define stbir__store_output() \
  stbir__simdf_0123to2301( c, tot ); \
  stbir__simdf_add( tot, tot, c ); \
  stbir__simdf_0123to1230( c, tot ); \
  stbir__simdf_add1( tot, tot, c ); \
  stbir__simdf_store1( output, tot ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 1;

#else

#define stbir__1_coeff_only() \
  float tot; \
  tot = decode[0]*hc[0];

#define stbir__2_coeff_only() \
  float tot; \
  tot = decode[0] * hc[0]; \
  tot += decode[1] * hc[1];

#define stbir__3_coeff_only() \
  float tot; \
  tot = decode[0] * hc[0]; \
  tot += decode[1] * hc[1]; \
  tot += decode[2] * hc[2];

#define stbir__store_output_tiny() \
  output[0] = tot; \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 1;

#define stbir__4_coeff_start() \
  float tot0,tot1,tot2,tot3; \
  tot0 = decode[0] * hc[0]; \
  tot1 = decode[1] * hc[1]; \
  tot2 = decode[2] * hc[2]; \
  tot3 = decode[3] * hc[3];

#define stbir__4_coeff_continue_from_4( ofs ) \
  tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
  tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
  tot2 += decode[2+(ofs)] * hc[2+(ofs)]; \
  tot3 += decode[3+(ofs)] * hc[3+(ofs)];

#define stbir__1_coeff_remnant( ofs ) \
  tot0 += decode[0+(ofs)] * hc[0+(ofs)];

#define stbir__2_coeff_remnant( ofs ) \
  tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
  tot1 += decode[1+(ofs)] * hc[1+(ofs)];

#define stbir__3_coeff_remnant( ofs ) \
  tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \
  tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \
  tot2 += decode[2+(ofs)] * hc[2+(ofs)];

#define stbir__store_output() \
  output[0] = (tot0+tot2)+(tot1+tot3); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 1;

#endif

#define STBIR__horizontal_channels 1
#define STB_IMAGE_RESIZE_DO_HORIZONTALS
#include STBIR__HEADER_FILENAME
  4091. //=================
  4092. // Do 2 channel horizontal routines
  4093. #ifdef STBIR_SIMD
  4094. #define stbir__1_coeff_only() \
  4095. stbir__simdf tot,c,d; \
  4096. STBIR_SIMD_NO_UNROLL(decode); \
  4097. stbir__simdf_load1z( c, hc ); \
  4098. stbir__simdf_0123to0011( c, c ); \
  4099. stbir__simdf_load2( d, decode ); \
  4100. stbir__simdf_mult( tot, d, c );
  4101. #define stbir__2_coeff_only() \
  4102. stbir__simdf tot,c; \
  4103. STBIR_SIMD_NO_UNROLL(decode); \
  4104. stbir__simdf_load2( c, hc ); \
  4105. stbir__simdf_0123to0011( c, c ); \
  4106. stbir__simdf_mult_mem( tot, c, decode );
  4107. #define stbir__3_coeff_only() \
  4108. stbir__simdf tot,c,cs,d; \
  4109. STBIR_SIMD_NO_UNROLL(decode); \
  4110. stbir__simdf_load( cs, hc ); \
  4111. stbir__simdf_0123to0011( c, cs ); \
  4112. stbir__simdf_mult_mem( tot, c, decode ); \
  4113. stbir__simdf_0123to2222( c, cs ); \
  4114. stbir__simdf_load2z( d, decode+4 ); \
  4115. stbir__simdf_madd( tot, tot, d, c );
  4116. #define stbir__store_output_tiny() \
  4117. stbir__simdf_0123to2301( c, tot ); \
  4118. stbir__simdf_add( tot, tot, c ); \
  4119. stbir__simdf_store2( output, tot ); \
  4120. horizontal_coefficients += coefficient_width; \
  4121. ++horizontal_contributors; \
  4122. output += 2;
  4123. #ifdef STBIR_SIMD8
  4124. #define stbir__4_coeff_start() \
  4125. stbir__simdf8 tot0,c,cs; \
  4126. STBIR_SIMD_NO_UNROLL(decode); \
  4127. stbir__simdf8_load4b( cs, hc ); \
  4128. stbir__simdf8_0123to00112233( c, cs ); \
  4129. stbir__simdf8_mult_mem( tot0, c, decode );
  4130. #define stbir__4_coeff_continue_from_4( ofs ) \
  4131. STBIR_SIMD_NO_UNROLL(decode); \
  4132. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4133. stbir__simdf8_0123to00112233( c, cs ); \
  4134. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
  4135. #define stbir__1_coeff_remnant( ofs ) \
  4136. { stbir__simdf t,d; \
  4137. stbir__simdf_load1z( t, hc + (ofs) ); \
  4138. stbir__simdf_load2( d, decode + (ofs) * 2 ); \
  4139. stbir__simdf_0123to0011( t, t ); \
  4140. stbir__simdf_mult( t, t, d ); \
  4141. stbir__simdf8_add4( tot0, tot0, t ); }
  4142. #define stbir__2_coeff_remnant( ofs ) \
  4143. { stbir__simdf t; \
  4144. stbir__simdf_load2( t, hc + (ofs) ); \
  4145. stbir__simdf_0123to0011( t, t ); \
  4146. stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \
  4147. stbir__simdf8_add4( tot0, tot0, t ); }
  4148. #define stbir__3_coeff_remnant( ofs ) \
  4149. { stbir__simdf8 d; \
  4150. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4151. stbir__simdf8_0123to00112233( c, cs ); \
  4152. stbir__simdf8_load6z( d, decode+(ofs)*2 ); \
  4153. stbir__simdf8_madd( tot0, tot0, c, d ); }
  4154. #define stbir__store_output() \
  4155. { stbir__simdf t,d; \
  4156. stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
  4157. stbir__simdf_0123to2301( d, t ); \
  4158. stbir__simdf_add( t, t, d ); \
  4159. stbir__simdf_store2( output, t ); \
  4160. horizontal_coefficients += coefficient_width; \
  4161. ++horizontal_contributors; \
  4162. output += 2; }
  4163. #else
  4164. #define stbir__4_coeff_start() \
  4165. stbir__simdf tot0,tot1,c,cs; \
  4166. STBIR_SIMD_NO_UNROLL(decode); \
  4167. stbir__simdf_load( cs, hc ); \
  4168. stbir__simdf_0123to0011( c, cs ); \
  4169. stbir__simdf_mult_mem( tot0, c, decode ); \
  4170. stbir__simdf_0123to2233( c, cs ); \
  4171. stbir__simdf_mult_mem( tot1, c, decode+4 );
  4172. #define stbir__4_coeff_continue_from_4( ofs ) \
  4173. STBIR_SIMD_NO_UNROLL(decode); \
  4174. stbir__simdf_load( cs, hc + (ofs) ); \
  4175. stbir__simdf_0123to0011( c, cs ); \
  4176. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
  4177. stbir__simdf_0123to2233( c, cs ); \
  4178. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
  4179. #define stbir__1_coeff_remnant( ofs ) \
  4180. { stbir__simdf d; \
  4181. stbir__simdf_load1z( cs, hc + (ofs) ); \
  4182. stbir__simdf_0123to0011( c, cs ); \
  4183. stbir__simdf_load2( d, decode + (ofs) * 2 ); \
  4184. stbir__simdf_madd( tot0, tot0, d, c ); }
  4185. #define stbir__2_coeff_remnant( ofs ) \
  4186. stbir__simdf_load2( cs, hc + (ofs) ); \
  4187. stbir__simdf_0123to0011( c, cs ); \
  4188. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
  4189. #define stbir__3_coeff_remnant( ofs ) \
  4190. { stbir__simdf d; \
  4191. stbir__simdf_load( cs, hc + (ofs) ); \
  4192. stbir__simdf_0123to0011( c, cs ); \
  4193. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
  4194. stbir__simdf_0123to2222( c, cs ); \
  4195. stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 ); \
  4196. stbir__simdf_madd( tot1, tot1, d, c ); }
  4197. #define stbir__store_output() \
  4198. stbir__simdf_add( tot0, tot0, tot1 ); \
  4199. stbir__simdf_0123to2301( c, tot0 ); \
  4200. stbir__simdf_add( tot0, tot0, c ); \
  4201. stbir__simdf_store2( output, tot0 ); \
  4202. horizontal_coefficients += coefficient_width; \
  4203. ++horizontal_contributors; \
  4204. output += 2;
  4205. #endif
  4206. #else
  4207. #define stbir__1_coeff_only() \
  4208. float tota,totb,c; \
  4209. c = hc[0]; \
  4210. tota = decode[0]*c; \
  4211. totb = decode[1]*c;
  4212. #define stbir__2_coeff_only() \
  4213. float tota,totb,c; \
  4214. c = hc[0]; \
  4215. tota = decode[0]*c; \
  4216. totb = decode[1]*c; \
  4217. c = hc[1]; \
  4218. tota += decode[2]*c; \
  4219. totb += decode[3]*c;
  4220. // this weird order of add matches the simd
  4221. #define stbir__3_coeff_only() \
  4222. float tota,totb,c; \
  4223. c = hc[0]; \
  4224. tota = decode[0]*c; \
  4225. totb = decode[1]*c; \
  4226. c = hc[2]; \
  4227. tota += decode[4]*c; \
  4228. totb += decode[5]*c; \
  4229. c = hc[1]; \
  4230. tota += decode[2]*c; \
  4231. totb += decode[3]*c;
  4232. #define stbir__store_output_tiny() \
  4233. output[0] = tota; \
  4234. output[1] = totb; \
  4235. horizontal_coefficients += coefficient_width; \
  4236. ++horizontal_contributors; \
  4237. output += 2;
  4238. #define stbir__4_coeff_start() \
  4239. float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c; \
  4240. c = hc[0]; \
  4241. tota0 = decode[0]*c; \
  4242. totb0 = decode[1]*c; \
  4243. c = hc[1]; \
  4244. tota1 = decode[2]*c; \
  4245. totb1 = decode[3]*c; \
  4246. c = hc[2]; \
  4247. tota2 = decode[4]*c; \
  4248. totb2 = decode[5]*c; \
  4249. c = hc[3]; \
  4250. tota3 = decode[6]*c; \
  4251. totb3 = decode[7]*c;
  4252. #define stbir__4_coeff_continue_from_4( ofs ) \
  4253. c = hc[0+(ofs)]; \
  4254. tota0 += decode[0+(ofs)*2]*c; \
  4255. totb0 += decode[1+(ofs)*2]*c; \
  4256. c = hc[1+(ofs)]; \
  4257. tota1 += decode[2+(ofs)*2]*c; \
  4258. totb1 += decode[3+(ofs)*2]*c; \
  4259. c = hc[2+(ofs)]; \
  4260. tota2 += decode[4+(ofs)*2]*c; \
  4261. totb2 += decode[5+(ofs)*2]*c; \
  4262. c = hc[3+(ofs)]; \
  4263. tota3 += decode[6+(ofs)*2]*c; \
  4264. totb3 += decode[7+(ofs)*2]*c;
  4265. #define stbir__1_coeff_remnant( ofs ) \
  4266. c = hc[0+(ofs)]; \
  4267. tota0 += decode[0+(ofs)*2] * c; \
  4268. totb0 += decode[1+(ofs)*2] * c;
  4269. #define stbir__2_coeff_remnant( ofs ) \
  4270. c = hc[0+(ofs)]; \
  4271. tota0 += decode[0+(ofs)*2] * c; \
  4272. totb0 += decode[1+(ofs)*2] * c; \
  4273. c = hc[1+(ofs)]; \
  4274. tota1 += decode[2+(ofs)*2] * c; \
  4275. totb1 += decode[3+(ofs)*2] * c;
  4276. #define stbir__3_coeff_remnant( ofs ) \
  4277. c = hc[0+(ofs)]; \
  4278. tota0 += decode[0+(ofs)*2] * c; \
  4279. totb0 += decode[1+(ofs)*2] * c; \
  4280. c = hc[1+(ofs)]; \
  4281. tota1 += decode[2+(ofs)*2] * c; \
  4282. totb1 += decode[3+(ofs)*2] * c; \
  4283. c = hc[2+(ofs)]; \
  4284. tota2 += decode[4+(ofs)*2] * c; \
  4285. totb2 += decode[5+(ofs)*2] * c;
  4286. #define stbir__store_output() \
  4287. output[0] = (tota0+tota2)+(tota1+tota3); \
  4288. output[1] = (totb0+totb2)+(totb1+totb3); \
  4289. horizontal_coefficients += coefficient_width; \
  4290. ++horizontal_contributors; \
  4291. output += 2;
  4292. #endif
  4293. #define STBIR__horizontal_channels 2
  4294. #define STB_IMAGE_RESIZE_DO_HORIZONTALS
  4295. #include STBIR__HEADER_FILENAME
  4296. //=================
  4297. // Do 3 channel horizontal routines
  4298. #ifdef STBIR_SIMD
  4299. #define stbir__1_coeff_only() \
  4300. stbir__simdf tot,c,d; \
  4301. STBIR_SIMD_NO_UNROLL(decode); \
  4302. stbir__simdf_load1z( c, hc ); \
  4303. stbir__simdf_0123to0001( c, c ); \
  4304. stbir__simdf_load( d, decode ); \
  4305. stbir__simdf_mult( tot, d, c );
  4306. #define stbir__2_coeff_only() \
  4307. stbir__simdf tot,c,cs,d; \
  4308. STBIR_SIMD_NO_UNROLL(decode); \
  4309. stbir__simdf_load2( cs, hc ); \
  4310. stbir__simdf_0123to0000( c, cs ); \
  4311. stbir__simdf_load( d, decode ); \
  4312. stbir__simdf_mult( tot, d, c ); \
  4313. stbir__simdf_0123to1111( c, cs ); \
  4314. stbir__simdf_load( d, decode+3 ); \
  4315. stbir__simdf_madd( tot, tot, d, c );
  4316. #define stbir__3_coeff_only() \
  4317. stbir__simdf tot,c,d,cs; \
  4318. STBIR_SIMD_NO_UNROLL(decode); \
  4319. stbir__simdf_load( cs, hc ); \
  4320. stbir__simdf_0123to0000( c, cs ); \
  4321. stbir__simdf_load( d, decode ); \
  4322. stbir__simdf_mult( tot, d, c ); \
  4323. stbir__simdf_0123to1111( c, cs ); \
  4324. stbir__simdf_load( d, decode+3 ); \
  4325. stbir__simdf_madd( tot, tot, d, c ); \
  4326. stbir__simdf_0123to2222( c, cs ); \
  4327. stbir__simdf_load( d, decode+6 ); \
  4328. stbir__simdf_madd( tot, tot, d, c );
  4329. #define stbir__store_output_tiny() \
  4330. stbir__simdf_store2( output, tot ); \
  4331. stbir__simdf_0123to2301( tot, tot ); \
  4332. stbir__simdf_store1( output+2, tot ); \
  4333. horizontal_coefficients += coefficient_width; \
  4334. ++horizontal_contributors; \
  4335. output += 3;
  4336. #ifdef STBIR_SIMD8
  4337. // we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
  4338. #define stbir__4_coeff_start() \
  4339. stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t; \
  4340. STBIR_SIMD_NO_UNROLL(decode); \
  4341. stbir__simdf8_load4b( cs, hc ); \
  4342. stbir__simdf8_0123to00001111( c, cs ); \
  4343. stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
  4344. stbir__simdf8_0123to22223333( c, cs ); \
  4345. stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
  4346. #define stbir__4_coeff_continue_from_4( ofs ) \
  4347. STBIR_SIMD_NO_UNROLL(decode); \
  4348. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4349. stbir__simdf8_0123to00001111( c, cs ); \
  4350. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
  4351. stbir__simdf8_0123to22223333( c, cs ); \
  4352. stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
  4353. #define stbir__1_coeff_remnant( ofs ) \
  4354. STBIR_SIMD_NO_UNROLL(decode); \
  4355. stbir__simdf_load1rep4( t, hc + (ofs) ); \
  4356. stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
  4357. #define stbir__2_coeff_remnant( ofs ) \
  4358. STBIR_SIMD_NO_UNROLL(decode); \
  4359. stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \
  4360. stbir__simdf8_0123to22223333( c, cs ); \
  4361. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
  4362. #define stbir__3_coeff_remnant( ofs ) \
  4363. STBIR_SIMD_NO_UNROLL(decode); \
  4364. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4365. stbir__simdf8_0123to00001111( c, cs ); \
  4366. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
  4367. stbir__simdf8_0123to2222( t, cs ); \
  4368. stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
  4369. #define stbir__store_output() \
  4370. stbir__simdf8_add( tot0, tot0, tot1 ); \
  4371. stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
  4372. stbir__simdf8_add4halves( t, t, tot0 ); \
  4373. horizontal_coefficients += coefficient_width; \
  4374. ++horizontal_contributors; \
  4375. output += 3; \
  4376. if ( output < output_end ) \
  4377. { \
  4378. stbir__simdf_store( output-3, t ); \
  4379. continue; \
  4380. } \
  4381. { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
  4382. stbir__simdf_store2( output-3, t ); \
  4383. stbir__simdf_store1( output+2-3, tt ); } \
  4384. break;
  4385. #else
  4386. #define stbir__4_coeff_start() \
  4387. stbir__simdf tot0,tot1,tot2,c,cs; \
  4388. STBIR_SIMD_NO_UNROLL(decode); \
  4389. stbir__simdf_load( cs, hc ); \
  4390. stbir__simdf_0123to0001( c, cs ); \
  4391. stbir__simdf_mult_mem( tot0, c, decode ); \
  4392. stbir__simdf_0123to1122( c, cs ); \
  4393. stbir__simdf_mult_mem( tot1, c, decode+4 ); \
  4394. stbir__simdf_0123to2333( c, cs ); \
  4395. stbir__simdf_mult_mem( tot2, c, decode+8 );
  4396. #define stbir__4_coeff_continue_from_4( ofs ) \
  4397. STBIR_SIMD_NO_UNROLL(decode); \
  4398. stbir__simdf_load( cs, hc + (ofs) ); \
  4399. stbir__simdf_0123to0001( c, cs ); \
  4400. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  4401. stbir__simdf_0123to1122( c, cs ); \
  4402. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
  4403. stbir__simdf_0123to2333( c, cs ); \
  4404. stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
  4405. #define stbir__1_coeff_remnant( ofs ) \
  4406. STBIR_SIMD_NO_UNROLL(decode); \
  4407. stbir__simdf_load1z( c, hc + (ofs) ); \
  4408. stbir__simdf_0123to0001( c, c ); \
  4409. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
  4410. #define stbir__2_coeff_remnant( ofs ) \
  4411. { stbir__simdf d; \
  4412. STBIR_SIMD_NO_UNROLL(decode); \
  4413. stbir__simdf_load2z( cs, hc + (ofs) ); \
  4414. stbir__simdf_0123to0001( c, cs ); \
  4415. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  4416. stbir__simdf_0123to1122( c, cs ); \
  4417. stbir__simdf_load2z( d, decode+(ofs)*3+4 ); \
  4418. stbir__simdf_madd( tot1, tot1, c, d ); }
  4419. #define stbir__3_coeff_remnant( ofs ) \
  4420. { stbir__simdf d; \
  4421. STBIR_SIMD_NO_UNROLL(decode); \
  4422. stbir__simdf_load( cs, hc + (ofs) ); \
  4423. stbir__simdf_0123to0001( c, cs ); \
  4424. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
  4425. stbir__simdf_0123to1122( c, cs ); \
  4426. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
  4427. stbir__simdf_0123to2222( c, cs ); \
  4428. stbir__simdf_load1z( d, decode+(ofs)*3+8 ); \
  4429. stbir__simdf_madd( tot2, tot2, c, d ); }
  4430. #define stbir__store_output() \
  4431. stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 ); \
  4432. stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 ); \
  4433. stbir__simdf_0123to1230( tot2, tot2 ); \
  4434. stbir__simdf_add( tot0, tot0, cs ); \
  4435. stbir__simdf_add( c, c, tot2 ); \
  4436. stbir__simdf_add( tot0, tot0, c ); \
  4437. horizontal_coefficients += coefficient_width; \
  4438. ++horizontal_contributors; \
  4439. output += 3; \
  4440. if ( output < output_end ) \
  4441. { \
  4442. stbir__simdf_store( output-3, tot0 ); \
  4443. continue; \
  4444. } \
  4445. stbir__simdf_0123to2301( tot1, tot0 ); \
  4446. stbir__simdf_store2( output-3, tot0 ); \
  4447. stbir__simdf_store1( output+2-3, tot1 ); \
  4448. break;
  4449. #endif
  4450. #else
  4451. #define stbir__1_coeff_only() \
  4452. float tot0, tot1, tot2, c; \
  4453. c = hc[0]; \
  4454. tot0 = decode[0]*c; \
  4455. tot1 = decode[1]*c; \
  4456. tot2 = decode[2]*c;
  4457. #define stbir__2_coeff_only() \
  4458. float tot0, tot1, tot2, c; \
  4459. c = hc[0]; \
  4460. tot0 = decode[0]*c; \
  4461. tot1 = decode[1]*c; \
  4462. tot2 = decode[2]*c; \
  4463. c = hc[1]; \
  4464. tot0 += decode[3]*c; \
  4465. tot1 += decode[4]*c; \
  4466. tot2 += decode[5]*c;
  4467. #define stbir__3_coeff_only() \
  4468. float tot0, tot1, tot2, c; \
  4469. c = hc[0]; \
  4470. tot0 = decode[0]*c; \
  4471. tot1 = decode[1]*c; \
  4472. tot2 = decode[2]*c; \
  4473. c = hc[1]; \
  4474. tot0 += decode[3]*c; \
  4475. tot1 += decode[4]*c; \
  4476. tot2 += decode[5]*c; \
  4477. c = hc[2]; \
  4478. tot0 += decode[6]*c; \
  4479. tot1 += decode[7]*c; \
  4480. tot2 += decode[8]*c;
  4481. #define stbir__store_output_tiny() \
  4482. output[0] = tot0; \
  4483. output[1] = tot1; \
  4484. output[2] = tot2; \
  4485. horizontal_coefficients += coefficient_width; \
  4486. ++horizontal_contributors; \
  4487. output += 3;
  4488. #define stbir__4_coeff_start() \
  4489. float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c; \
  4490. c = hc[0]; \
  4491. tota0 = decode[0]*c; \
  4492. tota1 = decode[1]*c; \
  4493. tota2 = decode[2]*c; \
  4494. c = hc[1]; \
  4495. totb0 = decode[3]*c; \
  4496. totb1 = decode[4]*c; \
  4497. totb2 = decode[5]*c; \
  4498. c = hc[2]; \
  4499. totc0 = decode[6]*c; \
  4500. totc1 = decode[7]*c; \
  4501. totc2 = decode[8]*c; \
  4502. c = hc[3]; \
  4503. totd0 = decode[9]*c; \
  4504. totd1 = decode[10]*c; \
  4505. totd2 = decode[11]*c;
  4506. #define stbir__4_coeff_continue_from_4( ofs ) \
  4507. c = hc[0+(ofs)]; \
  4508. tota0 += decode[0+(ofs)*3]*c; \
  4509. tota1 += decode[1+(ofs)*3]*c; \
  4510. tota2 += decode[2+(ofs)*3]*c; \
  4511. c = hc[1+(ofs)]; \
  4512. totb0 += decode[3+(ofs)*3]*c; \
  4513. totb1 += decode[4+(ofs)*3]*c; \
  4514. totb2 += decode[5+(ofs)*3]*c; \
  4515. c = hc[2+(ofs)]; \
  4516. totc0 += decode[6+(ofs)*3]*c; \
  4517. totc1 += decode[7+(ofs)*3]*c; \
  4518. totc2 += decode[8+(ofs)*3]*c; \
  4519. c = hc[3+(ofs)]; \
  4520. totd0 += decode[9+(ofs)*3]*c; \
  4521. totd1 += decode[10+(ofs)*3]*c; \
  4522. totd2 += decode[11+(ofs)*3]*c;
  4523. #define stbir__1_coeff_remnant( ofs ) \
  4524. c = hc[0+(ofs)]; \
  4525. tota0 += decode[0+(ofs)*3]*c; \
  4526. tota1 += decode[1+(ofs)*3]*c; \
  4527. tota2 += decode[2+(ofs)*3]*c;
  4528. #define stbir__2_coeff_remnant( ofs ) \
  4529. c = hc[0+(ofs)]; \
  4530. tota0 += decode[0+(ofs)*3]*c; \
  4531. tota1 += decode[1+(ofs)*3]*c; \
  4532. tota2 += decode[2+(ofs)*3]*c; \
  4533. c = hc[1+(ofs)]; \
  4534. totb0 += decode[3+(ofs)*3]*c; \
  4535. totb1 += decode[4+(ofs)*3]*c; \
  4536. totb2 += decode[5+(ofs)*3]*c; \
  4537. #define stbir__3_coeff_remnant( ofs ) \
  4538. c = hc[0+(ofs)]; \
  4539. tota0 += decode[0+(ofs)*3]*c; \
  4540. tota1 += decode[1+(ofs)*3]*c; \
  4541. tota2 += decode[2+(ofs)*3]*c; \
  4542. c = hc[1+(ofs)]; \
  4543. totb0 += decode[3+(ofs)*3]*c; \
  4544. totb1 += decode[4+(ofs)*3]*c; \
  4545. totb2 += decode[5+(ofs)*3]*c; \
  4546. c = hc[2+(ofs)]; \
  4547. totc0 += decode[6+(ofs)*3]*c; \
  4548. totc1 += decode[7+(ofs)*3]*c; \
  4549. totc2 += decode[8+(ofs)*3]*c;
  4550. #define stbir__store_output() \
  4551. output[0] = (tota0+totc0)+(totb0+totd0); \
  4552. output[1] = (tota1+totc1)+(totb1+totd1); \
  4553. output[2] = (tota2+totc2)+(totb2+totd2); \
  4554. horizontal_coefficients += coefficient_width; \
  4555. ++horizontal_contributors; \
  4556. output += 3;
  4557. #endif
  4558. #define STBIR__horizontal_channels 3
  4559. #define STB_IMAGE_RESIZE_DO_HORIZONTALS
  4560. #include STBIR__HEADER_FILENAME
  4561. //=================
  4562. // Do 4 channel horizontal routines
  4563. #ifdef STBIR_SIMD
  4564. #define stbir__1_coeff_only() \
  4565. stbir__simdf tot,c; \
  4566. STBIR_SIMD_NO_UNROLL(decode); \
  4567. stbir__simdf_load1( c, hc ); \
  4568. stbir__simdf_0123to0000( c, c ); \
  4569. stbir__simdf_mult_mem( tot, c, decode );
  4570. #define stbir__2_coeff_only() \
  4571. stbir__simdf tot,c,cs; \
  4572. STBIR_SIMD_NO_UNROLL(decode); \
  4573. stbir__simdf_load2( cs, hc ); \
  4574. stbir__simdf_0123to0000( c, cs ); \
  4575. stbir__simdf_mult_mem( tot, c, decode ); \
  4576. stbir__simdf_0123to1111( c, cs ); \
  4577. stbir__simdf_madd_mem( tot, tot, c, decode+4 );
  4578. #define stbir__3_coeff_only() \
  4579. stbir__simdf tot,c,cs; \
  4580. STBIR_SIMD_NO_UNROLL(decode); \
  4581. stbir__simdf_load( cs, hc ); \
  4582. stbir__simdf_0123to0000( c, cs ); \
  4583. stbir__simdf_mult_mem( tot, c, decode ); \
  4584. stbir__simdf_0123to1111( c, cs ); \
  4585. stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
  4586. stbir__simdf_0123to2222( c, cs ); \
  4587. stbir__simdf_madd_mem( tot, tot, c, decode+8 );
  4588. #define stbir__store_output_tiny() \
  4589. stbir__simdf_store( output, tot ); \
  4590. horizontal_coefficients += coefficient_width; \
  4591. ++horizontal_contributors; \
  4592. output += 4;
  4593. #ifdef STBIR_SIMD8
  4594. #define stbir__4_coeff_start() \
  4595. stbir__simdf8 tot0,c,cs; stbir__simdf t; \
  4596. STBIR_SIMD_NO_UNROLL(decode); \
  4597. stbir__simdf8_load4b( cs, hc ); \
  4598. stbir__simdf8_0123to00001111( c, cs ); \
  4599. stbir__simdf8_mult_mem( tot0, c, decode ); \
  4600. stbir__simdf8_0123to22223333( c, cs ); \
  4601. stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
  4602. #define stbir__4_coeff_continue_from_4( ofs ) \
  4603. STBIR_SIMD_NO_UNROLL(decode); \
  4604. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4605. stbir__simdf8_0123to00001111( c, cs ); \
  4606. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
  4607. stbir__simdf8_0123to22223333( c, cs ); \
  4608. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
  4609. #define stbir__1_coeff_remnant( ofs ) \
  4610. STBIR_SIMD_NO_UNROLL(decode); \
  4611. stbir__simdf_load1rep4( t, hc + (ofs) ); \
  4612. stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
  4613. #define stbir__2_coeff_remnant( ofs ) \
  4614. STBIR_SIMD_NO_UNROLL(decode); \
  4615. stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \
  4616. stbir__simdf8_0123to22223333( c, cs ); \
  4617. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
  4618. #define stbir__3_coeff_remnant( ofs ) \
  4619. STBIR_SIMD_NO_UNROLL(decode); \
  4620. stbir__simdf8_load4b( cs, hc + (ofs) ); \
  4621. stbir__simdf8_0123to00001111( c, cs ); \
  4622. stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
  4623. stbir__simdf8_0123to2222( t, cs ); \
  4624. stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
  4625. #define stbir__store_output() \
  4626. stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \
  4627. stbir__simdf_store( output, t ); \
  4628. horizontal_coefficients += coefficient_width; \
  4629. ++horizontal_contributors; \
  4630. output += 4;
  4631. #else
  4632. #define stbir__4_coeff_start() \
  4633. stbir__simdf tot0,tot1,c,cs; \
  4634. STBIR_SIMD_NO_UNROLL(decode); \
  4635. stbir__simdf_load( cs, hc ); \
  4636. stbir__simdf_0123to0000( c, cs ); \
  4637. stbir__simdf_mult_mem( tot0, c, decode ); \
  4638. stbir__simdf_0123to1111( c, cs ); \
  4639. stbir__simdf_mult_mem( tot1, c, decode+4 ); \
  4640. stbir__simdf_0123to2222( c, cs ); \
  4641. stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
  4642. stbir__simdf_0123to3333( c, cs ); \
  4643. stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
  4644. #define stbir__4_coeff_continue_from_4( ofs ) \
  4645. STBIR_SIMD_NO_UNROLL(decode); \
  4646. stbir__simdf_load( cs, hc + (ofs) ); \
  4647. stbir__simdf_0123to0000( c, cs ); \
  4648. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
  4649. stbir__simdf_0123to1111( c, cs ); \
  4650. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
  4651. stbir__simdf_0123to2222( c, cs ); \
  4652. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); \
  4653. stbir__simdf_0123to3333( c, cs ); \
  4654. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
  4655. #define stbir__1_coeff_remnant( ofs ) \
  4656. STBIR_SIMD_NO_UNROLL(decode); \
  4657. stbir__simdf_load1( c, hc + (ofs) ); \
  4658. stbir__simdf_0123to0000( c, c ); \
  4659. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
  4660. #define stbir__2_coeff_remnant( ofs ) \
  4661. STBIR_SIMD_NO_UNROLL(decode); \
  4662. stbir__simdf_load2( cs, hc + (ofs) ); \
  4663. stbir__simdf_0123to0000( c, cs ); \
  4664. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
  4665. stbir__simdf_0123to1111( c, cs ); \
  4666. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
  4667. #define stbir__3_coeff_remnant( ofs ) \
  4668. STBIR_SIMD_NO_UNROLL(decode); \
  4669. stbir__simdf_load( cs, hc + (ofs) ); \
  4670. stbir__simdf_0123to0000( c, cs ); \
  4671. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \
  4672. stbir__simdf_0123to1111( c, cs ); \
  4673. stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \
  4674. stbir__simdf_0123to2222( c, cs ); \
  4675. stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
  4676. #define stbir__store_output() \
  4677. stbir__simdf_add( tot0, tot0, tot1 ); \
  4678. stbir__simdf_store( output, tot0 ); \
  4679. horizontal_coefficients += coefficient_width; \
  4680. ++horizontal_contributors; \
  4681. output += 4;
  4682. #endif
  4683. #else
  4684. #define stbir__1_coeff_only() \
  4685. float p0,p1,p2,p3,c; \
  4686. STBIR_SIMD_NO_UNROLL(decode); \
  4687. c = hc[0]; \
  4688. p0 = decode[0] * c; \
  4689. p1 = decode[1] * c; \
  4690. p2 = decode[2] * c; \
  4691. p3 = decode[3] * c;
  4692. #define stbir__2_coeff_only() \
  4693. float p0,p1,p2,p3,c; \
  4694. STBIR_SIMD_NO_UNROLL(decode); \
  4695. c = hc[0]; \
  4696. p0 = decode[0] * c; \
  4697. p1 = decode[1] * c; \
  4698. p2 = decode[2] * c; \
  4699. p3 = decode[3] * c; \
  4700. c = hc[1]; \
  4701. p0 += decode[4] * c; \
  4702. p1 += decode[5] * c; \
  4703. p2 += decode[6] * c; \
  4704. p3 += decode[7] * c;
  4705. #define stbir__3_coeff_only() \
  4706. float p0,p1,p2,p3,c; \
  4707. STBIR_SIMD_NO_UNROLL(decode); \
  4708. c = hc[0]; \
  4709. p0 = decode[0] * c; \
  4710. p1 = decode[1] * c; \
  4711. p2 = decode[2] * c; \
  4712. p3 = decode[3] * c; \
  4713. c = hc[1]; \
  4714. p0 += decode[4] * c; \
  4715. p1 += decode[5] * c; \
  4716. p2 += decode[6] * c; \
  4717. p3 += decode[7] * c; \
  4718. c = hc[2]; \
  4719. p0 += decode[8] * c; \
  4720. p1 += decode[9] * c; \
  4721. p2 += decode[10] * c; \
  4722. p3 += decode[11] * c;
  4723. #define stbir__store_output_tiny() \
  4724. output[0] = p0; \
  4725. output[1] = p1; \
  4726. output[2] = p2; \
  4727. output[3] = p3; \
  4728. horizontal_coefficients += coefficient_width; \
  4729. ++horizontal_contributors; \
  4730. output += 4;
  4731. #define stbir__4_coeff_start() \
  4732. float x0,x1,x2,x3,y0,y1,y2,y3,c; \
  4733. STBIR_SIMD_NO_UNROLL(decode); \
  4734. c = hc[0]; \
  4735. x0 = decode[0] * c; \
  4736. x1 = decode[1] * c; \
  4737. x2 = decode[2] * c; \
  4738. x3 = decode[3] * c; \
  4739. c = hc[1]; \
  4740. y0 = decode[4] * c; \
  4741. y1 = decode[5] * c; \
  4742. y2 = decode[6] * c; \
  4743. y3 = decode[7] * c; \
  4744. c = hc[2]; \
  4745. x0 += decode[8] * c; \
  4746. x1 += decode[9] * c; \
  4747. x2 += decode[10] * c; \
  4748. x3 += decode[11] * c; \
  4749. c = hc[3]; \
  4750. y0 += decode[12] * c; \
  4751. y1 += decode[13] * c; \
  4752. y2 += decode[14] * c; \
  4753. y3 += decode[15] * c;
  4754. #define stbir__4_coeff_continue_from_4( ofs ) \
  4755. STBIR_SIMD_NO_UNROLL(decode); \
  4756. c = hc[0+(ofs)]; \
  4757. x0 += decode[0+(ofs)*4] * c; \
  4758. x1 += decode[1+(ofs)*4] * c; \
  4759. x2 += decode[2+(ofs)*4] * c; \
  4760. x3 += decode[3+(ofs)*4] * c; \
  4761. c = hc[1+(ofs)]; \
  4762. y0 += decode[4+(ofs)*4] * c; \
  4763. y1 += decode[5+(ofs)*4] * c; \
  4764. y2 += decode[6+(ofs)*4] * c; \
  4765. y3 += decode[7+(ofs)*4] * c; \
  4766. c = hc[2+(ofs)]; \
  4767. x0 += decode[8+(ofs)*4] * c; \
  4768. x1 += decode[9+(ofs)*4] * c; \
  4769. x2 += decode[10+(ofs)*4] * c; \
  4770. x3 += decode[11+(ofs)*4] * c; \
  4771. c = hc[3+(ofs)]; \
  4772. y0 += decode[12+(ofs)*4] * c; \
  4773. y1 += decode[13+(ofs)*4] * c; \
  4774. y2 += decode[14+(ofs)*4] * c; \
  4775. y3 += decode[15+(ofs)*4] * c;
  4776. #define stbir__1_coeff_remnant( ofs ) \
  4777. STBIR_SIMD_NO_UNROLL(decode); \
  4778. c = hc[0+(ofs)]; \
  4779. x0 += decode[0+(ofs)*4] * c; \
  4780. x1 += decode[1+(ofs)*4] * c; \
  4781. x2 += decode[2+(ofs)*4] * c; \
  4782. x3 += decode[3+(ofs)*4] * c;
  4783. #define stbir__2_coeff_remnant( ofs ) \
  4784. STBIR_SIMD_NO_UNROLL(decode); \
  4785. c = hc[0+(ofs)]; \
  4786. x0 += decode[0+(ofs)*4] * c; \
  4787. x1 += decode[1+(ofs)*4] * c; \
  4788. x2 += decode[2+(ofs)*4] * c; \
  4789. x3 += decode[3+(ofs)*4] * c; \
  4790. c = hc[1+(ofs)]; \
  4791. y0 += decode[4+(ofs)*4] * c; \
  4792. y1 += decode[5+(ofs)*4] * c; \
  4793. y2 += decode[6+(ofs)*4] * c; \
  4794. y3 += decode[7+(ofs)*4] * c;
  4795. #define stbir__3_coeff_remnant( ofs ) \
  4796. STBIR_SIMD_NO_UNROLL(decode); \
  4797. c = hc[0+(ofs)]; \
  4798. x0 += decode[0+(ofs)*4] * c; \
  4799. x1 += decode[1+(ofs)*4] * c; \
  4800. x2 += decode[2+(ofs)*4] * c; \
  4801. x3 += decode[3+(ofs)*4] * c; \
  4802. c = hc[1+(ofs)]; \
  4803. y0 += decode[4+(ofs)*4] * c; \
  4804. y1 += decode[5+(ofs)*4] * c; \
  4805. y2 += decode[6+(ofs)*4] * c; \
  4806. y3 += decode[7+(ofs)*4] * c; \
  4807. c = hc[2+(ofs)]; \
  4808. x0 += decode[8+(ofs)*4] * c; \
  4809. x1 += decode[9+(ofs)*4] * c; \
  4810. x2 += decode[10+(ofs)*4] * c; \
  4811. x3 += decode[11+(ofs)*4] * c;
#define stbir__store_output() \
  output[0] = x0 + y0; \
  output[1] = x1 + y1; \
  output[2] = x2 + y2; \
  output[3] = x3 + y3; \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 4;
#endif
#define STBIR__horizontal_channels 4
#define STB_IMAGE_RESIZE_DO_HORIZONTALS
#include STBIR__HEADER_FILENAME
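// Note: STBIR__HEADER_FILENAME is this file itself. Defining STBIR__horizontal_channels
// and STB_IMAGE_RESIZE_DO_HORIZONTALS and then re-including instantiates the shared
// horizontal gather loop template in terms of the stbir__*_coeff* macros defined just
// above; each channel-count section below repeats this same pattern with its own macro set.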
//=================
// Do 7 channel horizontal routines
#ifdef STBIR_SIMD
#define stbir__1_coeff_only() \
  stbir__simdf tot0,tot1,c; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1( c, hc ); \
  stbir__simdf_0123to0000( c, c ); \
  stbir__simdf_mult_mem( tot0, c, decode ); \
  stbir__simdf_mult_mem( tot1, c, decode+3 );
#define stbir__2_coeff_only() \
  stbir__simdf tot0,tot1,c,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load2( cs, hc ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_mult_mem( tot0, c, decode ); \
  stbir__simdf_mult_mem( tot1, c, decode+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );
#define stbir__3_coeff_only() \
  stbir__simdf tot0,tot1,c,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_mult_mem( tot0, c, decode ); \
  stbir__simdf_mult_mem( tot1, c, decode+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+10 ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
#define stbir__store_output_tiny() \
  stbir__simdf_store( output+3, tot1 ); \
  stbir__simdf_store( output, tot0 ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 7;
#ifdef STBIR_SIMD8
#define stbir__4_coeff_start() \
  stbir__simdf8 tot0,tot1,c,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc ); \
  stbir__simdf8_0123to00000000( c, cs ); \
  stbir__simdf8_mult_mem( tot0, c, decode ); \
  stbir__simdf8_0123to11111111( c, cs ); \
  stbir__simdf8_mult_mem( tot1, c, decode+7 ); \
  stbir__simdf8_0123to22222222( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 ); \
  stbir__simdf8_0123to33333333( c, cs ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc + (ofs) ); \
  stbir__simdf8_0123to00000000( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf8_0123to11111111( c, cs ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
  stbir__simdf8_0123to22222222( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
  stbir__simdf8_0123to33333333( c, cs ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
#define stbir__1_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load1b( c, hc + (ofs) ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
#define stbir__2_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load1b( c, hc + (ofs) ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf8_load1b( c, hc + (ofs)+1 ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );
#define stbir__3_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf8_load4b( cs, hc + (ofs) ); \
  stbir__simdf8_0123to00000000( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf8_0123to11111111( c, cs ); \
  stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \
  stbir__simdf8_0123to22222222( c, cs ); \
  stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
#define stbir__store_output() \
  stbir__simdf8_add( tot0, tot0, tot1 ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 7; \
  if ( output < output_end ) \
  { \
    stbir__simdf8_store( output-7, tot0 ); \
    continue; \
  } \
  stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
  stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) ); \
  break;
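// The unconditional 8-wide store above writes one float past the 7-channel pixel, which
// is harmless mid-scanline (the next pixel overwrites it) but would overrun the buffer
// on the final pixel. Hence the tail case: the top 4 lanes are stored shifted down one
// slot, then the low 4 lanes are stored on top (the second store fixes up the one
// overlapped float), and the pixel loop breaks out.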
#else
#define stbir__4_coeff_start() \
  stbir__simdf tot0,tot1,tot2,tot3,c,cs; \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_mult_mem( tot0, c, decode ); \
  stbir__simdf_mult_mem( tot1, c, decode+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_mult_mem( tot2, c, decode+7 ); \
  stbir__simdf_mult_mem( tot3, c, decode+10 ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); \
  stbir__simdf_0123to3333( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+21 ); \
  stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc + (ofs) ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
  stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); \
  stbir__simdf_0123to3333( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 ); \
  stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
#define stbir__1_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load1( c, hc + (ofs) ); \
  stbir__simdf_0123to0000( c, c ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );
#define stbir__2_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load2( cs, hc + (ofs) ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
  stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
#define stbir__3_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  stbir__simdf_load( cs, hc + (ofs) ); \
  stbir__simdf_0123to0000( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \
  stbir__simdf_0123to1111( c, cs ); \
  stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \
  stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \
  stbir__simdf_0123to2222( c, cs ); \
  stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
  stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
#define stbir__store_output() \
  stbir__simdf_add( tot0, tot0, tot2 ); \
  stbir__simdf_add( tot1, tot1, tot3 ); \
  stbir__simdf_store( output+3, tot1 ); \
  stbir__simdf_store( output, tot0 ); \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 7;
#endif
#else
#define stbir__1_coeff_only() \
  float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
  c = hc[0]; \
  tot0 = decode[0]*c; \
  tot1 = decode[1]*c; \
  tot2 = decode[2]*c; \
  tot3 = decode[3]*c; \
  tot4 = decode[4]*c; \
  tot5 = decode[5]*c; \
  tot6 = decode[6]*c;
#define stbir__2_coeff_only() \
  float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
  c = hc[0]; \
  tot0 = decode[0]*c; \
  tot1 = decode[1]*c; \
  tot2 = decode[2]*c; \
  tot3 = decode[3]*c; \
  tot4 = decode[4]*c; \
  tot5 = decode[5]*c; \
  tot6 = decode[6]*c; \
  c = hc[1]; \
  tot0 += decode[7]*c; \
  tot1 += decode[8]*c; \
  tot2 += decode[9]*c; \
  tot3 += decode[10]*c; \
  tot4 += decode[11]*c; \
  tot5 += decode[12]*c; \
  tot6 += decode[13]*c;
#define stbir__3_coeff_only() \
  float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
  c = hc[0]; \
  tot0 = decode[0]*c; \
  tot1 = decode[1]*c; \
  tot2 = decode[2]*c; \
  tot3 = decode[3]*c; \
  tot4 = decode[4]*c; \
  tot5 = decode[5]*c; \
  tot6 = decode[6]*c; \
  c = hc[1]; \
  tot0 += decode[7]*c; \
  tot1 += decode[8]*c; \
  tot2 += decode[9]*c; \
  tot3 += decode[10]*c; \
  tot4 += decode[11]*c; \
  tot5 += decode[12]*c; \
  tot6 += decode[13]*c; \
  c = hc[2]; \
  tot0 += decode[14]*c; \
  tot1 += decode[15]*c; \
  tot2 += decode[16]*c; \
  tot3 += decode[17]*c; \
  tot4 += decode[18]*c; \
  tot5 += decode[19]*c; \
  tot6 += decode[20]*c;
#define stbir__store_output_tiny() \
  output[0] = tot0; \
  output[1] = tot1; \
  output[2] = tot2; \
  output[3] = tot3; \
  output[4] = tot4; \
  output[5] = tot5; \
  output[6] = tot6; \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 7;
#define stbir__4_coeff_start() \
  float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
  STBIR_SIMD_NO_UNROLL(decode); \
  c = hc[0]; \
  x0 = decode[0] * c; \
  x1 = decode[1] * c; \
  x2 = decode[2] * c; \
  x3 = decode[3] * c; \
  x4 = decode[4] * c; \
  x5 = decode[5] * c; \
  x6 = decode[6] * c; \
  c = hc[1]; \
  y0 = decode[7] * c; \
  y1 = decode[8] * c; \
  y2 = decode[9] * c; \
  y3 = decode[10] * c; \
  y4 = decode[11] * c; \
  y5 = decode[12] * c; \
  y6 = decode[13] * c; \
  c = hc[2]; \
  x0 += decode[14] * c; \
  x1 += decode[15] * c; \
  x2 += decode[16] * c; \
  x3 += decode[17] * c; \
  x4 += decode[18] * c; \
  x5 += decode[19] * c; \
  x6 += decode[20] * c; \
  c = hc[3]; \
  y0 += decode[21] * c; \
  y1 += decode[22] * c; \
  y2 += decode[23] * c; \
  y3 += decode[24] * c; \
  y4 += decode[25] * c; \
  y5 += decode[26] * c; \
  y6 += decode[27] * c;
#define stbir__4_coeff_continue_from_4( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  c = hc[0+(ofs)]; \
  x0 += decode[0+(ofs)*7] * c; \
  x1 += decode[1+(ofs)*7] * c; \
  x2 += decode[2+(ofs)*7] * c; \
  x3 += decode[3+(ofs)*7] * c; \
  x4 += decode[4+(ofs)*7] * c; \
  x5 += decode[5+(ofs)*7] * c; \
  x6 += decode[6+(ofs)*7] * c; \
  c = hc[1+(ofs)]; \
  y0 += decode[7+(ofs)*7] * c; \
  y1 += decode[8+(ofs)*7] * c; \
  y2 += decode[9+(ofs)*7] * c; \
  y3 += decode[10+(ofs)*7] * c; \
  y4 += decode[11+(ofs)*7] * c; \
  y5 += decode[12+(ofs)*7] * c; \
  y6 += decode[13+(ofs)*7] * c; \
  c = hc[2+(ofs)]; \
  x0 += decode[14+(ofs)*7] * c; \
  x1 += decode[15+(ofs)*7] * c; \
  x2 += decode[16+(ofs)*7] * c; \
  x3 += decode[17+(ofs)*7] * c; \
  x4 += decode[18+(ofs)*7] * c; \
  x5 += decode[19+(ofs)*7] * c; \
  x6 += decode[20+(ofs)*7] * c; \
  c = hc[3+(ofs)]; \
  y0 += decode[21+(ofs)*7] * c; \
  y1 += decode[22+(ofs)*7] * c; \
  y2 += decode[23+(ofs)*7] * c; \
  y3 += decode[24+(ofs)*7] * c; \
  y4 += decode[25+(ofs)*7] * c; \
  y5 += decode[26+(ofs)*7] * c; \
  y6 += decode[27+(ofs)*7] * c;
#define stbir__1_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  c = hc[0+(ofs)]; \
  x0 += decode[0+(ofs)*7] * c; \
  x1 += decode[1+(ofs)*7] * c; \
  x2 += decode[2+(ofs)*7] * c; \
  x3 += decode[3+(ofs)*7] * c; \
  x4 += decode[4+(ofs)*7] * c; \
  x5 += decode[5+(ofs)*7] * c; \
  x6 += decode[6+(ofs)*7] * c;
#define stbir__2_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  c = hc[0+(ofs)]; \
  x0 += decode[0+(ofs)*7] * c; \
  x1 += decode[1+(ofs)*7] * c; \
  x2 += decode[2+(ofs)*7] * c; \
  x3 += decode[3+(ofs)*7] * c; \
  x4 += decode[4+(ofs)*7] * c; \
  x5 += decode[5+(ofs)*7] * c; \
  x6 += decode[6+(ofs)*7] * c; \
  c = hc[1+(ofs)]; \
  y0 += decode[7+(ofs)*7] * c; \
  y1 += decode[8+(ofs)*7] * c; \
  y2 += decode[9+(ofs)*7] * c; \
  y3 += decode[10+(ofs)*7] * c; \
  y4 += decode[11+(ofs)*7] * c; \
  y5 += decode[12+(ofs)*7] * c; \
  y6 += decode[13+(ofs)*7] * c;
#define stbir__3_coeff_remnant( ofs ) \
  STBIR_SIMD_NO_UNROLL(decode); \
  c = hc[0+(ofs)]; \
  x0 += decode[0+(ofs)*7] * c; \
  x1 += decode[1+(ofs)*7] * c; \
  x2 += decode[2+(ofs)*7] * c; \
  x3 += decode[3+(ofs)*7] * c; \
  x4 += decode[4+(ofs)*7] * c; \
  x5 += decode[5+(ofs)*7] * c; \
  x6 += decode[6+(ofs)*7] * c; \
  c = hc[1+(ofs)]; \
  y0 += decode[7+(ofs)*7] * c; \
  y1 += decode[8+(ofs)*7] * c; \
  y2 += decode[9+(ofs)*7] * c; \
  y3 += decode[10+(ofs)*7] * c; \
  y4 += decode[11+(ofs)*7] * c; \
  y5 += decode[12+(ofs)*7] * c; \
  y6 += decode[13+(ofs)*7] * c; \
  c = hc[2+(ofs)]; \
  x0 += decode[14+(ofs)*7] * c; \
  x1 += decode[15+(ofs)*7] * c; \
  x2 += decode[16+(ofs)*7] * c; \
  x3 += decode[17+(ofs)*7] * c; \
  x4 += decode[18+(ofs)*7] * c; \
  x5 += decode[19+(ofs)*7] * c; \
  x6 += decode[20+(ofs)*7] * c;
#define stbir__store_output() \
  output[0] = x0 + y0; \
  output[1] = x1 + y1; \
  output[2] = x2 + y2; \
  output[3] = x3 + y3; \
  output[4] = x4 + y4; \
  output[5] = x5 + y5; \
  output[6] = x6 + y6; \
  horizontal_coefficients += coefficient_width; \
  ++horizontal_contributors; \
  output += 7;
#endif
#define STBIR__horizontal_channels 7
#define STB_IMAGE_RESIZE_DO_HORIZONTALS
#include STBIR__HEADER_FILENAME
// include all of the vertical resamplers (both scatter and gather versions)
#define STBIR__vertical_channels 1
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 1
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 2
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 2
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 3
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 3
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 4
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 4
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 5
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 5
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 6
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 6
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 7
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 7
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 8
#define STB_IMAGE_RESIZE_DO_VERTICALS
#include STBIR__HEADER_FILENAME
#define STBIR__vertical_channels 8
#define STB_IMAGE_RESIZE_DO_VERTICALS
#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#include STBIR__HEADER_FILENAME
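// Each vertical channel count above is instantiated twice: the plain version generates
// the "set" resamplers that write the first batch of coefficients into the destination,
// and the STB_IMAGE_RESIZE_VERTICAL_CONTINUE version generates the "_cont" resamplers
// that accumulate further batches on top. The dispatch tables below pick between them.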
typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
{
  stbir__vertical_gather_with_1_coeffs, stbir__vertical_gather_with_2_coeffs, stbir__vertical_gather_with_3_coeffs, stbir__vertical_gather_with_4_coeffs,
  stbir__vertical_gather_with_5_coeffs, stbir__vertical_gather_with_6_coeffs, stbir__vertical_gather_with_7_coeffs, stbir__vertical_gather_with_8_coeffs
};
static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
{
  stbir__vertical_gather_with_1_coeffs_cont, stbir__vertical_gather_with_2_coeffs_cont, stbir__vertical_gather_with_3_coeffs_cont, stbir__vertical_gather_with_4_coeffs_cont,
  stbir__vertical_gather_with_5_coeffs_cont, stbir__vertical_gather_with_6_coeffs_cont, stbir__vertical_gather_with_7_coeffs_cont, stbir__vertical_gather_with_8_coeffs_cont
};
typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
{
  stbir__vertical_scatter_with_1_coeffs, stbir__vertical_scatter_with_2_coeffs, stbir__vertical_scatter_with_3_coeffs, stbir__vertical_scatter_with_4_coeffs,
  stbir__vertical_scatter_with_5_coeffs, stbir__vertical_scatter_with_6_coeffs, stbir__vertical_scatter_with_7_coeffs, stbir__vertical_scatter_with_8_coeffs
};
static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
{
  stbir__vertical_scatter_with_1_coeffs_cont, stbir__vertical_scatter_with_2_coeffs_cont, stbir__vertical_scatter_with_3_coeffs_cont, stbir__vertical_scatter_with_4_coeffs_cont,
  stbir__vertical_scatter_with_5_coeffs_cont, stbir__vertical_scatter_with_6_coeffs_cont, stbir__vertical_scatter_with_7_coeffs_cont, stbir__vertical_scatter_with_8_coeffs_cont
};
static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
{
  int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
  int channels = stbir_info->channels;
  int width_times_channels = num_pixels * channels;
  void * output_buffer;
  // un-alpha weight if we need to
  if ( stbir_info->alpha_unweight )
  {
    STBIR_PROFILE_START( unalpha );
    stbir_info->alpha_unweight( encode_buffer, width_times_channels );
    STBIR_PROFILE_END( unalpha );
  }
  // write directly into output by default
  output_buffer = output_buffer_data;
  // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
  if ( stbir_info->out_pixels_cb )
    output_buffer = encode_buffer;
  STBIR_PROFILE_START( encode );
  // convert into the output buffer
  stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
  STBIR_PROFILE_END( encode );
  // if we have an output callback, call it to send the data
  if ( stbir_info->out_pixels_cb )
    stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
}
// Get the ring buffer pointer for an index
static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
{
  STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
  #ifdef STBIR__SEPARATE_ALLOCATIONS
    return split_info->ring_buffers[ index ];
  #else
    return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
  #endif
}
// Get the specified scan line from the ring buffer
static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
{
  int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
  return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
}
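// Worked example of the mapping above: with 4 ring buffer entries, begin_index == 1
// and first_scanline == 10, scanline 12 lands in entry (1 + (12 - 10)) % 4 == 3.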
static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
{
  float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
  STBIR_PROFILE_START( horizontal );
  if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
    STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
  else
    stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
  STBIR_PROFILE_END( horizontal );
}
static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
{
  float* encode_buffer = split_info->vertical_buffer;
  float* decode_buffer = split_info->decode_buffer;
  int vertical_first = stbir_info->vertical_first;
  int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
  int width_times_channels = stbir_info->effective_channels * width;
  STBIR_ASSERT( stbir_info->vertical.is_gather );
  // loop over the contributing scanlines and scale into the buffer
  STBIR_PROFILE_START( vertical );
  {
    int k = 0, total = contrib_n1 - contrib_n0 + 1;
    STBIR_ASSERT( total > 0 );
    do {
      float const * inputs[8];
      int i, cnt = total; if ( cnt > 8 ) cnt = 8;
      for( i = 0 ; i < cnt ; i++ )
        inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
      // call the N scanlines at a time function (up to 8 scanlines of blending at once)
      ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
      k += cnt;
      total -= cnt;
    } while ( total );
  }
  STBIR_PROFILE_END( vertical );
  if ( vertical_first )
  {
    // Now resample the gathered vertical data in the horizontal axis into the encode buffer
    decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
    decode_buffer[ width_times_channels+1 ] = 0.0f;
    stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  }
  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
                          encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
}
static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
{
  int ring_buffer_index;
  float* ring_buffer;
  // Decode the nth scanline from the source image into the decode buffer.
  stbir__decode_scanline( stbir_info, n, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  // update new end scanline
  split_info->ring_buffer_last_scanline = n;
  // get ring buffer
  ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
  ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
  // Now resample it into the ring buffer.
  stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
}
static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
{
  int y, start_output_y, end_output_y;
  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
  float const * vertical_coefficients = stbir_info->vertical.coefficients;
  STBIR_ASSERT( stbir_info->vertical.is_gather );
  start_output_y = split_info->start_output_y;
  end_output_y = split_info[split_count-1].end_output_y;
  vertical_contributors += start_output_y;
  vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
  // initialize the ring buffer for gathering
  split_info->ring_buffer_begin_index = 0;
  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
  split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
  for (y = start_output_y; y < end_output_y; y++)
  {
    int in_first_scanline, in_last_scanline;
    in_first_scanline = vertical_contributors->n0;
    in_last_scanline = vertical_contributors->n1;
    // make sure the indexing hasn't broken
    STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
    // Load in new scanlines
    while (in_last_scanline > split_info->ring_buffer_last_scanline)
    {
      STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
      // make sure there is room in the ring buffer before we add a new scanline
      if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
      {
        split_info->ring_buffer_first_scanline++;
        split_info->ring_buffer_begin_index++;
      }
      if ( stbir_info->vertical_first )
      {
        float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
        // Decode the nth scanline from the source image into the decode buffer.
        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
      }
      else
      {
        stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
      }
    }
    // Now all buffers should be ready to write a row of vertical sampling, so do it.
    stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
    ++vertical_contributors;
    vertical_coefficients += stbir_info->vertical.coefficient_width;
  }
}
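// Note that the gather loop only ever evicts from the front of the ring buffer and
// appends at the back; this relies on the contributors' n0 values being non-decreasing
// as y advances (checked by the assert above).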
#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F
#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER)
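// The scatter path marks an unused ring buffer row by writing this marker into its
// first float. 3.0e+38f sits near FLT_MAX and is effectively never produced by real
// resampling output, so testing a single float is enough to distinguish a fresh row
// from one that already holds partial sums.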
static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
{
  // evict a scanline out into the output buffer
  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
  // dump the scanline out
  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  // mark it as empty
  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
  // advance the first scanline
  split_info->ring_buffer_first_scanline++;
  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
    split_info->ring_buffer_begin_index = 0;
}
static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
{
  // evict a scanline out into the output buffer
  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
  // Now resample it into the buffer.
  stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  // dump the scanline out
  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
  // mark it as empty
  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
  // advance the first scanline
  split_info->ring_buffer_first_scanline++;
  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
    split_info->ring_buffer_begin_index = 0;
}
static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
{
  STBIR_ASSERT( !stbir_info->vertical.is_gather );
  STBIR_PROFILE_START( vertical );
  {
    int k = 0, total = n1 - n0 + 1;
    STBIR_ASSERT( total > 0 );
    do {
      float * outputs[8];
      int i, n = total; if ( n > 8 ) n = 8;
      for( i = 0 ; i < n ; i++ )
      {
        outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
        if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
        {
          n = i;
          break;
        }
      }
      // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
      ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
      k += n;
      total -= n;
    } while ( total );
  }
  STBIR_PROFILE_END( vertical );
}
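// The run-splitting above matters because the two dispatch tables differ in semantics:
// the "set" functions overwrite an empty row with coefficient * input, while the
// "blend" functions accumulate into existing partial sums. Mixing row types within one
// call would either clobber partial data or blend against the empty marker.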
typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
{
  int y, start_output_y, end_output_y, start_input_y, end_input_y;
  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
  float const * vertical_coefficients = stbir_info->vertical.coefficients;
  stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
  void * scanline_scatter_buffer;
  void * scanline_scatter_buffer_end;
  int on_first_input_y, last_input_y;
  int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
  int width_times_channels = stbir_info->effective_channels * width;
  STBIR_ASSERT( !stbir_info->vertical.is_gather );
  start_output_y = split_info->start_output_y;
  end_output_y = split_info[split_count-1].end_output_y; // may do multiple split counts
  start_input_y = split_info->start_input_y;
  end_input_y = split_info[split_count-1].end_input_y;
  // adjust for starting offset start_input_y
  y = start_input_y + stbir_info->vertical.filter_pixel_margin;
  vertical_contributors += y;
  vertical_coefficients += stbir_info->vertical.coefficient_width * y;
  if ( stbir_info->vertical_first )
  {
    handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
    scanline_scatter_buffer = split_info->decode_buffer;
    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
  }
  else
  {
    handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
    scanline_scatter_buffer = split_info->vertical_buffer;
    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
  }
  // initialize the ring buffer for scattering
  split_info->ring_buffer_first_scanline = start_output_y;
  split_info->ring_buffer_last_scanline = -1;
  split_info->ring_buffer_begin_index = -1;
  // mark all the buffers as empty to start
  for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
  {
    float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y );
    decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3
    decode_buffer[ width_times_channels+1 ] = 0.0f;
    decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
  }
  // do the loop in input space
  on_first_input_y = 1; last_input_y = start_input_y;
  for (y = start_input_y ; y < end_input_y; y++)
  {
    int out_first_scanline, out_last_scanline;
    out_first_scanline = vertical_contributors->n0;
    out_last_scanline = vertical_contributors->n1;
    STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
    if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
    {
      float const * vc = vertical_coefficients;
      // keep track of the range actually seen for the next resize
      last_input_y = y;
      if ( ( on_first_input_y ) && ( y > start_input_y ) )
        split_info->start_input_y = y;
      on_first_input_y = 0;
      // clip the region
      if ( out_first_scanline < start_output_y )
      {
        vc += start_output_y - out_first_scanline;
        out_first_scanline = start_output_y;
      }
      if ( out_last_scanline >= end_output_y )
        out_last_scanline = end_output_y - 1;
      // if very first scanline, init the index
      if (split_info->ring_buffer_begin_index < 0)
        split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
      STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
      // Decode the nth scanline from the source image into the decode buffer.
      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
      // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
      if ( !stbir_info->vertical_first )
        stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
      // Now it's sitting in the buffer ready to be distributed into the ring buffers.
      // if the ring buffer is full and this scanline extends past it, evict the oldest scanline to the output
      if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
           ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
        handle_scanline_for_scatter( stbir_info, split_info );
      // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
      stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
      // update the end of the buffer
      if ( out_last_scanline > split_info->ring_buffer_last_scanline )
        split_info->ring_buffer_last_scanline = out_last_scanline;
    }
    ++vertical_contributors;
    vertical_coefficients += stbir_info->vertical.coefficient_width;
  }
  // now evict the scanlines that are left over in the ring buffer
  while ( split_info->ring_buffer_first_scanline < end_output_y )
    handle_scanline_for_scatter(stbir_info, split_info);
  // update the end_input_y if we do multiple resizes with the same data
  ++last_input_y;
  for( y = 0 ; y < split_count; y++ )
    if ( split_info[y].end_input_y > last_input_y )
      split_info[y].end_input_y = last_input_y;
}
static stbir__kernel_callback * stbir__builtin_kernels[] = { 0, stbir__filter_trapezoid, stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point };
static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one, stbir__support_two, stbir__support_two, stbir__support_two, stbir__support_zeropoint5 };
static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
{
  // set filter
  if (filter == 0)
  {
    filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
    if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
    {
      if ( ( scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
        filter = STBIR_FILTER_POINT_SAMPLE;
      else
        filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
    }
  }
  samp->filter_enum = filter;
  STBIR_ASSERT(samp->filter_enum != 0);
  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
  samp->filter_kernel = stbir__builtin_kernels[ filter ];
  samp->filter_support = stbir__builtin_supports[ filter ];
  if ( kernel && support )
  {
    samp->filter_kernel = kernel;
    samp->filter_support = support;
    samp->filter_enum = STBIR_FILTER_OTHER;
  }
  samp->edge = edge;
  samp->filter_pixel_width = stbir__get_filter_pixel_width( samp->filter_support, scale_info->scale, user_data );
  // Gather is always better, but in extreme downsamples, you have to have most or all
  // of the data in memory.
  // For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
  // For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
  // scanlines in memory at once).
  samp->is_gather = 0;
  if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
    samp->is_gather = 1;
  else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
    samp->is_gather = 2;
  // pre calculate stuff based on the above
  samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
  // filter_pixel_width is the conservative size in pixels of input that affects an output pixel.
  // In rare cases (only 2 pix to 1 pix with the default filters), it's possible that the
  // filter will extend before or after the scanline beyond just one extra entire copy of the
  // scanline (we would hit the edge twice). We don't let you do that, so we clamp the total
  // width to 3x the total input pixels (once for the scanline, once for the left side
  // overhang, and once for the right side). We only do this for wrap mode, since the other
  // edge modes can simply clamp back onto the scanline again.
  if ( edge == STBIR_EDGE_WRAP )
    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
      samp->filter_pixel_width = scale_info->input_full_size * 3;
  // This is how much to expand buffers to account for filters seeking outside
  // the image boundaries.
  samp->filter_pixel_margin = samp->filter_pixel_width / 2;
  // filter_pixel_margin is the amount that this filter can overhang on just one side of either
  // end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's
  // worth of pixels, we clamp this one side of overhang to the input scanline size. Again,
  // this clamping only happens in rare cases with the default filters (2 pix to 1 pix).
  if ( edge == STBIR_EDGE_WRAP )
    if ( samp->filter_pixel_margin > scale_info->input_full_size )
      samp->filter_pixel_margin = scale_info->input_full_size;
  samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
  samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
  samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // the extra floats are padding
  samp->gather_prescatter_contributors = 0;
  samp->gather_prescatter_coefficients = 0;
  if ( samp->is_gather == 0 )
  {
    samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
    samp->gather_prescatter_num_contributors = stbir__get_contributors(samp, 2);
    samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
    samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
  }
}
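// Summary of the is_gather values chosen above: 1 == gather for upsampling (and 1:1),
// 2 == gather kept for a downsample because the caller forced it (horizontal always
// does) or the filter only spans a few scanlines, 0 == fall back to vertical scatter.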
static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
{
  float scale = samp->scale_info.scale;
  float out_shift = samp->scale_info.pixel_shift;
  stbir__support_callback * support = samp->filter_support;
  int input_full_size = samp->scale_info.input_full_size;
  stbir_edge edge = samp->edge;
  float inv_scale = samp->scale_info.inv_scale;
  STBIR_ASSERT( samp->is_gather != 0 );
  if ( samp->is_gather == 1 )
  {
    int in_first_pixel, in_last_pixel;
    float out_filter_radius = support(inv_scale, user_data) * scale;
    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
    range->n0 = in_first_pixel;
    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
    range->n1 = in_last_pixel;
  }
  else if ( samp->is_gather == 2 ) // downsample gather, refine
  {
    float in_pixels_radius = support(scale, user_data) * inv_scale;
    int filter_pixel_margin = samp->filter_pixel_margin;
    int output_sub_size = samp->scale_info.output_sub_size;
    int input_end;
    int n;
    int in_first_pixel, in_last_pixel;
    // get a conservative area of the input range
    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
    range->n0 = in_first_pixel;
    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
    range->n1 = in_last_pixel;
    // now go through the margin to the start of area to find bottom
    n = range->n0 + 1;
    input_end = -filter_pixel_margin;
    while( n >= input_end )
    {
      int out_first_pixel, out_last_pixel;
      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
      if ( out_first_pixel > out_last_pixel )
        break;
      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
        range->n0 = n;
      --n;
    }
    // now go through the end of the area through the margin to find top
    n = range->n1 - 1;
    input_end = n + 1 + filter_pixel_margin;
    while( n <= input_end )
    {
      int out_first_pixel, out_last_pixel;
      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
      if ( out_first_pixel > out_last_pixel )
        break;
      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
        range->n1 = n;
      ++n;
    }
  }
  if ( samp->edge == STBIR_EDGE_WRAP )
  {
    // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
    if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
    {
      int marg = range->n1 - input_full_size + 1;
      if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
        range->n0 = 0;
    }
    if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
    {
      int marg = -range->n0;
      if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
        range->n1 = input_full_size - 1;
    }
  }
  else
  {
    // for non-edge-wrap modes, we never read over the edge, so clamp
    if ( range->n0 < 0 )
      range->n0 = 0;
    if ( range->n1 >= input_full_size )
      range->n1 = input_full_size - 1;
  }
}
static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs )
{
  int i, cur;
  int left = output_height;
  cur = 0;
  for( i = 0 ; i < splits ; i++ )
  {
    int each;
    split_info[i].start_output_y = cur;
    each = left / ( splits - i );
    split_info[i].end_output_y = cur + each;
    // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have
    // a "special" set of coefficients. Basically, with exactly the right filter at exactly the right
    // resize at exactly the right phase, some of the coefficients can be zero. When they are zero, we
    // don't process them at all. But this leads to a tricky thing with the thread splits, where we
    // might have a set of two coeffs like this for example: (4,4) and (3,6). The 4,4 means there was
    // just one single coeff because things worked out perfectly (normally, they all have 4 coeffs,
    // like the range 3,6). The problem is that if we start right on the (4,4) on a brand new thread,
    // then when we get to (3,6), we don't have the "3" sample in memory (we didn't load
    // it on the initial (4,4) range because it didn't include a 3 -- we only add new samples that are
    // larger than our existing samples; it's just how the eviction works). So, our solution here
    // is pretty simple: if we start right on a range that has samples that start earlier, then we
    // simply bump up the previous thread split's range to include it, and then start this thread's
    // range with the smaller sample. It just moves one scanline from one thread split to another,
    // so that we end with the unusual one instead of starting with it. To do this, we check 2-4
    // samples at each thread split start and then occasionally move them.
    if ( ( is_gather ) && ( i ) )
    {
      stbir__contributors * small_contribs;
      int j, smallest, stop, start_n0;
      stbir__contributors * split_contribs = contribs + cur;
      // scan for a max of 3x the filter width or until the next thread split
      stop = vertical_pixel_margin * 3;
      if ( each < stop )
        stop = each;
      // loops a few times before early out
      smallest = 0;
      small_contribs = split_contribs;
      start_n0 = small_contribs->n0;
      for( j = 1 ; j <= stop ; j++ )
      {
        ++split_contribs;
        if ( split_contribs->n0 > start_n0 )
          break;
        if ( split_contribs->n0 < small_contribs->n0 )
        {
          small_contribs = split_contribs;
          smallest = j;
        }
      }
      split_info[i-1].end_output_y += smallest;
      split_info[i].start_output_y += smallest;
    }
    cur += each;
    left -= each;
    // scatter range (updated to minimum as you run it)
    split_info[i].start_input_y = -vertical_pixel_margin;
    split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
  }
}
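// Example of the even split above: output_height == 10 across 3 splits yields rows
// [0,3), [3,6), [6,10) before the gather-start adjustment; the "left / (splits - i)"
// form keeps the chunk sizes within one scanline of each other.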
static void stbir__free_internal_mem( stbir__info *info )
{
  #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data ); } }
  if ( info )
  {
    #ifndef STBIR__SEPARATE_ALLOCATIONS
      STBIR__FREE_AND_CLEAR( info->alloced_mem );
    #else
      int i,j;
      if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) )
      {
        STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
        STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
      }
      for( i = 0 ; i < info->splits ; i++ )
      {
        for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
        {
          #ifdef STBIR_SIMD8
          if ( info->effective_channels == 3 )
            --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
          #endif
          STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
        }
        #ifdef STBIR_SIMD8
        if ( info->effective_channels == 3 )
          --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
        #endif
        STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
        STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
        STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
      }
      STBIR__FREE_AND_CLEAR( info->split_info );
      if ( info->vertical.coefficients != info->horizontal.coefficients )
      {
        STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
        STBIR__FREE_AND_CLEAR( info->vertical.contributors );
      }
      STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
      STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
      STBIR__FREE_AND_CLEAR( info->alloced_mem );
      STBIR_FREE( info, info->user_data );
    #endif
  }
  #undef STBIR__FREE_AND_CLEAR
}
static int stbir__get_max_split( int splits, int height )
{
  int i;
  int max = 0;
  for( i = 0 ; i < splits ; i++ )
  {
    int each = height / ( splits - i );
    if ( each > max )
      max = each;
    height -= each;
  }
  return max;
}
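// Continuing the example above (height 10, 3 splits), the chunks are 3, 3 and 4, so
// this returns 4 -- the worst-case chunk a single split handles, presumably so
// per-split allocations can be sized conservatively.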
static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] =
{
  0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs,
  stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0, 0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
};
static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] =
{
  0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs,
  stbir__horizontal_gather_4_channels_funcs, 0, 0, stbir__horizontal_gather_7_channels_funcs
};
// resize classification buckets (STBIR_RESIZE_CLASSIFICATIONS == 8), matching the assignments in stbir__should_do_vertical_first below: 0 == vertical scatter, 1 == vertical gather <= 1x scale, 2 == vertical gather <= 2x scale, 3 == vertical gather <= 3x scale, 5 == vertical gather <= 4x scale, 6 == vertical gather > 4x scale or <= 4 pixel output height, 7 == <= 4 pixel wide output column (bucket 4 is not assigned by the classifier)
#define STBIR_RESIZE_CLASSIFICATIONS 8
static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4] = // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan
{
  {
    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },
    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },
    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },
    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },
  }, {
    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },
    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },
    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },
    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },
  }, {
    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },
    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },
    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },
    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },
  }, {
    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },
    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },
    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },
    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },
    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },
    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },
  }, {
    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },
    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },
    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },
    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },
    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },
    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },
  }
};
// structure that allows us to query and override info for training the costs
typedef struct STBIR__V_FIRST_INFO
{
  double v_cost, h_cost;
  int control_v_first; // 0 = no control, 1 = force horizontal, 2 = force vertical
  int v_first;
  int v_resize_classification;
  int is_gather;
} STBIR__V_FIRST_INFO;

#ifdef STBIR__V_FIRST_INFO_BUFFER
static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER
#else
#define STBIR__V_FIRST_INFO_POINTER 0
#endif
// Figure out whether to scale along the horizontal or vertical first.
//   This is only *super* important when you are scaling by a massively
//   different amount in the vertical vs the horizontal (for example, if
//   you are scaling by 2x in the width, and 0.5x in the height, then you
//   want to do the vertical scale first, because it's around 3x faster
//   in that order).
//
//   In more normal circumstances, this makes a 20-40% difference, so
//   it's good to get right, but not critical. The normal way that you
//   decide which direction goes first is just figuring out which
//   ordering does fewer multiplies. But with modern CPUs, with their
//   fancy caches, SIMD, and high IPC abilities, there's just a lot
//   more that goes into it.
//
//   My handwavy sort of solution is to have an app that does a whole
//   bunch of timing for both vertical and horizontal first modes,
//   and then another app that can read lots of these timing files
//   and try to search for the best weights to use. dotiming.c
//   is the app that does a bunch of timings, and vf_train.c is the
//   app that solves for the best weights (and shows how well it
//   does currently).
static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )
{
  double v_cost, h_cost;
  float * weights;
  int vertical_first;
  int v_classification;

  // categorize the resize into buckets
  if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) )
    v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
  else if ( vertical_scale <= 1.0f )
    v_classification = ( is_gather ) ? 1 : 0;
  else if ( vertical_scale <= 2.0f)
    v_classification = 2;
  else if ( vertical_scale <= 3.0f)
    v_classification = 3;
  else if ( vertical_scale <= 4.0f)
    v_classification = 5;
  else
    v_classification = 6;

  // use the right weights
  weights = weights_table[ v_classification ];

  // these are the costs when you don't take into account modern CPUs with high IPC, SIMD and caches - wish we had a better estimate
  h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
  v_cost = (float)vertical_filter_pixel_width * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3];

  // use computation estimate to decide vertical first or not
  vertical_first = ( v_cost <= h_cost ) ? 1 : 0;

  // save these, if requested
  if ( info )
  {
    info->h_cost = h_cost;
    info->v_cost = v_cost;
    info->v_resize_classification = v_classification;
    info->v_first = vertical_first;
    info->is_gather = is_gather;
  }

  // and this allows us to override everything for testing (see dotiming.c)
  if ( ( info ) && ( info->control_v_first ) )
    vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;

  return vertical_first;
}
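// Illustrative usage sketch (not part of the library): the
// STBIR__V_FIRST_INFO_BUFFER hook above lets you watch (or, via
// control_v_first, force) the vertical-first decision. When compiling the
// implementation you might do (my_v_first_info is an arbitrary name):
//
//    #define STBIR__V_FIRST_INFO_BUFFER my_v_first_info
//    #define STB_IMAGE_RESIZE_IMPLEMENTATION
//    #include "stb_image_resize2.h"
//
//    // after a resize, my_v_first_info.v_first, .v_cost and .h_cost hold
//    // the most recent decision and its cost estimates (the buffer is
//    // static, so it's only visible in this translation unit).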
// layout lookups - must match stbir_internal_pixel_layout
static unsigned char stbir__pixel_channels[] = {
  1,2,3,3,4,    // 1ch, 2ch, rgb, bgr, 4ch
  4,4,4,4,2,2,  // RGBA,BGRA,ARGB,ABGR,RA,AR
  4,4,4,4,2,2,  // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
};

// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA,
  STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
  STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
};
static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
{
  static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 };

  stbir__info * info = 0;
  void * alloced = 0;
  size_t alloced_total = 0;
  int vertical_first;
  size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size;
  int alloc_ring_buffer_num_entries;

  int alpha_weighting_type = 0; // 0=none, 1=simple weight in (leave premultiplied out), 2=fancy weight in and unweight out, 3=simple unweight out (premultiplied in), 4=simple weight in and unweight out
  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];
  stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
  int channels = stbir__pixel_channels[ input_pixel_layout ];
  int effective_channels = channels;
  // first figure out what type of alpha weighting to use (if any)
  if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
  {
    if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
    {
      if ( fast_alpha )
      {
        alpha_weighting_type = 4;
      }
      else
      {
        static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
        alpha_weighting_type = 2;
        effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
      }
    }
    else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
    {
      // input premult, output non-premult
      alpha_weighting_type = 3;
    }
    else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
    {
      // input non-premult, output premult
      alpha_weighting_type = 1;
    }
  }

  // channel in and out count must match currently
  if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
    return 0;
  // get vertical first
  vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );

  // we sometimes read one float past the end in some of the unrolled loops (with a zero coefficient weight, so it has no effect)
  //   we use a few extra floats instead of just 1, so that the input callback buffer can overlap with the decode buffer without
  //   the conversion routines overwriting the callback input data.
  decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger

  #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
  if ( effective_channels == 3 )
    decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
  #endif

  ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding

  // if we do vertical first, the ring buffer holds a whole decoded line
  if ( vertical_first )
    ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;

  if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias

  // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
  alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;

  // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
  if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
    alloc_ring_buffer_num_entries = conservative_split_output_size;

  ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes;

  // The vertical buffer is used differently, depending on whether we are scattering
  //   the vertical scanlines, or gathering them.
  //   If scattering, it's used as the temp buffer to accumulate each output.
  //   If gathering, it's just the output buffer.
  vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding
  // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
  for(;;)
  {
    int i;
    void * advance_mem = alloced;
    int copy_horizontal = 0;
    stbir__sampler * possibly_use_horizontal_for_pivot = 0;

    #ifdef STBIR__SEPARATE_ALLOCATIONS
    #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
    #else
    #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size));
    #endif

    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );

    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );

    if ( info )
    {
      static stbir__alpha_weight_func * fancy_alpha_weights[6] = { stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_2ch, stbir__fancy_alpha_weight_2ch };
      static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
      static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
      static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };

      // initialize info fields
      info->alloced_mem = alloced;
      info->alloced_total = alloced_total;

      info->channels = channels;
      info->effective_channels = effective_channels;

      info->offset_x = new_x;
      info->offset_y = new_y;

      info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries;
      info->ring_buffer_num_entries = 0;
      info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes;

      info->splits = splits;
      info->vertical_first = vertical_first;

      info->input_pixel_layout_internal = input_pixel_layout;
      info->output_pixel_layout_internal = output_pixel_layout;

      // setup alpha weight functions
      info->alpha_weight = 0;
      info->alpha_unweight = 0;

      // handle alpha weighting functions and overrides
      if ( alpha_weighting_type == 2 )
      {
        // high quality alpha multiplying on the way in, dividing on the way out
        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
        info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
      }
      else if ( alpha_weighting_type == 4 )
      {
        // fast alpha multiplying on the way in, dividing on the way out
        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
      }
      else if ( alpha_weighting_type == 1 )
      {
        // fast alpha on the way in, leave in premultiplied form on way out
        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
      }
      else if ( alpha_weighting_type == 3 )
      {
        // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
      }

      // handle 3-chan color flipping, using the alpha weight path
      if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
           ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
      {
        // do the flipping on the smaller of the two ends
        if ( horizontal->scale_info.scale < 1.0f )
          info->alpha_unweight = stbir__simple_flip_3ch;
        else
          info->alpha_weight = stbir__simple_flip_3ch;
      }
    }
    // get all the per-split buffers
    for( i = 0 ; i < splits ; i++ )
    {
      STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );

      #ifdef STBIR__SEPARATE_ALLOCATIONS

      #ifdef STBIR_SIMD8
      if ( ( info ) && ( effective_channels == 3 ) )
        ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
      #endif

      STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );

      {
        int j;
        for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
        {
          STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
          #ifdef STBIR_SIMD8
          if ( ( info ) && ( effective_channels == 3 ) )
            ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
          #endif
        }
      }

      #else

      STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );

      #endif

      STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
    }

    // alloc memory for to-be-pivoted coeffs (if necessary)
    if ( vertical->is_gather == 0 )
    {
      size_t both;
      size_t temp_mem_amt;

      // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
      //   that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
      //   is too small, we just allocate extra memory to use as this temp.
      both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size;

      #ifdef STBIR__SEPARATE_ALLOCATIONS
      temp_mem_amt = decode_buffer_size;
      #ifdef STBIR_SIMD8
      if ( effective_channels == 3 )
        --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer
      #endif
      #else
      temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits;
      #endif

      if ( temp_mem_amt >= both )
      {
        if ( info )
        {
          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer;
          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size );
        }
      }
      else
      {
        // ring+decode memory is too small, so allocate temp memory
        STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
        STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
      }
    }

    STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
    STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );

    // are the two filters identical?? (happens a lot with mipmap generation)
    if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
    {
      float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
      float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
      if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
      if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
      if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
      {
        if ( horizontal->is_gather == vertical->is_gather )
        {
          copy_horizontal = 1;
          goto no_vert_alloc;
        }
        // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
        possibly_use_horizontal_for_pivot = horizontal;
      }
    }

    STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
    STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
   no_vert_alloc:

    if ( info )
    {
      STBIR_PROFILE_BUILD_START( horizontal );

      stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );

      // setup the horizontal gather functions
      //   start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
      info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
      //   but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
      if ( horizontal->extent_info.widest <= 12 )
        info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];

      info->scanline_extents.conservative.n0 = conservative->n0;
      info->scanline_extents.conservative.n1 = conservative->n1;

      // get exact extents
      stbir__get_extents( horizontal, &info->scanline_extents );

      // pack the horizontal coeffs
      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );

      STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );

      STBIR_PROFILE_BUILD_END( horizontal );

      if ( copy_horizontal )
      {
        STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
      }
      else
      {
        STBIR_PROFILE_BUILD_START( vertical );

        stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );

        STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );

        STBIR_PROFILE_BUILD_END( vertical );
      }

      // setup the vertical split ranges
      stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors );

      // now we know precisely how many entries we need
      info->ring_buffer_num_entries = info->vertical.extent_info.widest;

      // we never need more ring buffer entries than the scanlines we're outputting
      if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
        info->ring_buffer_num_entries = conservative_split_output_size;

      STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
    }

    #undef STBIR__NEXT_PTR

    // is this the first time through the loop?
    if ( info == 0 )
    {
      alloced_total = ( 15 + (size_t)advance_mem );
      alloced = STBIR_MALLOC( alloced_total, user_data );
      if ( alloced == 0 )
        return 0;
    }
    else
      return info;  // success
  }
}
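// Illustrative sketch (not part of the library): STBIR__NEXT_PTR above
// implements a common two-pass allocation pattern - pass 1 runs with
// alloced == 0 and only advances a byte counter (16-byte aligning each
// sub-buffer), one STBIR_MALLOC is done, then pass 2 re-runs the same code
// to hand out the real pointers. A minimal standalone version of the same
// idea (ctx, buf, context_t and buf_bytes are hypothetical names):
//
//    context_t * ctx;  float * buf;
//    char * mem = 0;                 // 0 on the sizing pass
//    size_t sz = 0;
//    for(;;)
//    {
//      char * cur = mem;
//      #define NEXT_PTR( ptr, size, type ) \
//        cur = (char*)( ( ((size_t)cur) + 15 ) & ~(size_t)15 ); \
//        if ( mem ) (ptr) = (type*)cur; \
//        cur += (size);
//      NEXT_PTR( ctx, sizeof(*ctx), context_t );
//      NEXT_PTR( buf, buf_bytes, float );
//      #undef NEXT_PTR
//      if ( mem ) break;             // pass 2 handed out real pointers
//      sz = (size_t)cur + 15;        // pass 1 started at 0; +15 alignment slack
//      mem = (char*)malloc( sz );
//      if ( !mem ) return 0;
//    }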
static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count )
{
  stbir__per_split_info * split_info = info->split_info + split_start;

  STBIR_PROFILE_CLEAR_EXTRAS();

  STBIR_PROFILE_FIRST_START( looping );
  if (info->vertical.is_gather)
    stbir__vertical_gather_loop( info, split_info, split_count );
  else
    stbir__vertical_scatter_loop( info, split_info, split_count );
  STBIR_PROFILE_END( looping );

  return 1;
}
static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
{
  static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
  {
    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear,
  };

  static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
  {
    { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear },
    { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
    { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
    { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
    { /* RA */ stbir__decode_uint8_srgb2_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear },
    { /* AR */ stbir__decode_uint8_srgb2_linearalpha_AR, stbir__decode_uint8_srgb_AR, 0, stbir__decode_float_linear_AR, stbir__decode_half_float_linear_AR },
  };

  static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
  {
    { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
  };

  static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
  {
    { /* RGBA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } },
    { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA, stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
    { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB, stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
    { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR, stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
    { /* RA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } },
    { /* AR */ { stbir__decode_uint8_linear_scaled_AR, stbir__decode_uint8_linear_AR }, { stbir__decode_uint16_linear_scaled_AR, stbir__decode_uint16_linear_AR } }
  };

  static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
  {
    /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
  };

  static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
  {
    { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear },
    { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
    { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
    { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
    { /* RA */ stbir__encode_uint8_srgb2_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear },
    { /* AR */ stbir__encode_uint8_srgb2_linearalpha_AR, stbir__encode_uint8_srgb_AR, 0, stbir__encode_float_linear_AR, stbir__encode_half_float_linear_AR }
  };

  static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
  {
    { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
  };

  static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
  {
    { /* RGBA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } },
    { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA, stbir__encode_uint8_linear_BGRA }, { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
    { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB, stbir__encode_uint8_linear_ARGB }, { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
    { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR, stbir__encode_uint8_linear_ABGR }, { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
    { /* RA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } },
    { /* AR */ { stbir__encode_uint8_linear_scaled_AR, stbir__encode_uint8_linear_AR }, { stbir__encode_uint16_linear_scaled_AR, stbir__encode_uint16_linear_AR } }
  };

  stbir__decode_pixels_func * decode_pixels = 0;
  stbir__encode_pixels_func * encode_pixels = 0;
  stbir_datatype input_type, output_type;

  input_type = resize->input_data_type;
  output_type = resize->output_data_type;
  info->input_data = resize->input_pixels;
  info->input_stride_bytes = resize->input_stride_in_bytes;
  info->output_stride_bytes = resize->output_stride_in_bytes;

  // if we're completely point sampling, then we can turn off SRGB
  if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
  {
    if ( ( ( input_type == STBIR_TYPE_UINT8_SRGB ) || ( input_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) &&
         ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
    {
      input_type = STBIR_TYPE_UINT8;
      output_type = STBIR_TYPE_UINT8;
    }
  }

  // recalc the output and input strides
  if ( info->input_stride_bytes == 0 )
    info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];

  if ( info->output_stride_bytes == 0 )
    info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];

  // calc offset
  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );

  info->in_pixels_cb = resize->input_cb;
  info->user_data = resize->user_data;
  info->out_pixels_cb = resize->output_cb;

  // setup the input format converters
  if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
  {
    int non_scaled = 0;

    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
        non_scaled = 1;

    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
      decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
    else
      decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
  }
  else
  {
    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
      decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
    else
      decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
  }

  // setup the output format converters
  if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
  {
    int non_scaled = 0;

    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
        non_scaled = 1;

    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
      encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
    else
      encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
  }
  else
  {
    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
      encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
    else
      encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
  }

  info->input_type = input_type;
  info->output_type = output_type;
  info->decode_pixels = decode_pixels;
  info->encode_pixels = encode_pixels;
}
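// Illustrative example (not part of the library): the stride-of-zero
// convention above means "tightly packed", e.g. a 100 pixel wide
// STBIR_TYPE_UINT8 RGBA input gets input_stride_bytes = 4 * 100 * 1 = 400.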
static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
{
  double per, adj;
  int over;

  // do left/top edge
  if ( *outx < 0 )
  {
    per = ( (double)*outx ) / ( (double)*outsubw ); // is negative
    adj = per * ( *u1 - *u0 );
    *u0 -= adj; // increases u0
    *outx = 0;
  }

  // do right/bot edge
  over = outw - ( *outx + *outsubw );
  if ( over < 0 )
  {
    per = ( (double)over ) / ( (double)*outsubw ); // is negative
    adj = per * ( *u1 - *u0 );
    *u1 += adj; // decreases u1
    *outsubw = outw - *outx;
  }
}
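// Illustrative example (not part of the library): right-edge clip with
// outx=80, outsubw=40, outw=100, u0=0.0, u1=1.0 - the subrect hangs 20
// pixels off the right edge, so over = -20, per = -0.5, adj = -0.5,
// giving u1 = 0.5 and outsubw = 20: the 20 visible pixels sample the
// first half of the input range, exactly as the unclipped rect would have.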
// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom )  // limit_denom (1) or limit numer (0)
{
  double err;
  stbir_uint64 top, bot;
  stbir_uint64 numer_last = 0;
  stbir_uint64 denom_last = 1;
  stbir_uint64 numer_estimate = 1;
  stbir_uint64 denom_estimate = 0;

  // scale to past float error range
  top = (stbir_uint64)( f * (double)(1 << 25) );
  bot = 1 << 25;

  // keep refining, but usually stops in a few loops - usually 5 for bad cases
  for(;;)
  {
    stbir_uint64 est, temp;

    // hit limit, break out and do best full range estimate
    if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
      break;

    // is the current error less than 1 bit of a float? if so, we're done
    if ( denom_estimate )
    {
      err = ( (double)numer_estimate / (double)denom_estimate ) - f;
      if ( err < 0.0 ) err = -err;
      if ( err < ( 1.0 / (double)(1<<24) ) )
      {
        // yup, found it
        *numer = (stbir_uint32) numer_estimate;
        *denom = (stbir_uint32) denom_estimate;
        return 1;
      }
    }

    // no more refinement bits left? break out and do full range estimate
    if ( bot == 0 )
      break;

    // gcd the estimate bits (one continued fraction step)
    est = top / bot;
    temp = top % bot;
    top = bot;
    bot = temp;

    // move remainders
    temp = est * denom_estimate + denom_last;
    denom_last = denom_estimate;
    denom_estimate = temp;

    // move remainders
    temp = est * numer_estimate + numer_last;
    numer_last = numer_estimate;
    numer_estimate = temp;
  }

  // we didn't find anything good enough for float, use a full range estimate
  if ( limit_denom )
  {
    numer_estimate = (stbir_uint64)( f * (double)limit + 0.5 );
    denom_estimate = limit;
  }
  else
  {
    numer_estimate = limit;
    denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 );
  }

  *numer = (stbir_uint32) numer_estimate;
  *denom = (stbir_uint32) denom_estimate;

  err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0;
  if ( err < 0.0 ) err = -err;
  return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
}
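// Illustrative example (not part of the library): the continued fraction
// above converges very quickly, e.g. for f = 0.7 it walks the convergents
// 0/1, 1/1, 2/3, 7/10, and 7/10 reproduces 0.7 with essentially zero error,
// well under the 2^-24 (one float bit) threshold, so it returns numer=7,
// denom=10.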
static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 )
{
  double output_range, input_range, output_s, input_s, ratio, scale;

  input_s = input_s1 - input_s0;

  // null area
  if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
       ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
    return 0;

  // are either of the ranges completely out of bounds?
  if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
    return 0;

  output_range = (double)output_full_range;
  input_range = (double)input_full_range;

  output_s = ( (double)output_sub_range) / output_range;

  // figure out the scaling to use
  ratio = output_s / input_s;

  // save scale before clipping
  scale = ( output_range / input_range ) * ratio;

  scale_info->scale = (float)scale;
  scale_info->inv_scale = (float)( 1.0 / scale );

  // clip output area to left/right output edges (and adjust input area)
  stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 );

  // recalc input area
  input_s = input_s1 - input_s0;

  // after clipping do we have zero input area?
  if ( input_s <= stbir__small_float )
    return 0;

  // calculate and store the starting source offsets in output pixel space
  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range );

  scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );

  scale_info->input_full_size = input_full_range;
  scale_info->output_sub_size = output_sub_range;

  return 1;
}
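// Illustrative example (not part of the library): resizing the middle half
// of a 100 pixel input (s0=0.25, s1=0.75) into a 200 pixel subrect of a 400
// pixel output: output_s = 200/400 = 0.5, input_s = 0.5, so ratio = 1.0 and
// scale = (400/100)*1.0 = 4x. pixel_shift = 0.25 * 1.0 * 400 = 100 output
// pixels - the 25 skipped input pixels expressed at 4x in output space. The
// 4/1 scale is also exactly representable, so scale_is_rational will be set.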
static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type )
{
  resize->input_cb = 0;
  resize->output_cb = 0;
  resize->user_data = resize;
  resize->samplers = 0;
  resize->called_alloc = 0;
  resize->horizontal_filter = STBIR_FILTER_DEFAULT;
  resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
  resize->vertical_filter = STBIR_FILTER_DEFAULT;
  resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
  resize->horizontal_edge = STBIR_EDGE_CLAMP;
  resize->vertical_edge = STBIR_EDGE_CLAMP;
  resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1;
  resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h;
  resize->input_data_type = data_type;
  resize->output_data_type = data_type;
  resize->input_pixel_layout_public = pixel_layout;
  resize->output_pixel_layout_public = pixel_layout;
  resize->needs_rebuild = 1;
}
STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
                                 const void *input_pixels,  int input_w,  int input_h,  int input_stride_in_bytes,  // stride can be zero
                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type )
{
  resize->input_pixels = input_pixels;
  resize->input_w = input_w;
  resize->input_h = input_h;
  resize->input_stride_in_bytes = input_stride_in_bytes;
  resize->output_pixels = output_pixels;
  resize->output_w = output_w;
  resize->output_h = output_h;
  resize->output_stride_in_bytes = output_stride_in_bytes;
  resize->fast_alpha = 0;

  stbir__init_and_set_layout( resize, pixel_layout, data_type );
}
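// Illustrative usage sketch (not part of the library): the extended API is
// resize_init + optional setters + one call (buffer names and sizes are
// hypothetical):
//
//    STBIR_RESIZE resize;
//    stbir_resize_init( &resize, in_pixels, 640, 480, 0,
//                                out_pixels, 320, 240, 0,
//                                STBIR_RGBA, STBIR_TYPE_UINT8 );
//    stbir_set_edgemodes( &resize, STBIR_EDGE_WRAP, STBIR_EDGE_WRAP );
//    if ( !stbir_resize_extended( &resize ) )
//      { /* error */ }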
// You can update parameters any time after resize_init
STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type )  // by default, datatype from resize_init
{
  resize->input_data_type = input_type;
  resize->output_data_type = output_type;
  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
    stbir__update_info_from_resize( resize->samplers, resize );
}

STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )  // no callbacks by default
{
  resize->input_cb = input_cb;
  resize->output_cb = output_cb;

  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
  {
    resize->samplers->in_pixels_cb = input_cb;
    resize->samplers->out_pixels_cb = output_cb;
  }
}

STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )  // pass back STBIR_RESIZE* by default
{
  resize->user_data = user_data;
  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
    resize->samplers->user_data = user_data;
}

STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
{
  resize->input_pixels = input_pixels;
  resize->input_stride_in_bytes = input_stride_in_bytes;
  resize->output_pixels = output_pixels;
  resize->output_stride_in_bytes = output_stride_in_bytes;
  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
    stbir__update_info_from_resize( resize->samplers, resize );
}

STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge )  // CLAMP by default
{
  resize->horizontal_edge = horizontal_edge;
  resize->vertical_edge = vertical_edge;
  resize->needs_rebuild = 1;
  return 1;
}

STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter )  // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
{
  resize->horizontal_filter = horizontal_filter;
  resize->vertical_filter = vertical_filter;
  resize->needs_rebuild = 1;
  return 1;
}

STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support )
{
  resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
  resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
  resize->needs_rebuild = 1;
  return 1;
}

STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout )  // sets new pixel layouts
{
  resize->input_pixel_layout_public = input_pixel_layout;
  resize->output_pixel_layout_public = output_pixel_layout;
  resize->needs_rebuild = 1;
  return 1;
}

STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality )  // sets alpha speed
{
  resize->fast_alpha = non_pma_alpha_speed_over_quality;
  resize->needs_rebuild = 1;
  return 1;
}
STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 )  // sets input region (full region by default)
{
  resize->input_s0 = s0;
  resize->input_t0 = t0;
  resize->input_s1 = s1;
  resize->input_t1 = t1;

  resize->needs_rebuild = 1;

  // are we inbounds?
  if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
       ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
       ( s0 > (1.0f-stbir__small_float) ) ||
       ( t0 > (1.0f-stbir__small_float) ) )
    return 0;

  return 1;
}
STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )  // sets output region (full region by default)
{
  resize->output_subx = subx;
  resize->output_suby = suby;
  resize->output_subw = subw;
  resize->output_subh = subh;

  resize->needs_rebuild = 1;

  // are we inbounds?
  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
    return 0;

  return 1;
}
STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )  // sets both regions (full regions by default)
{
  double s0, t0, s1, t1;
  s0 = ( (double)subx ) / ( (double)resize->output_w );
  t0 = ( (double)suby ) / ( (double)resize->output_h );
  s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
  t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );

  resize->input_s0 = s0;
  resize->input_t0 = t0;
  resize->input_s1 = s1;
  resize->input_t1 = t1;
  resize->output_subx = subx;
  resize->output_suby = suby;
  resize->output_subw = subw;
  resize->output_subh = subh;

  resize->needs_rebuild = 1;

  // are we inbounds?
  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
    return 0;

  return 1;
}
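// Illustrative example (not part of the library): stbir_set_pixel_subrect
// expresses an output pixel rect as input s/t coords so the output tile
// lines up with the matching input region. On a 100x200 output, subx=10,
// suby=20, subw=30, subh=40 yields s:[0.10,0.40], t:[0.10,0.30] - handy for
// computing one tile at a time of a larger resize.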
static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
{
  stbir__contributors conservative = { 0, 0 };
  stbir__sampler horizontal, vertical;
  int new_output_subx, new_output_suby;
  stbir__info * out_info;
  #ifdef STBIR_PROFILE
  stbir__info profile_infod;  // used to contain building profile info before everything is allocated
  stbir__info * profile_info = &profile_infod;
  #endif

  // have we already built the samplers?
  if ( resize->samplers )
    return 0;

  #define STBIR_RETURN_ERROR_AND_ASSERT( exp ) STBIR_ASSERT( !(exp) ); if (exp) return 0;
  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
  #undef STBIR_RETURN_ERROR_AND_ASSERT

  if ( splits <= 0 )
    return 0;

  STBIR_PROFILE_BUILD_FIRST_START( build );

  new_output_subx = resize->output_subx;
  new_output_suby = resize->output_suby;

  // do horizontal clip and scale calcs
  if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
    return 0;

  // do vertical clip and scale calcs
  if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
    return 0;

  // if nothing to do, just return
  if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
    return 0;

  stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
  stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
  stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );

  if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice)
  {
    splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS;
    if ( splits == 0 ) splits = 1;
  }

  STBIR_PROFILE_BUILD_START( alloc );
  out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
  STBIR_PROFILE_BUILD_END( alloc );
  STBIR_PROFILE_BUILD_END( build );

  if ( out_info )
  {
    resize->splits = splits;
    resize->samplers = out_info;
    resize->needs_rebuild = 0;
    #ifdef STBIR_PROFILE
    STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
    #endif

    // update anything that can be changed without recalcing samplers
    stbir__update_info_from_resize( out_info, resize );
    return splits;
  }

  return 0;
}
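// Illustrative example (not part of the library): the split clamp above
// keeps each split at least STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS (4)
// scanlines tall, e.g. asking for 8 splits on a 10 scanline output gives
// 10/8 = 1 scanline per split, so splits is reduced to 10/4 = 2.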
STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize )
{
  if ( resize->samplers )
  {
    stbir__free_internal_mem( resize->samplers );
    resize->samplers = 0;
    resize->called_alloc = 0;
  }
}

STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits )
{
  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
  {
    if ( resize->samplers )
      stbir_free_samplers( resize );
    resize->called_alloc = 1;
    return stbir__perform_build( resize, splits );
  }

  STBIR_PROFILE_BUILD_CLEAR( resize->samplers );

  return 1;
}

STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
{
  return stbir_build_samplers_with_splits( resize, 1 );
}
STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
{
  int result;

  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
  {
    int alloc_state = resize->called_alloc;  // remember allocated state

    if ( resize->samplers )
    {
      stbir__free_internal_mem( resize->samplers );
      resize->samplers = 0;
    }

    if ( !stbir_build_samplers( resize ) )
      return 0;

    resize->called_alloc = alloc_state;

    // if build_samplers succeeded (above), but there are no samplers set, then
    //   the area to stretch into was zero pixels, so don't do anything and return
    //   success
    if ( resize->samplers == 0 )
      return 1;
  }
  else
  {
    // didn't need to build anything - just clear the build profile
    STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
  }

  // do resize
  result = stbir__perform_resize( resize->samplers, 0, resize->splits );

  // if we alloced, then free
  if ( !resize->called_alloc )
  {
    stbir_free_samplers( resize );
    resize->samplers = 0;
  }

  return result;
}

STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count )
{
  STBIR_ASSERT( resize->samplers );

  // if we're just doing the whole thing, call full
  if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
    return stbir_resize_extended( resize );

  // you **must** build samplers first when using split resize
  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
    return 0;

  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
    return 0;

  // do resize
  return stbir__perform_resize( resize->samplers, split_start, split_count );
}
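// Illustrative usage sketch (not part of the library): splits exist so a
// resize can run on multiple threads - build once with N splits, then give
// each split to its own thread (thread launch/join is pseudocode here, and
// the pixel buffers are hypothetical):
//
//    STBIR_RESIZE resize;
//    stbir_resize_init( &resize, in_pixels, in_w, in_h, 0,
//                                out_pixels, out_w, out_h, 0,
//                                STBIR_RGBA, STBIR_TYPE_UINT8 );
//    int splits = stbir_build_samplers_with_splits( &resize, 4 ); // may return fewer than 4
//    // on each worker thread i (0 <= i < splits):
//    //    stbir_resize_extended_split( &resize, i, 1 );
//    // after all threads join:
//    stbir_free_samplers( &resize );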
static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                         stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter )
{
  STBIR_RESIZE resize;
  int scanline_output_in_bytes;
  int positive_output_stride_in_bytes;
  void * start_ptr;
  void * free_ptr;

  scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ];
  if ( scanline_output_in_bytes == 0 )
    return 0;

  // if zero stride, use scanline output
  if ( output_stride_in_bytes == 0 )
    output_stride_in_bytes = scanline_output_in_bytes;

  // abs value for inverted images (negative pitches)
  positive_output_stride_in_bytes = output_stride_in_bytes;
  if ( positive_output_stride_in_bytes < 0 )
    positive_output_stride_in_bytes = -positive_output_stride_in_bytes;

  // is the requested stride smaller than the scanline output? if so, just fail
  if ( positive_output_stride_in_bytes < scanline_output_in_bytes )
    return 0;

  start_ptr = output_pixels;
  free_ptr = 0;  // no free pointer, since they passed a buffer to use

  // did they pass a zero for the dest? if so, allocate the buffer
  if ( output_pixels == 0 )
  {
    size_t size;
    char * ptr;

    size = (size_t)positive_output_stride_in_bytes * (size_t)output_h;
    if ( size == 0 )
      return 0;

    ptr = (char*) STBIR_MALLOC( size, 0 );
    if ( ptr == 0 )
      return 0;

    free_ptr = ptr;

    // point at the last scanline, if they requested a flipped image
    if ( output_stride_in_bytes < 0 )
      start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) );
    else
      start_ptr = ptr;
  }

  // ok, now do the resize
  stbir_resize_init( &resize,
                     input_pixels, input_w, input_h, input_stride_in_bytes,
                     start_ptr, output_w, output_h, output_stride_in_bytes,
                     pixel_layout, data_type );

  resize.horizontal_edge = edge;
  resize.vertical_edge = edge;
  resize.horizontal_filter = filter;
  resize.vertical_filter = filter;

  if ( !stbir_resize_extended( &resize ) )
  {
    if ( free_ptr )
      STBIR_FREE( free_ptr, 0 );
    return 0;
  }

  return (free_ptr) ? free_ptr : start_ptr;
}
STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                                    stbir_pixel_layout pixel_layout )
{
  return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes,
                                                      output_pixels, output_w, output_h, output_stride_in_bytes,
                                                      pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
}

STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                                  unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                                  stbir_pixel_layout pixel_layout )
{
  return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes,
                                                      output_pixels, output_w, output_h, output_stride_in_bytes,
                                                      pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
}

STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                            float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                                            stbir_pixel_layout pixel_layout )
{
  return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes,
                                              output_pixels, output_w, output_h, output_stride_in_bytes,
                                              pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT );
}

STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                              void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
                              stbir_pixel_layout pixel_layout, stbir_datatype data_type,
                              stbir_edge edge, stbir_filter filter )
{
  return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes,
                                             output_pixels, output_w, output_h, output_stride_in_bytes,
                                             pixel_layout, data_type, edge, filter );
}
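// Illustrative usage of the simple API above (a sketch, not part of the
// library; the input/output variable names are hypothetical). It shows the
// two behaviors handled by stbir_quick_resize_helper: passing NULL for the
// output (the library allocates the buffer with STBIR_MALLOC -- default
// malloc -- and returns it), and using a negative stride with a
// caller-provided buffer to get a vertically flipped result.
//
//   unsigned char * out;
//
//   // library-allocated output, packed stride (0 means "tightly packed"):
//   out = stbir_resize_uint8_srgb( in_pixels, in_w, in_h, 0,
//                                  NULL, out_w, out_h, 0,
//                                  STBIR_RGBA );
//   // ... use out, then release it with whatever STBIR_MALLOC pairs with ...
//
//   // caller-provided buffer, flipped vertically: point at the *last*
//   // scanline and pass a negative pitch, as the helper does internally.
//   int pitch = out_w * 4;
//   unsigned char * buf = (unsigned char *) malloc( (size_t)pitch * out_h );
//   stbir_resize_uint8_srgb( in_pixels, in_w, in_h, 0,
//                            buf + (size_t)pitch * ( out_h - 1 ), out_w, out_h, -pitch,
//                            STBIR_RGBA );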
#ifdef STBIR_PROFILE

STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
{
  static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient pivot" } ;
  stbir__info* samp = resize->samplers;
  int i;

  typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1];
  typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];

  for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
    info->clocks[i] = samp->profile.array[i+1];

  info->total_clocks = samp->profile.named.total;
  info->descriptions = bdescriptions;
  info->count = STBIR__ARRAY_SIZE( bdescriptions );
}
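// Note on the "typedef int testa[ cond ? 1 : -1 ]" lines above: this is the
// classic C89 compile-time assert. If the condition is false, the typedef
// declares an array of negative size, which is a hard compile error, so the
// table/struct sizes are verified at build time at zero runtime cost. A
// minimal standalone sketch of the same idiom (illustrative names):
//
//   #define MY_STATIC_ASSERT(cond) typedef int my_assert_[ (cond) ? 1 : -1 ]
//   MY_STATIC_ASSERT( sizeof(int) >= 4 );    // compiles
//   // MY_STATIC_ASSERT( sizeof(int) == 1 ); // error: negative array size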
STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
{
  static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
  stbir__per_split_info * split_info;
  int s, i;

  typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
  typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];

  if ( split_start == -1 )
  {
    split_start = 0;
    split_count = resize->samplers->splits;
  }

  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
  {
    info->total_clocks = 0;
    info->descriptions = 0;
    info->count = 0;
    return;
  }

  split_info = resize->samplers->split_info + split_start;

  // sum up the profile from all the splits
  for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
  {
    stbir_uint64 sum = 0;
    for( s = 0 ; s < split_count ; s++ )
      sum += split_info[s].profile.array[i+1];
    info->clocks[i] = sum;
  }

  info->total_clocks = split_info->profile.named.total;
  info->descriptions = descriptions;
  info->count = STBIR__ARRAY_SIZE( descriptions );
}

STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
{
  stbir_resize_split_profile_info( info, resize, -1, 0 );
}

#endif // STBIR_PROFILE
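// Illustrative (not part of the library): reading the profile data gathered
// above when STBIR_PROFILE is defined. The fields used (count, descriptions,
// clocks, total_clocks) are exactly the ones filled in by the functions above.
//
//   STBIR_PROFILE_INFO info;
//   int i;
//   stbir_resize_build_profile_info( &info, &resize ); // after stbir_resize_extended
//   for ( i = 0 ; i < info.count ; i++ )
//     printf( "%-20s %llu clocks\n", info.descriptions[i], (unsigned long long) info.clocks[i] );
//   printf( "total: %llu clocks\n", (unsigned long long) info.total_clocks );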
#undef STBIR_BGR
#undef STBIR_1CHANNEL
#undef STBIR_2CHANNEL
#undef STBIR_RGB
#undef STBIR_RGBA
#undef STBIR_4CHANNEL
#undef STBIR_BGRA
#undef STBIR_ARGB
#undef STBIR_ABGR
#undef STBIR_RA
#undef STBIR_AR
#undef STBIR_RGBA_PM
#undef STBIR_BGRA_PM
#undef STBIR_ARGB_PM
#undef STBIR_ABGR_PM
#undef STBIR_RA_PM
#undef STBIR_AR_PM

#endif // STB_IMAGE_RESIZE_IMPLEMENTATION

#else // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS

// we re-include the header file to define all the horizontal functions.
// specializing each function for the number of coeffs is 20-40% faster *OVERALL*.
// by including the header file again this way, we can still debug the functions.
#define STBIR_strs_join2( start, mid, end ) start##mid##end
#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )

#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
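// The two-level join above is the standard preprocessor indirection trick:
// the outer macro exists only to force its arguments to be macro-expanded
// before the inner ## paste happens. A minimal sketch of why the extra level
// matters (names here are illustrative, not from the library):
//
//   #define SUFFIX _sse2
//   #define JOIN2(a,b)  a##b
//   #define JOIN(a,b)   JOIN2(a,b)
//   JOIN2(decode, SUFFIX)  // pastes literally: "decodeSUFFIX"
//   JOIN(decode, SUFFIX)   // expands SUFFIX first: "decode_sse2"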
#ifdef STB_IMAGE_RESIZE_DO_CODERS

#ifdef stbir__decode_suffix
#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
#else
#define STBIR__CODER_NAME( name ) name
#endif

#ifdef stbir__decode_swizzle
#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
#else
#define stbir__decode_order0 0
#define stbir__decode_order1 1
#define stbir__decode_order2 2
#define stbir__decode_order3 3
#define stbir__encode_order0 0
#define stbir__encode_order1 1
#define stbir__encode_order2 2
#define stbir__encode_order3 3
#define stbir__decode_simdf8_flip(reg)
#define stbir__decode_simdf4_flip(reg)
#define stbir__encode_simdf8_unflip(reg)
#define stbir__encode_simdf4_unflip(reg)
#endif

#ifdef STBIR_SIMD8
#define stbir__encode_simdfX_unflip stbir__encode_simdf8_unflip
#else
#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip
#endif
static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned char const * input = (unsigned char const*)inputp;

  #ifdef STBIR_SIMD
  unsigned char const * end_input_m16 = input + width_times_channels - 16;
  if ( width_times_channels >= 16 )
  {
    decode_end -= 16;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      #ifdef STBIR_SIMD8
      stbir__simdi i; stbir__simdi8 o0,o1;
      stbir__simdf8 of0, of1;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
      stbir__simdi8_convert_i32_to_float( of0, o0 );
      stbir__simdi8_convert_i32_to_float( of1, o1 );
      stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
      stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
      stbir__decode_simdf8_flip( of0 );
      stbir__decode_simdf8_flip( of1 );
      stbir__simdf8_store( decode + 0, of0 );
      stbir__simdf8_store( decode + 8, of1 );
      #else
      stbir__simdi i, o0, o1, o2, o3;
      stbir__simdf of0, of1, of2, of3;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
      stbir__simdi_convert_i32_to_float( of0, o0 );
      stbir__simdi_convert_i32_to_float( of1, o1 );
      stbir__simdi_convert_i32_to_float( of2, o2 );
      stbir__simdi_convert_i32_to_float( of3, o3 );
      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
      stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
      stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
      stbir__decode_simdf4_flip( of0 );
      stbir__decode_simdf4_flip( of1 );
      stbir__decode_simdf4_flip( of2 );
      stbir__decode_simdf4_flip( of3 );
      stbir__simdf_store( decode + 0, of0 );
      stbir__simdf_store( decode + 4, of1 );
      stbir__simdf_store( decode + 8, of2 );
      stbir__simdf_store( decode + 12, of3 );
      #endif
      decode += 16;
      input += 16;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 16 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m16;
    }
    return decode_end + 16;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
    #if stbir__coder_min_num >= 2
    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
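// Note on the loop shape above, which every coder below reuses: the main SIMD
// loop runs in whole blocks (16 here) with no scalar tail. When fewer than a
// full block remains, it "backs up" both pointers so the final iteration
// processes the *last* block of the buffer, re-converting a few values that
// were already done. That is safe because input and output do not alias and
// converting a value twice produces the same result. A scalar sketch of the
// same pattern (illustrative, assuming n >= BLOCK):
//
//   i = 0;
//   for(;;)
//   {
//     process_block( dst + i, src + i );  // handles BLOCK items
//     i += BLOCK;
//     if ( i + BLOCK <= n ) continue;     // another full block fits
//     if ( i == n ) break;                // landed exactly on the end
//     i = n - BLOCK;                      // back up and redo the overlap
//   }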
static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
{
  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= stbir__simdfX_float_count*2 )
  {
    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
    end_output -= stbir__simdfX_float_count*2;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdfX e0, e1;
      stbir__simdi i;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
      stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
      stbir__encode_simdfX_unflip( e0 );
      stbir__encode_simdfX_unflip( e1 );
      #ifdef STBIR_SIMD8
      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
      stbir__simdi_store( output, i );
      #else
      stbir__simdf_pack_to_8bytes( i, e0, e1 );
      stbir__simdi_store2( output, i );
      #endif
      encode += stbir__simdfX_float_count*2;
      output += stbir__simdfX_float_count*2;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m8;
    }
    return;
  }

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    stbir__simdf e0;
    stbir__simdi i0;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_load( e0, encode );
    stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
    stbir__encode_simdf4_unflip( e0 );
    stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4
    *(int*)(output-4) = stbir__simdi_to_int( i0 );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    stbir__simdf e0;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
    #if stbir__coder_min_num >= 2
    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
    #endif
    #if stbir__coder_min_num >= 3
    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif

  #else

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  while( output <= end_output )
  {
    float f;
    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
    f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    float f;
    STBIR_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
    #if stbir__coder_min_num >= 2
    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
    #endif
    #if stbir__coder_min_num >= 3
    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif

  #endif
}
static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned char const * input = (unsigned char const*)inputp;

  #ifdef STBIR_SIMD
  unsigned char const * end_input_m16 = input + width_times_channels - 16;
  if ( width_times_channels >= 16 )
  {
    decode_end -= 16;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      #ifdef STBIR_SIMD8
      stbir__simdi i; stbir__simdi8 o0,o1;
      stbir__simdf8 of0, of1;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
      stbir__simdi8_convert_i32_to_float( of0, o0 );
      stbir__simdi8_convert_i32_to_float( of1, o1 );
      stbir__decode_simdf8_flip( of0 );
      stbir__decode_simdf8_flip( of1 );
      stbir__simdf8_store( decode + 0, of0 );
      stbir__simdf8_store( decode + 8, of1 );
      #else
      stbir__simdi i, o0, o1, o2, o3;
      stbir__simdf of0, of1, of2, of3;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
      stbir__simdi_convert_i32_to_float( of0, o0 );
      stbir__simdi_convert_i32_to_float( of1, o1 );
      stbir__simdi_convert_i32_to_float( of2, o2 );
      stbir__simdi_convert_i32_to_float( of3, o3 );
      stbir__decode_simdf4_flip( of0 );
      stbir__decode_simdf4_flip( of1 );
      stbir__decode_simdf4_flip( of2 );
      stbir__decode_simdf4_flip( of3 );
      stbir__simdf_store( decode + 0, of0 );
      stbir__simdf_store( decode + 4, of1 );
      stbir__simdf_store( decode + 8, of2 );
      stbir__simdf_store( decode + 12, of3 );
      #endif
      decode += 16;
      input += 16;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 16 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m16;
    }
    return decode_end + 16;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = ((float)(input[stbir__decode_order0]));
    decode[1-4] = ((float)(input[stbir__decode_order1]));
    decode[2-4] = ((float)(input[stbir__decode_order2]));
    decode[3-4] = ((float)(input[stbir__decode_order3]));
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = ((float)(input[stbir__decode_order0]));
    #if stbir__coder_min_num >= 2
    decode[1] = ((float)(input[stbir__decode_order1]));
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = ((float)(input[stbir__decode_order2]));
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
{
  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= stbir__simdfX_float_count*2 )
  {
    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
    end_output -= stbir__simdfX_float_count*2;
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdfX e0, e1;
      stbir__simdi i;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
      stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
      stbir__encode_simdfX_unflip( e0 );
      stbir__encode_simdfX_unflip( e1 );
      #ifdef STBIR_SIMD8
      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
      stbir__simdi_store( output, i );
      #else
      stbir__simdf_pack_to_8bytes( i, e0, e1 );
      stbir__simdi_store2( output, i );
      #endif
      encode += stbir__simdfX_float_count*2;
      output += stbir__simdfX_float_count*2;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m8;
    }
    return;
  }

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    stbir__simdf e0;
    stbir__simdi i0;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_load( e0, encode );
    stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
    stbir__encode_simdf4_unflip( e0 );
    stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4
    *(int*)(output-4) = stbir__simdi_to_int( i0 );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #else

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  while( output <= end_output )
  {
    float f;
    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    float f;
    STBIR_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
    #if stbir__coder_min_num >= 2
    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
    #endif
    #if stbir__coder_min_num >= 3
    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif
}
static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned char const * input = (unsigned char const *)inputp;

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  while( decode <= decode_end )
  {
    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
    decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
    decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ];
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
    #if stbir__coder_min_num >= 2
    decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
#define stbir__min_max_shift20( i, f ) \
  stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
  stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one )) ); \
  stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 );

#define stbir__scale_and_convert( i, f ) \
  stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
  stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
  stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
  stbir__simdf_convert_float_to_i32( i, f );

#define stbir__linear_to_srgb_finish( i, f ) \
{ \
  stbir__simdi temp; \
  stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
  stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
  stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
  stbir__simdi_16madd( i, i, temp ); \
  stbir__simdi_32shr( i, i, 16 ); \
}
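// The macros above are the vector form of the float->sRGB8 bit trick (cf. the
// scalar stbir__linear_to_srgb_uchar earlier in this file): clamp the float
// into (0,1), then use its raw IEEE-754 bits as a piecewise-linear table
// index. The >>20 keeps the exponent plus the top mantissa bits to pick a
// segment of fp32_to_srgb8_tab4 (the "- (127-13)*8" at the call sites removes
// the exponent bias), and the >>12 in the "finish" step pulls the next
// mantissa bits to interpolate inside that segment with one 16-bit madd.
// Scalar sketch of the same idea (illustrative, assuming f is pre-clamped):
//
//   union { float f; unsigned int u; } x;
//   unsigned int tab, bias, scale, t;
//   x.f   = f;                                            // clamped input
//   tab   = fp32_to_srgb8_tab4[ ( x.u >> 20 ) - (127-13)*8 ];
//   bias  = ( tab >> 16 ) << 9;                           // segment base
//   scale = tab & 0xffff;                                 // segment slope
//   t     = ( x.u >> 12 ) & 0xff;                         // interp fraction
//   result = (unsigned char)( ( bias + scale * t ) >> 16 );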
#define stbir__simdi_table_lookup2( v0,v1, table ) \
{ \
  stbir__simdi_u32 temp0,temp1; \
  temp0.m128i_i128 = v0; \
  temp1.m128i_i128 = v1; \
  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
  v0 = temp0.m128i_i128; \
  v1 = temp1.m128i_i128; \
}

#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
{ \
  stbir__simdi_u32 temp0,temp1,temp2; \
  temp0.m128i_i128 = v0; \
  temp1.m128i_i128 = v1; \
  temp2.m128i_i128 = v2; \
  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
  v0 = temp0.m128i_i128; \
  v1 = temp1.m128i_i128; \
  v2 = temp2.m128i_i128; \
}

#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
{ \
  stbir__simdi_u32 temp0,temp1,temp2,temp3; \
  temp0.m128i_i128 = v0; \
  temp1.m128i_i128 = v1; \
  temp2.m128i_i128 = v2; \
  temp3.m128i_i128 = v3; \
  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
  temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
  v0 = temp0.m128i_i128; \
  v1 = temp1.m128i_i128; \
  v2 = temp2.m128i_i128; \
  v3 = temp3.m128i_i128; \
}
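// The lookup macros above emulate a 4-wide integer gather: baseline SSE has
// no gather instruction, so each vector is spilled through a union
// ( stbir__simdi_u32 ), the four lanes are looked up in scalar code, and the
// results are reloaded as a vector. Illustrative shape of one vector's worth
// of work, written with plain memory moves:
//
//   unsigned int lane[4];
//   memcpy( lane, &v0, 16 );       // spill vector to scalars
//   lane[0] = table[ lane[0] ];    // four scalar table lookups
//   lane[1] = table[ lane[1] ];
//   lane[2] = table[ lane[2] ];
//   lane[3] = table[ lane[3] ];
//   memcpy( &v0, lane, 16 );       // reload as a vector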
static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
{
  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 16 )
  {
    float const * end_encode_m16 = encode + width_times_channels - 16;
    end_output -= 16;
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdf f0, f1, f2, f3;
      stbir__simdi i0, i1, i2, i3;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
      stbir__min_max_shift20( i0, f0 );
      stbir__min_max_shift20( i1, f1 );
      stbir__min_max_shift20( i2, f2 );
      stbir__min_max_shift20( i3, f3 );
      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
      stbir__linear_to_srgb_finish( i0, f0 );
      stbir__linear_to_srgb_finish( i1, f1 );
      stbir__linear_to_srgb_finish( i2, f2 );
      stbir__linear_to_srgb_finish( i3, f3 );
      stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
      encode += 16;
      output += 16;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + 16 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m16;
    }
    return;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while ( output <= end_output )
  {
    STBIR_SIMD_NO_UNROLL(encode);
    output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
    output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
    output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
    output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    STBIR_NO_UNROLL(encode);
    output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
    #if stbir__coder_min_num >= 2
    output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
    #endif
    #if stbir__coder_min_num >= 3
    output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif
}
#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )

static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned char const * input = (unsigned char const *)inputp;
  do {
    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
    decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
    decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ];
    decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
    input += 4;
    decode += 4;
  } while( decode < decode_end );
  return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
{
  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 16 )
  {
    float const * end_encode_m16 = encode + width_times_channels - 16;
    end_output -= 16;
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdf f0, f1, f2, f3;
      stbir__simdi i0, i1, i2, i3;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
      stbir__min_max_shift20( i0, f0 );
      stbir__min_max_shift20( i1, f1 );
      stbir__min_max_shift20( i2, f2 );
      stbir__scale_and_convert( i3, f3 );
      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
      stbir__linear_to_srgb_finish( i0, f0 );
      stbir__linear_to_srgb_finish( i1, f1 );
      stbir__linear_to_srgb_finish( i2, f2 );
      stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
      output += 16;
      encode += 16;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + 16 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m16;
    }
    return;
  }
  #endif

  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float f;
    STBIR_SIMD_NO_UNROLL(encode);
    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
    output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
    output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
    f = encode[3] * stbir__max_uint8_as_float + 0.5f;
    STBIR_CLAMP(f, 0, 255);
    output[stbir__decode_order3] = (unsigned char) f;
    output += 4;
    encode += 4;
  } while( output < end_output );
}

#endif
#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )

static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned char const * input = (unsigned char const *)inputp;
  decode += 4;
  while( decode <= decode_end )
  {
    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
    decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
    decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
    input += 4;
    decode += 4;
  }
  decode -= 4;
  if( decode < decode_end )
  {
    // remnant: one final RA pair
    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
    decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
  }
  return decode_end;
}
static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
{
  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 16 )
  {
    float const * end_encode_m16 = encode + width_times_channels - 16;
    end_output -= 16;
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdf f0, f1, f2, f3;
      stbir__simdi i0, i1, i2, i3;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
      stbir__min_max_shift20( i0, f0 );
      stbir__scale_and_convert( i1, f1 );
      stbir__min_max_shift20( i2, f2 );
      stbir__scale_and_convert( i3, f3 );
      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
      stbir__linear_to_srgb_finish( i0, f0 );
      stbir__linear_to_srgb_finish( i2, f2 );
      stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
      output += 16;
      encode += 16;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + 16 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m16;
    }
    return;
  }
  #endif

  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float f;
    STBIR_SIMD_NO_UNROLL(encode);
    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
    f = encode[1] * stbir__max_uint8_as_float + 0.5f;
    STBIR_CLAMP(f, 0, 255);
    output[stbir__decode_order1] = (unsigned char) f;
    output += 2;
    encode += 2;
  } while( output < end_output );
}

#endif
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned short const * input = (unsigned short const *)inputp;

  #ifdef STBIR_SIMD
  unsigned short const * end_input_m8 = input + width_times_channels - 8;
  if ( width_times_channels >= 8 )
  {
    decode_end -= 8;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      #ifdef STBIR_SIMD8
      stbir__simdi i; stbir__simdi8 o;
      stbir__simdf8 of;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi8_expand_u16_to_u32( o, i );
      stbir__simdi8_convert_i32_to_float( of, o );
      stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
      stbir__decode_simdf8_flip( of );
      stbir__simdf8_store( decode + 0, of );
      #else
      stbir__simdi i, o0, o1;
      stbir__simdf of0, of1;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi_expand_u16_to_u32( o0,o1,i );
      stbir__simdi_convert_i32_to_float( of0, o0 );
      stbir__simdi_convert_i32_to_float( of1, o1 );
      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
      stbir__decode_simdf4_flip( of0 );
      stbir__decode_simdf4_flip( of1 );
      stbir__simdf_store( decode + 0, of0 );
      stbir__simdf_store( decode + 4, of1 );
      #endif
      decode += 8;
      input += 8;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 8 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m8;
    }
    return decode_end + 8;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
    #if stbir__coder_min_num >= 2
    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
{
  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  {
    if ( width_times_channels >= stbir__simdfX_float_count*2 )
    {
      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
      end_output -= stbir__simdfX_float_count*2;
      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
      for(;;)
      {
        stbir__simdfX e0, e1;
        stbir__simdiX i;
        STBIR_SIMD_NO_UNROLL(encode);
        stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
        stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
        stbir__encode_simdfX_unflip( e0 );
        stbir__encode_simdfX_unflip( e1 );
        stbir__simdfX_pack_to_words( i, e0, e1 );
        stbir__simdiX_store( output, i );
        encode += stbir__simdfX_float_count*2;
        output += stbir__simdfX_float_count*2;
        if ( output <= end_output )
          continue;
        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
          break;
        output = end_output; // backup and do last couple
        encode = end_encode_m8;
      }
      return;
    }
  }

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    stbir__simdf e;
    stbir__simdi i;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_load( e, encode );
    stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
    stbir__encode_simdf4_unflip( e );
    stbir__simdf_pack_to_8words( i, e, e ); // only use first 4
    stbir__simdi_store2( output-4, i );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    stbir__simdf e;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
    #if stbir__coder_min_num >= 2
    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
    #endif
    #if stbir__coder_min_num >= 3
    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif

  #else

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    float f;
    STBIR_SIMD_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
    f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    float f;
    STBIR_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
    #if stbir__coder_min_num >= 2
    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
    #endif
    #if stbir__coder_min_num >= 3
    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif

  #endif
}
static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  unsigned short const * input = (unsigned short const *)inputp;

  #ifdef STBIR_SIMD
  unsigned short const * end_input_m8 = input + width_times_channels - 8;
  if ( width_times_channels >= 8 )
  {
    decode_end -= 8;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      #ifdef STBIR_SIMD8
      stbir__simdi i; stbir__simdi8 o;
      stbir__simdf8 of;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi8_expand_u16_to_u32( o, i );
      stbir__simdi8_convert_i32_to_float( of, o );
      stbir__decode_simdf8_flip( of );
      stbir__simdf8_store( decode + 0, of );
      #else
      stbir__simdi i, o0, o1;
      stbir__simdf of0, of1;
      STBIR_NO_UNROLL(decode);
      stbir__simdi_load( i, input );
      stbir__simdi_expand_u16_to_u32( o0, o1, i );
      stbir__simdi_convert_i32_to_float( of0, o0 );
      stbir__simdi_convert_i32_to_float( of1, o1 );
      stbir__decode_simdf4_flip( of0 );
      stbir__decode_simdf4_flip( of1 );
      stbir__simdf_store( decode + 0, of0 );
      stbir__simdf_store( decode + 4, of1 );
      #endif
      decode += 8;
      input += 8;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 8 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m8;
    }
    return decode_end + 8;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = ((float)(input[stbir__decode_order0]));
    decode[1-4] = ((float)(input[stbir__decode_order1]));
    decode[2-4] = ((float)(input[stbir__decode_order2]));
    decode[3-4] = ((float)(input[stbir__decode_order3]));
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = ((float)(input[stbir__decode_order0]));
    #if stbir__coder_min_num >= 2
    decode[1] = ((float)(input[stbir__decode_order1]));
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = ((float)(input[stbir__decode_order2]));
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
{
  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  {
    if ( width_times_channels >= stbir__simdfX_float_count*2 )
    {
      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
      end_output -= stbir__simdfX_float_count*2;
      STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
      for(;;)
      {
        stbir__simdfX e0, e1;
        stbir__simdiX i;
        STBIR_SIMD_NO_UNROLL(encode);
        stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
        stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
        stbir__encode_simdfX_unflip( e0 );
        stbir__encode_simdfX_unflip( e1 );
        stbir__simdfX_pack_to_words( i, e0, e1 );
        stbir__simdiX_store( output, i );
        encode += stbir__simdfX_float_count*2;
        output += stbir__simdfX_float_count*2;
        if ( output <= end_output )
          continue;
        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
          break;
        output = end_output; // backup and do last couple
        encode = end_encode_m8;
      }
      return;
    }
  }

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    stbir__simdf e;
    stbir__simdi i;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_load( e, encode );
    stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
    stbir__encode_simdf4_unflip( e );
    stbir__simdf_pack_to_8words( i, e, e ); // only use first 4
    stbir__simdi_store2( output-4, i );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #else

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    float f;
    STBIR_SIMD_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    float f;
    STBIR_NO_UNROLL(encode);
    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
    #if stbir__coder_min_num >= 2
    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
    #endif
    #if stbir__coder_min_num >= 3
    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif
}
static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  stbir__FP16 const * input = (stbir__FP16 const *)inputp;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 8 )
  {
    stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
    decode_end -= 8;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      STBIR_NO_UNROLL(decode);
      stbir__half_to_float_SIMD( decode, input );
      #ifdef stbir__decode_swizzle
      #ifdef STBIR_SIMD8
      {
        stbir__simdf8 of;
        stbir__simdf8_load( of, decode );
        stbir__decode_simdf8_flip( of );
        stbir__simdf8_store( decode, of );
      }
      #else
      {
        stbir__simdf of0,of1;
        stbir__simdf_load( of0, decode );
        stbir__simdf_load( of1, decode+4 );
        stbir__decode_simdf4_flip( of0 );
        stbir__decode_simdf4_flip( of1 );
        stbir__simdf_store( decode, of0 );
        stbir__simdf_store( decode+4, of1 );
      }
      #endif
      #endif
      decode += 8;
      input += 8;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 8 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m8;
    }
    return decode_end + 8;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
    decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
    decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
    decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
    #if stbir__coder_min_num >= 2
    decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;
}
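
// Note the symmetry with the decoder above: decoding converts to float first
// and then applies the channel flip (stbir__decode_simdf*_flip), while
// encoding un-flips in float (stbir__encode_simdf*_unflip) and only then
// converts to half. In the scalar paths the same reordering is done through
// the stbir__encode_order0..3 index macros, so swizzled layouts (e.g. BGRA)
// cost no extra passes.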
static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
{
  stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;
  stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 8 )
  {
    float const * end_encode_m8 = encode + width_times_channels - 8;
    end_output -= 8;
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      STBIR_SIMD_NO_UNROLL(encode);
      #ifdef stbir__decode_swizzle
      #ifdef STBIR_SIMD8
      {
        stbir__simdf8 of;
        stbir__simdf8_load( of, encode );
        stbir__encode_simdf8_unflip( of );
        stbir__float_to_half_SIMD( output, (float*)&of );
      }
      #else
      {
        stbir__simdf of[2];
        stbir__simdf_load( of[0], encode );
        stbir__simdf_load( of[1], encode+4 );
        stbir__encode_simdf4_unflip( of[0] );
        stbir__encode_simdf4_unflip( of[1] );
        stbir__float_to_half_SIMD( output, (float*)of );
      }
      #endif
      #else
      stbir__float_to_half_SIMD( output, encode );
      #endif
      encode += 8;
      output += 8;
      if ( output <= end_output )
        continue;
      if ( output == ( end_output + 8 ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m8;
    }
    return;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    STBIR_SIMD_NO_UNROLL(output);
    output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
    output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
    output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
    output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    STBIR_NO_UNROLL(output);
    output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
    #if stbir__coder_min_num >= 2
    output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
    #endif
    #if stbir__coder_min_num >= 3
    output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif
}
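
// When no swizzle is active, float->float decode is a straight copy (or a
// no-op when decoding in place), so the whole function below collapses to
// the STBIR_MEMCPY branch at the bottom.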
static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
{
  #ifdef stbir__decode_swizzle
  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
  float * decode_end = (float*) decode + width_times_channels;
  float const * input = (float const *)inputp;

  #ifdef STBIR_SIMD
  if ( width_times_channels >= 16 )
  {
    float const * end_input_m16 = input + width_times_channels - 16;
    decode_end -= 16;
    STBIR_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      STBIR_NO_UNROLL(decode);
      #ifdef stbir__decode_swizzle
      #ifdef STBIR_SIMD8
      {
        stbir__simdf8 of0,of1;
        stbir__simdf8_load( of0, input );
        stbir__simdf8_load( of1, input+8 );
        stbir__decode_simdf8_flip( of0 );
        stbir__decode_simdf8_flip( of1 );
        stbir__simdf8_store( decode, of0 );
        stbir__simdf8_store( decode+8, of1 );
      }
      #else
      {
        stbir__simdf of0,of1,of2,of3;
        stbir__simdf_load( of0, input );
        stbir__simdf_load( of1, input+4 );
        stbir__simdf_load( of2, input+8 );
        stbir__simdf_load( of3, input+12 );
        stbir__decode_simdf4_flip( of0 );
        stbir__decode_simdf4_flip( of1 );
        stbir__decode_simdf4_flip( of2 );
        stbir__decode_simdf4_flip( of3 );
        stbir__simdf_store( decode, of0 );
        stbir__simdf_store( decode+4, of1 );
        stbir__simdf_store( decode+8, of2 );
        stbir__simdf_store( decode+12, of3 );
      }
      #endif
      #endif
      decode += 16;
      input += 16;
      if ( decode <= decode_end )
        continue;
      if ( decode == ( decode_end + 16 ) )
        break;
      decode = decode_end; // backup and do last couple
      input = end_input_m16;
    }
    return decode_end + 16;
  }
  #endif

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  decode += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( decode <= decode_end )
  {
    STBIR_SIMD_NO_UNROLL(decode);
    decode[0-4] = input[stbir__decode_order0];
    decode[1-4] = input[stbir__decode_order1];
    decode[2-4] = input[stbir__decode_order2];
    decode[3-4] = input[stbir__decode_order3];
    decode += 4;
    input += 4;
  }
  decode -= 4;
  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( decode < decode_end )
  {
    STBIR_NO_UNROLL(decode);
    decode[0] = input[stbir__decode_order0];
    #if stbir__coder_min_num >= 2
    decode[1] = input[stbir__decode_order1];
    #endif
    #if stbir__coder_min_num >= 3
    decode[2] = input[stbir__decode_order2];
    #endif
    decode += stbir__coder_min_num;
    input += stbir__coder_min_num;
  }
  #endif
  return decode_end;

  #else

  if ( (void*)decodep != inputp )
    STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
  return decodep + width_times_channels;

  #endif
}
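
// Float encode is likewise a plain copy unless a swizzle or one of the
// optional clamp hooks (STBIR_FLOAT_HIGH_CLAMP / STBIR_FLOAT_LOW_CLAMP) is
// defined. With, say, STBIR_FLOAT_HIGH_CLAMP defined to 1.0f (a hypothetical
// user setting), each scalar store below expands to roughly:
//
//   e = encode[ stbir__encode_order0 ];
//   if ( e > 1.0f ) e = 1.0f;
//   output[0] = e;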
static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
{
  #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LOW_CLAMP) && !defined(stbir__decode_swizzle)

  if ( (void*)outputp != (void*) encode )
    STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );

  #else

  float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;
  float * end_output = ( (float*) output ) + width_times_channels;

  #ifdef STBIR_FLOAT_HIGH_CLAMP
  #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
  #else
  #define stbir_scalar_hi_clamp( v )
  #endif
  #ifdef STBIR_FLOAT_LOW_CLAMP
  #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
  #else
  #define stbir_scalar_lo_clamp( v )
  #endif

  #ifdef STBIR_SIMD

  #ifdef STBIR_FLOAT_HIGH_CLAMP
  const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
  #endif
  #ifdef STBIR_FLOAT_LOW_CLAMP
  const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
  #endif

  if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) )
  {
    float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
    end_output -= ( stbir__simdfX_float_count * 2 );
    STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR
    for(;;)
    {
      stbir__simdfX e0, e1;
      STBIR_SIMD_NO_UNROLL(encode);
      stbir__simdfX_load( e0, encode );
      stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
      #ifdef STBIR_FLOAT_HIGH_CLAMP
      stbir__simdfX_min( e0, e0, high_clamp );
      stbir__simdfX_min( e1, e1, high_clamp );
      #endif
      #ifdef STBIR_FLOAT_LOW_CLAMP
      stbir__simdfX_max( e0, e0, low_clamp );
      stbir__simdfX_max( e1, e1, low_clamp );
      #endif
      stbir__encode_simdfX_unflip( e0 );
      stbir__encode_simdfX_unflip( e1 );
      stbir__simdfX_store( output, e0 );
      stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
      encode += stbir__simdfX_float_count * 2;
      output += stbir__simdfX_float_count * 2;
      if ( output < end_output )
        continue;
      if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
        break;
      output = end_output; // backup and do last couple
      encode = end_encode_m8;
    }
    return;
  }

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    stbir__simdf e0;
    STBIR_NO_UNROLL(encode);
    stbir__simdf_load( e0, encode );
    #ifdef STBIR_FLOAT_HIGH_CLAMP
    stbir__simdf_min( e0, e0, high_clamp );
    #endif
    #ifdef STBIR_FLOAT_LOW_CLAMP
    stbir__simdf_max( e0, e0, low_clamp );
    #endif
    stbir__encode_simdf4_unflip( e0 );
    stbir__simdf_store( output-4, e0 );
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #else

  // try to do blocks of 4 when you can
  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
  output += 4;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  while( output <= end_output )
  {
    float e;
    STBIR_SIMD_NO_UNROLL(encode);
    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
    e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
    output += 4;
    encode += 4;
  }
  output -= 4;
  #endif

  #endif

  // do the remnants
  #if stbir__coder_min_num < 4
  STBIR_NO_UNROLL_LOOP_START
  while( output < end_output )
  {
    float e;
    STBIR_NO_UNROLL(encode);
    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
    #if stbir__coder_min_num >= 2
    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
    #endif
    #if stbir__coder_min_num >= 3
    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
    #endif
    output += stbir__coder_min_num;
    encode += stbir__coder_min_num;
  }
  #endif

  #endif
}
#undef stbir__decode_suffix
#undef stbir__decode_simdf8_flip
#undef stbir__decode_simdf4_flip
#undef stbir__decode_order0
#undef stbir__decode_order1
#undef stbir__decode_order2
#undef stbir__decode_order3
#undef stbir__encode_order0
#undef stbir__encode_order1
#undef stbir__encode_order2
#undef stbir__encode_order3
#undef stbir__encode_simdf8_unflip
#undef stbir__encode_simdf4_unflip
#undef stbir__encode_simdfX_unflip
#undef STBIR__CODER_NAME
#undef stbir__coder_min_num
#undef stbir__decode_swizzle
#undef stbir_scalar_hi_clamp
#undef stbir_scalar_lo_clamp
#undef STB_IMAGE_RESIZE_DO_CODERS
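
// End of the coder specialization. This header is included repeatedly, once
// per pixel layout, with STBIR__CODER_NAME and the order/flip macros defined
// differently each time; every per-layout macro is undefined above so the
// next inclusion starts clean.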
#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)

#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
#else
#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
#endif
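
// STBIR_chans pastes the channel (coefficient) count -- and, for the second
// pass, a _cont suffix -- into each function name. For
// STBIR__vertical_channels == 3, for example,
// STBIR_chans(stbir__vertical_scatter_with_,_coeffs) expands to
// stbir__vertical_scatter_with_3_coeffs, or to
// stbir__vertical_scatter_with_3_coeffs_cont when
// STB_IMAGE_RESIZE_VERTICAL_CONTINUE is defined.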
#if STBIR__vertical_channels >= 1
#define stbIF0( code ) code
#else
#define stbIF0( code )
#endif
#if STBIR__vertical_channels >= 2
#define stbIF1( code ) code
#else
#define stbIF1( code )
#endif
#if STBIR__vertical_channels >= 3
#define stbIF2( code ) code
#else
#define stbIF2( code )
#endif
#if STBIR__vertical_channels >= 4
#define stbIF3( code ) code
#else
#define stbIF3( code )
#endif
#if STBIR__vertical_channels >= 5
#define stbIF4( code ) code
#else
#define stbIF4( code )
#endif
#if STBIR__vertical_channels >= 6
#define stbIF5( code ) code
#else
#define stbIF5( code )
#endif
#if STBIR__vertical_channels >= 7
#define stbIF6( code ) code
#else
#define stbIF6( code )
#endif
#if STBIR__vertical_channels >= 8
#define stbIF7( code ) code
#else
#define stbIF7( code )
#endif
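
// The stbIF0..stbIF7 macros compile a statement in only when
// STBIR__vertical_channels is high enough, so one function body below serves
// every count from 1 to 8. At STBIR__vertical_channels == 2, for instance,
// stbIF0(x) and stbIF1(x) expand to x while stbIF2..stbIF7 expand to nothing,
// so only output0/output1 and c0s/c1s ever exist in the compiled function.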
static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
{
  stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
  stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
  stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
  stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
  stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
  stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
  stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
  stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )

  #ifdef STBIR_SIMD
  {
    stbIF0( stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
    stbIF1( stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
    stbIF2( stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
    stbIF3( stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
    stbIF4( stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
    stbIF5( stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
    stbIF6( stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
    stbIF7( stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
    STBIR_SIMD_NO_UNROLL_LOOP_START
    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
    {
      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
      STBIR_SIMD_NO_UNROLL(output0);
      stbir__simdfX_load( r0, input ); stbir__simdfX_load( r1, input+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
      stbIF0( stbir__simdfX_load( o0, output0 ); stbir__simdfX_load( o1, output0+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 );
              stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
      stbIF1( stbir__simdfX_load( o0, output1 ); stbir__simdfX_load( o1, output1+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 );
              stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
      stbIF2( stbir__simdfX_load( o0, output2 ); stbir__simdfX_load( o1, output2+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 );
              stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
      stbIF3( stbir__simdfX_load( o0, output3 ); stbir__simdfX_load( o1, output3+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 );
              stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
      stbIF4( stbir__simdfX_load( o0, output4 ); stbir__simdfX_load( o1, output4+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 );
              stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
      stbIF5( stbir__simdfX_load( o0, output5 ); stbir__simdfX_load( o1, output5+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 );
              stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
      stbIF6( stbir__simdfX_load( o0, output6 ); stbir__simdfX_load( o1, output6+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 );
              stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
      stbIF7( stbir__simdfX_load( o0, output7 ); stbir__simdfX_load( o1, output7+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 );
              stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
      #else
      stbIF0( stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 );
              stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
      stbIF1( stbir__simdfX_mult( o0, r0, c1 ); stbir__simdfX_mult( o1, r1, c1 ); stbir__simdfX_mult( o2, r2, c1 ); stbir__simdfX_mult( o3, r3, c1 );
              stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
      stbIF2( stbir__simdfX_mult( o0, r0, c2 ); stbir__simdfX_mult( o1, r1, c2 ); stbir__simdfX_mult( o2, r2, c2 ); stbir__simdfX_mult( o3, r3, c2 );
              stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
      stbIF3( stbir__simdfX_mult( o0, r0, c3 ); stbir__simdfX_mult( o1, r1, c3 ); stbir__simdfX_mult( o2, r2, c3 ); stbir__simdfX_mult( o3, r3, c3 );
              stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
      stbIF4( stbir__simdfX_mult( o0, r0, c4 ); stbir__simdfX_mult( o1, r1, c4 ); stbir__simdfX_mult( o2, r2, c4 ); stbir__simdfX_mult( o3, r3, c4 );
              stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
      stbIF5( stbir__simdfX_mult( o0, r0, c5 ); stbir__simdfX_mult( o1, r1, c5 ); stbir__simdfX_mult( o2, r2, c5 ); stbir__simdfX_mult( o3, r3, c5 );
              stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
      stbIF6( stbir__simdfX_mult( o0, r0, c6 ); stbir__simdfX_mult( o1, r1, c6 ); stbir__simdfX_mult( o2, r2, c6 ); stbir__simdfX_mult( o3, r3, c6 );
              stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
      stbIF7( stbir__simdfX_mult( o0, r0, c7 ); stbir__simdfX_mult( o1, r1, c7 ); stbir__simdfX_mult( o2, r2, c7 ); stbir__simdfX_mult( o3, r3, c7 );
              stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
      #endif
      input += (4*stbir__simdfX_float_count);
      stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
    }
    STBIR_SIMD_NO_UNROLL_LOOP_START
    while ( ( (char*)input_end - (char*) input ) >= 16 )
    {
      stbir__simdf o0, r0;
      STBIR_SIMD_NO_UNROLL(output0);
      stbir__simdf_load( r0, input );
      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
      stbIF0( stbir__simdf_load( o0, output0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); )
      stbIF1( stbir__simdf_load( o0, output1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); )
      stbIF2( stbir__simdf_load( o0, output2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); )
      stbIF3( stbir__simdf_load( o0, output3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); )
      stbIF4( stbir__simdf_load( o0, output4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); )
      stbIF5( stbir__simdf_load( o0, output5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); )
      stbIF6( stbir__simdf_load( o0, output6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); )
      stbIF7( stbir__simdf_load( o0, output7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); )
      #else
      stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); )
      stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); )
      stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); )
      stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); )
      stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); )
      stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); )
      stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); )
      stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); )
      #endif
      input += 4;
      stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
    }
  }
  #else
  STBIR_NO_UNROLL_LOOP_START
  while ( ( (char*)input_end - (char*) input ) >= 16 )
  {
    float r0, r1, r2, r3;
    STBIR_NO_UNROLL(input);
    r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
    stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
    stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
    stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
    stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
    stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
    stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
    stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
    stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
    #else
    stbIF0( output0[0] = ( r0 * c0s ); output0[1] = ( r1 * c0s ); output0[2] = ( r2 * c0s ); output0[3] = ( r3 * c0s ); )
    stbIF1( output1[0] = ( r0 * c1s ); output1[1] = ( r1 * c1s ); output1[2] = ( r2 * c1s ); output1[3] = ( r3 * c1s ); )
    stbIF2( output2[0] = ( r0 * c2s ); output2[1] = ( r1 * c2s ); output2[2] = ( r2 * c2s ); output2[3] = ( r3 * c2s ); )
    stbIF3( output3[0] = ( r0 * c3s ); output3[1] = ( r1 * c3s ); output3[2] = ( r2 * c3s ); output3[3] = ( r3 * c3s ); )
    stbIF4( output4[0] = ( r0 * c4s ); output4[1] = ( r1 * c4s ); output4[2] = ( r2 * c4s ); output4[3] = ( r3 * c4s ); )
    stbIF5( output5[0] = ( r0 * c5s ); output5[1] = ( r1 * c5s ); output5[2] = ( r2 * c5s ); output5[3] = ( r3 * c5s ); )
    stbIF6( output6[0] = ( r0 * c6s ); output6[1] = ( r1 * c6s ); output6[2] = ( r2 * c6s ); output6[3] = ( r3 * c6s ); )
    stbIF7( output7[0] = ( r0 * c7s ); output7[1] = ( r1 * c7s ); output7[2] = ( r2 * c7s ); output7[3] = ( r3 * c7s ); )
    #endif
    input += 4;
    stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
  }
  #endif
  STBIR_NO_UNROLL_LOOP_START
  while ( input < input_end )
  {
    float r = input[0];
    STBIR_NO_UNROLL(output0);
    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
    stbIF0( output0[0] += ( r * c0s ); )
    stbIF1( output1[0] += ( r * c1s ); )
    stbIF2( output2[0] += ( r * c2s ); )
    stbIF3( output3[0] += ( r * c3s ); )
    stbIF4( output4[0] += ( r * c4s ); )
    stbIF5( output5[0] += ( r * c5s ); )
    stbIF6( output6[0] += ( r * c6s ); )
    stbIF7( output7[0] += ( r * c7s ); )
    #else
    stbIF0( output0[0] = ( r * c0s ); )
    stbIF1( output1[0] = ( r * c1s ); )
    stbIF2( output2[0] = ( r * c2s ); )
    stbIF3( output3[0] = ( r * c3s ); )
    stbIF4( output4[0] = ( r * c4s ); )
    stbIF5( output5[0] = ( r * c5s ); )
    stbIF6( output6[0] = ( r * c6s ); )
    stbIF7( output7[0] = ( r * c7s ); )
    #endif
    ++input;
    stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
  }
}
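
// Scatter (above) applies one input scanline to up to N output rows, each
// with its own weight; gather (below) is the transpose, combining N input
// rows into one output scanline. In both cases the _cont variants, selected
// via STB_IMAGE_RESIZE_VERTICAL_CONTINUE, accumulate into the existing output
// instead of overwriting it, which is how kernels taller than N rows are
// handled across multiple passes.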
static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
{
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;

  stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
  stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
  stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
  stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
  stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
  stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
  stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
  stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )

  #if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
  // check single channel one weight
  if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
  {
    STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
    return;
  }
  #endif

  #ifdef STBIR_SIMD
  {
    stbIF0( stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
    stbIF1( stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
    stbIF2( stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
    stbIF3( stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
    stbIF4( stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
    stbIF5( stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
    stbIF6( stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
    stbIF7( stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
    STBIR_SIMD_NO_UNROLL_LOOP_START
    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
    {
      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
      STBIR_SIMD_NO_UNROLL(output);

      // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); )
      stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
      stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
      stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
      stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
      stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
      stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
      stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )

      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
      stbIF0( stbir__simdfX_load( o0, output ); stbir__simdfX_load( o1, output+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
              stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); )
      #else
      stbIF0( stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
              stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); )
      #endif
      stbIF1( stbir__simdfX_load( r0, input1 ); stbir__simdfX_load( r1, input1+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); )
      stbIF2( stbir__simdfX_load( r0, input2 ); stbir__simdfX_load( r1, input2+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); )
      stbIF3( stbir__simdfX_load( r0, input3 ); stbir__simdfX_load( r1, input3+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); )
      stbIF4( stbir__simdfX_load( r0, input4 ); stbir__simdfX_load( r1, input4+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); )
      stbIF5( stbir__simdfX_load( r0, input5 ); stbir__simdfX_load( r1, input5+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); )
      stbIF6( stbir__simdfX_load( r0, input6 ); stbir__simdfX_load( r1, input6+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); )
      stbIF7( stbir__simdfX_load( r0, input7 ); stbir__simdfX_load( r1, input7+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
              stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 ); )
      stbir__simdfX_store( output, o0 ); stbir__simdfX_store( output+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
      output += (4*stbir__simdfX_float_count);
      stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
    }
    STBIR_SIMD_NO_UNROLL_LOOP_START
    while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
    {
      stbir__simdf o0, r0;
      STBIR_SIMD_NO_UNROLL(output);
      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
      stbIF0( stbir__simdf_load( o0, output ); stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
      #else
      stbIF0( stbir__simdf_load( r0, input0 ); stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
      #endif
      stbIF1( stbir__simdf_load( r0, input1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
      stbIF2( stbir__simdf_load( r0, input2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
      stbIF3( stbir__simdf_load( r0, input3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
      stbIF4( stbir__simdf_load( r0, input4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
      stbIF5( stbir__simdf_load( r0, input5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
      stbIF6( stbir__simdf_load( r0, input6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
      stbIF7( stbir__simdf_load( r0, input7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
      stbir__simdf_store( output, o0 );
      output += 4;
      stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
    }
  }
  #else
  STBIR_NO_UNROLL_LOOP_START
  while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
  {
    float o0, o1, o2, o3;
    STBIR_NO_UNROLL(output);
    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
    stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
    #else
    stbIF0( o0 = input0[0] * c0s; o1 = input0[1] * c0s; o2 = input0[2] * c0s; o3 = input0[3] * c0s; )
    #endif
    stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
    stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
    stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
    stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
    stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
    stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
    stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
    output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
    output += 4;
    stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
  }
  #endif
  STBIR_NO_UNROLL_LOOP_START
  while ( input0 < input0_end )
  {
    float o0;
    STBIR_NO_UNROLL(output);
    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
    stbIF0( o0 = output[0] + input0[0] * c0s; )
    #else
    stbIF0( o0 = input0[0] * c0s; )
    #endif
    stbIF1( o0 += input1[0] * c1s; )
    stbIF2( o0 += input2[0] * c2s; )
    stbIF3( o0 += input3[0] * c3s; )
    stbIF4( o0 += input4[0] * c4s; )
    stbIF5( o0 += input5[0] * c5s; )
    stbIF6( o0 += input6[0] * c6s; )
    stbIF7( o0 += input7[0] * c7s; )
    output[0] = o0;
    ++output;
    stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
  }
}
#undef stbIF0
#undef stbIF1
#undef stbIF2
#undef stbIF3
#undef stbIF4
#undef stbIF5
#undef stbIF6
#undef stbIF7
#undef STB_IMAGE_RESIZE_DO_VERTICALS
#undef STBIR__vertical_channels
#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
#undef STBIR_strs_join24
#undef STBIR_strs_join14
#undef STBIR_chans
#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
#endif
#else // !STB_IMAGE_RESIZE_DO_VERTICALS

#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)

#ifndef stbir__2_coeff_only
#define stbir__2_coeff_only()     \
    stbir__1_coeff_only();        \
    stbir__1_coeff_remnant(1);
#endif

#ifndef stbir__2_coeff_remnant
#define stbir__2_coeff_remnant( ofs )  \
    stbir__1_coeff_remnant(ofs);       \
    stbir__1_coeff_remnant((ofs)+1);
#endif

#ifndef stbir__3_coeff_only
#define stbir__3_coeff_only()     \
    stbir__2_coeff_only();        \
    stbir__1_coeff_remnant(2);
#endif

#ifndef stbir__3_coeff_remnant
#define stbir__3_coeff_remnant( ofs )  \
    stbir__2_coeff_remnant(ofs);       \
    stbir__1_coeff_remnant((ofs)+2);
#endif

#ifndef stbir__3_coeff_setup
#define stbir__3_coeff_setup()
#endif

#ifndef stbir__4_coeff_start
#define stbir__4_coeff_start()    \
    stbir__2_coeff_only();        \
    stbir__2_coeff_remnant(2);
#endif

#ifndef stbir__4_coeff_continue_from_4
#define stbir__4_coeff_continue_from_4( ofs )  \
    stbir__2_coeff_remnant(ofs);               \
    stbir__2_coeff_remnant((ofs)+2);
#endif

#ifndef stbir__store_output_tiny
#define stbir__store_output_tiny stbir__store_output
#endif
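
// The horizontal kernels are assembled from the 1/2/4-coefficient building
// blocks above. The #ifndef guards matter: a SIMD layer may already have
// provided tuned versions, and these scalar compositions are only the
// fallbacks. As a concrete expansion, stbir__3_coeff_only() becomes
//
//   stbir__1_coeff_only();      // coefficient 0
//   stbir__1_coeff_remnant(1);  // coefficient 1
//   stbir__1_coeff_remnant(2);  // coefficient 2
//
// i.e. three single-coefficient taps accumulated into the same output texels.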
static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__1_coeff_only();
    stbir__store_output_tiny();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__2_coeff_only();
    stbir__store_output_tiny();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__3_coeff_only();
    stbir__store_output_tiny();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__1_coeff_remnant(4);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__2_coeff_remnant(4);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  stbir__3_coeff_setup();
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__3_coeff_remnant(4);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__4_coeff_continue_from_4(4);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__4_coeff_continue_from_4(4);
    stbir__1_coeff_remnant(8);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__4_coeff_continue_from_4(4);
    stbir__2_coeff_remnant(8);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  stbir__3_coeff_setup();
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__4_coeff_continue_from_4(4);
    stbir__3_coeff_remnant(8);
    stbir__store_output();
  } while ( output < output_end );
}

static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    stbir__4_coeff_continue_from_4(4);
    stbir__4_coeff_continue_from_4(8);
    stbir__store_output();
  } while ( output < output_end );
}
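
// Kernels wider than 12 coefficients fall through to the four _mod functions
// below, one per residue of the coefficient count k modulo 4. Each does a
// 4-coefficient start, then n more groups of four, then the matching 1/2/3
// coefficient remnant, where n = ( k - base + 3 ) >> 2 with base = 4 plus the
// remnant size. For example, k == 13 lands in _mod1: n = (13-5+3)>>2 = 2, and
// 4 + 2*4 + 1 == 13 coefficients are consumed.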
  9034. static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
  9035. {
  9036. float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  9037. float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  9038. STBIR_SIMD_NO_UNROLL_LOOP_START
  9039. do {
  9040. float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
  9041. int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
  9042. float const * hc = horizontal_coefficients;
  9043. stbir__4_coeff_start();
  9044. STBIR_SIMD_NO_UNROLL_LOOP_START
  9045. do {
  9046. hc += 4;
  9047. decode += STBIR__horizontal_channels * 4;
  9048. stbir__4_coeff_continue_from_4( 0 );
  9049. --n;
  9050. } while ( n > 0 );
  9051. stbir__store_output();
  9052. } while ( output < output_end );
  9053. }
  9054. static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
  9055. {
  9056. float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  9057. float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  9058. STBIR_SIMD_NO_UNROLL_LOOP_START
  9059. do {
  9060. float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
  9061. int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
  9062. float const * hc = horizontal_coefficients;
  9063. stbir__4_coeff_start();
  9064. STBIR_SIMD_NO_UNROLL_LOOP_START
  9065. do {
  9066. hc += 4;
  9067. decode += STBIR__horizontal_channels * 4;
  9068. stbir__4_coeff_continue_from_4( 0 );
  9069. --n;
  9070. } while ( n > 0 );
  9071. stbir__1_coeff_remnant( 4 );
  9072. stbir__store_output();
  9073. } while ( output < output_end );
  9074. }
static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    STBIR_SIMD_NO_UNROLL_LOOP_START
    do {
      hc += 4;
      decode += STBIR__horizontal_channels * 4;
      stbir__4_coeff_continue_from_4( 0 );
      --n;
    } while ( n > 0 );
    stbir__2_coeff_remnant( 4 );
    stbir__store_output();
  } while ( output < output_end );
}
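
// As mod0, with a trailing 3-coefficient remnant; stbir__3_coeff_setup runs
// once per call to prepare whatever state the 3-wide kernel needs.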
static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
{
  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
  stbir__3_coeff_setup();
  STBIR_SIMD_NO_UNROLL_LOOP_START
  do {
    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
    float const * hc = horizontal_coefficients;
    stbir__4_coeff_start();
    STBIR_SIMD_NO_UNROLL_LOOP_START
    do {
      hc += 4;
      decode += STBIR__horizontal_channels * 4;
      stbir__4_coeff_continue_from_4( 0 );
      --n;
    } while ( n > 0 );
    stbir__3_coeff_remnant( 4 );
    stbir__store_output();
  } while ( output < output_end );
}
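
// Kernel table for coefficient counts > 12, indexed by ( count & 3 ).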
static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
{
  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),
};
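
// Kernel table for coefficient counts 1..12, indexed by ( count - 1 ).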
static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
{
  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),
  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),
};
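
// Illustrative only, not part of the library: a minimal sketch of how a
// caller could select one of the kernels above from the widest coefficient
// count of a sample. The real selection logic lives elsewhere in this file;
// the helper name and the assumption that every count > 12 reaches the
// mod-4 table are ours, hence the #if 0.
#if 0
static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__example_pick_gather_,_channels)( int coefficient_count )
{
  if ( coefficient_count <= 12 )
    return STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[ coefficient_count - 1 ];                  // exact-count kernels
  else
    return STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[ coefficient_count & 3 ];    // mod-4 kernels
}
#endif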
#undef STBIR__horizontal_channels
#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
#undef stbir__1_coeff_only
#undef stbir__1_coeff_remnant
#undef stbir__2_coeff_only
#undef stbir__2_coeff_remnant
#undef stbir__3_coeff_only
#undef stbir__3_coeff_remnant
#undef stbir__3_coeff_setup
#undef stbir__4_coeff_start
#undef stbir__4_coeff_continue_from_4
#undef stbir__store_output
#undef stbir__store_output_tiny
#undef STBIR_chans

#endif // HORIZONTALS

#undef STBIR_strs_join2
#undef STBIR_strs_join1

#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS

/*
------------------------------------------------------------------------------
This software is available under 2 licenses -- choose whichever you prefer.
------------------------------------------------------------------------------
ALTERNATIVE A - MIT License
Copyright (c) 2017 Sean Barrett
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
ALTERNATIVE B - Public Domain (www.unlicense.org)
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
software, either in source code form or as a compiled binary, for any purpose,
commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this
software dedicate any and all copyright interest in the software to the public
domain. We make this dedication for the benefit of the public at large and to
the detriment of our heirs and successors. We intend this dedication to be an
overt act of relinquishment in perpetuity of all present and future rights to
this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------------
*/