| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
7771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886 |
- #ifdef OC_ARM_ASM
- @********************************************************************
- @* *
- @* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- @* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- @* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- @* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- @* *
- @* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
- @* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- @* *
- @********************************************************************
- @ Original implementation:
- @ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
- @ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
- @********************************************************************
- .text; .p2align 2
- .global _oc_idct8x8_1_arm
- .global _oc_idct8x8_arm
- @ .type oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
- _oc_idct8x8_1_arm:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_uint16_t _dc
- @ DC-only inverse transform: fills the whole 8x8 output block _y (64
- @ ogg_int16_t, 128 bytes) with the single value _dc. The value is
- @ replicated into both halfwords of a word, copied to three more
- @ registers, and stored 16 bytes per STMIA (8 stores total).
- @ Clobbers r1-r3 and r12; r0 is advanced past the block.
- ORR r1, r1, r1, LSL #16
- MOV r2, r1
- MOV r3, r1
- MOV r12,r1
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- STMIA r0!,{r1,r2,r3,r12}
- MOV PC, r14
- @ .size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm @ ENDP
- @ .type oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
- _oc_idct8x8_arm:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_int16_t *_x
- @ r2 = int _last_zzi
- @ Full 8x8 inverse DCT entry point. _last_zzi is one past the index of
- @ the last nonzero coefficient in zig-zag order; small values mean only
- @ a few low-frequency coefficients are nonzero, so cheaper pruned
- @ transforms can be dispatched to instead of the full one below.
- CMP r2, #3
- BLE oc_idct8x8_3_arm
- CMP r2, #6
- BLE oc_idct8x8_6_arm
- CMP r2, #10
- BLE oc_idct8x8_10_arm
- @ General case: 8 full 8-point row transforms into a 64-entry temp
- @ buffer on the stack, then 8 full 8-point column transforms (the
- @ _down variant, which applies the final +8 bias and >>4 shift) into
- @ the destination. The saved r0 shares the stack slot below the temp
- @ buffer, hence the #-4 pre-index and the r13+4 write pointer.
- oc_idct8x8_slow_arm:
- STMFD r13!,{r4-r11,r14}
- SUB r13,r13,#64*2
- @ Row transforms
- STR r0, [r13,#-4]!
- ADD r0, r13, #4 @ Write to temp storage.
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- BL idct8core_arm
- LDR r0, [r13], #4 @ Write to the final destination.
- SUB r2, r1, #8*16
- @ Clear input data for next block.
- MOV r4, #0
- MOV r5, #0
- MOV r6, #0
- MOV r7, #0
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- STMIA r2!,{r4,r5,r6,r7}
- MOV r1, r13 @ And read from temp storage.
- @ Column transforms
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- BL idct8core_down_arm
- ADD r13,r13,#64*2
- LDMFD r13!,{r4-r11,PC}
- @ .size oc_idct8x8_arm, .-oc_idct8x8_arm @ ENDP
- @ .type oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
- oc_idct8x8_10_arm:
- @ Pruned transform for _last_zzi <= 10: only the first 10 zig-zag
- @ coefficients (confined to the first four rows) can be nonzero, so
- @ rows 0-3 get 4-, 3-, 2- and 1-term row transforms respectively and
- @ the remaining rows are implicitly zero. The column pass then only
- @ needs the 4-term kernel on each of the 8 columns.
- STMFD r13!,{r4-r11,r14}
- SUB r13,r13,#64*2
- @ Row transforms
- MOV r2, r0
- MOV r0, r13 @ Write to temp storage.
- BL idct4core_arm
- BL idct3core_arm
- BL idct2core_arm
- BL idct1core_arm
- @ Clear input data for next block.
- @ Zeroes the words covering the coefficients the row transforms
- @ consumed (r1 was advanced 4 rows = 4*16 bytes by the cores above).
- MOV r4, #0
- STR r4, [r1,#-4*16]!
- STR r4, [r1,#4]
- STR r4, [r1,#16]
- STR r4, [r1,#20]
- STR r4, [r1,#32]
- STR r4, [r1,#48]
- MOV r1, r13 @ Read from temp storage.
- MOV r0, r2 @ Write to the final destination
- oc_idct8x8_10_arm_cols:
- @ Column transforms
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- BL idct4core_down_arm
- ADD r13,r13,#64*2
- LDMFD r13!,{r4-r11,PC}
- @ .size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm @ ENDP
- @ .type oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
- oc_idct8x8_6_arm:
- @ Pruned transform for _last_zzi <= 6: nonzero coefficients are
- @ confined to the first three rows, so rows 0-2 get 3-, 2- and 1-term
- @ row transforms and the column pass uses the 3-term kernel on each
- @ of the 8 columns. r8/r12 are not used here, hence the smaller
- @ register save list.
- STMFD r13!,{r4-r7,r9-r11,r14}
- SUB r13,r13,#64*2
- @ Row transforms
- MOV r2, r0
- MOV r0, r13 @ Write to temp storage.
- BL idct3core_arm
- BL idct2core_arm
- BL idct1core_arm
- @ Clear input data for next block.
- @ Zeroes the words covering the coefficients the row transforms
- @ consumed (r1 was advanced 3 rows = 3*16 bytes by the cores above).
- MOV r4, #0
- STR r4, [r1,#-3*16]!
- STR r4, [r1,#4]
- STR r4, [r1,#16]
- STR r4, [r1,#32]
- MOV r1, r13 @ Read from temp storage.
- MOV r0, r2 @ Write to the final destination
- @ Column transforms
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- BL idct3core_down_arm
- ADD r13,r13,#64*2
- LDMFD r13!,{r4-r7,r9-r11,PC}
- @ .size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm @ ENDP
- @ .type oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
- oc_idct8x8_3_arm:
- @ Pruned transform for _last_zzi <= 3: nonzero coefficients are
- @ confined to the first two rows, so rows 0-1 get 2- and 1-term row
- @ transforms and the column pass uses the 2-term kernel on each of
- @ the 8 columns.
- STMFD r13!,{r4-r7,r9-r11,r14}
- SUB r13,r13,#64*2
- @ Row transforms
- MOV r2, r0
- MOV r0, r13 @ Write to temp storage.
- BL idct2core_arm
- BL idct1core_arm
- @ Clear input data for next block.
- @ Zeroes the words covering the coefficients the row transforms
- @ consumed (r1 was advanced 2 rows = 2*16 bytes by the cores above).
- MOV r4, #0
- STR r4, [r1,#-2*16]!
- STR r4, [r1,#16]
- MOV r1, r13 @ Read from temp storage.
- MOV r0, r2 @ Write to the final destination
- @ Column transforms
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- BL idct2core_down_arm
- ADD r13,r13,#64*2
- LDMFD r13!,{r4-r7,r9-r11,PC}
- @ .size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm @ ENDP
- @ .type idct1core_arm, %function; idct1core_arm: @ PROC
- idct1core_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ 1-term transform: only x[0] is nonzero, so every output of this line
- @ is (OC_C4S4*x[0])>>16 (0xB505 == OC_C4S4, built with MOV/ORR since it
- @ does not fit an immediate). Advances r1 by 16 bytes (next input row)
- @ and r0 by 2 bytes; outputs are stored with a 16-byte stride, so each
- @ call fills one column of the destination. Clobbers r3, r12.
- LDRSH r3, [r1], #16
- MOV r12,#0x05
- ORR r12,r12,#0xB500
- MUL r3, r12, r3
- @ Stall ?
- MOV r3, r3, ASR #16
- STRH r3, [r0], #2
- STRH r3, [r0, #14]
- STRH r3, [r0, #30]
- STRH r3, [r0, #46]
- STRH r3, [r0, #62]
- STRH r3, [r0, #78]
- STRH r3, [r0, #94]
- STRH r3, [r0, #110]
- MOV PC,R14
- @ .size idct1core_arm, .-idct1core_arm @ ENDP
- @ .type idct2core_arm, %function; idct2core_arm: @ PROC
- idct2core_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ 2-term transform: only x[0] and x[1] are nonzero. The OC_C*S*
- @ constants are loaded from literal-pool words defined elsewhere in
- @ this file. Advances r1 by 16 bytes and r0 by 2 bytes; outputs are
- @ stored with a 16-byte stride (one destination column per call).
- @ Clobbers r3, r9-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r12,OC_C4S4
- LDRSH r11,[r1, #-14] @ r11= x[1]
- LDR r3, OC_C7S1
- MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r10,OC_C1S7
- MUL r3, r11,r3 @ r3 = t[4]<<16 = OC_C7S1*x[1]
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r11,r10,r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
- MOV r3, r3, ASR #16 @ r3 = t[4]
- MUL r10,r12,r3 @ r10= t[5]<<16 = OC_C4S4*t[4]
- MOV r11,r11,ASR #16 @ r11= t[7]
- MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
- MOV r10,r10,ASR #16 @ r10= t[5]
- ADD r12,r9,r12,ASR #16 @ r12= t[0]+t[6]
- ADD r12,r12,r10 @ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
- SUB r10,r12,r10,LSL #1 @ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
- ADD r3, r3, r9 @ r3 = t[0]+t[4]
- ADD r11,r11,r9 @ r11= t[0]+t[7]
- STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
- STRH r12,[r0, #14] @ y[1] = t[0]+t[6]
- STRH r10,[r0, #30] @ y[2] = t[0]+t[5]
- STRH r3, [r0, #46] @ y[3] = t[0]+t[4]
- RSB r3, r3, r9, LSL #1 @ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
- RSB r10,r10,r9, LSL #1 @ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
- RSB r12,r12,r9, LSL #1 @ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
- RSB r11,r11,r9, LSL #1 @ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
- STRH r3, [r0, #62] @ y[4] = t[0]-t[4]
- STRH r10,[r0, #78] @ y[5] = t[0]-t[5]
- STRH r12,[r0, #94] @ y[6] = t[0]-t[6]
- STRH r11,[r0, #110] @ y[7] = t[0]-t[7]
- MOV PC,r14
- @ .size idct2core_arm, .-idct2core_arm @ ENDP
- @ .type idct2core_down_arm, %function; idct2core_down_arm: @ PROC
- idct2core_down_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column-pass variant of idct2core_arm: identical butterfly, but folds
- @ in the rounding bias (+8) and the final arithmetic shift right by 4
- @ before storing. Clobbers r3-r7 and r9-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r12,OC_C4S4
- LDRSH r11,[r1, #-14] @ r11= x[1]
- LDR r3, OC_C7S1
- MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r10,OC_C1S7
- MUL r3, r11,r3 @ r3 = t[4]<<16 = OC_C7S1*x[1]
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r11,r10,r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
- ADD r9, r9, #8 @ r9 = t[0]+8
- MOV r3, r3, ASR #16 @ r3 = t[4]
- MUL r10,r12,r3 @ r10= t[5]<<16 = OC_C4S4*t[4]
- MOV r11,r11,ASR #16 @ r11= t[7]
- MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
- MOV r10,r10,ASR #16 @ r10= t[5]
- ADD r12,r9,r12,ASR #16 @ r12= t[0]+t[6]+8
- ADD r12,r12,r10 @ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
- SUB r10,r12,r10,LSL #1 @ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
- ADD r3, r3, r9 @ r3 = t[0]+t[4]+8
- ADD r11,r11,r9 @ r11= t[0]+t[7]+8
- @ TODO: This is wrong.
- @ The C code truncates to 16 bits by storing to RAM and doing the
- @ shifts later; we've got an extra 4 bits here.
- MOV r4, r11,ASR #4
- MOV r5, r12,ASR #4
- MOV r6, r10,ASR #4
- MOV r7, r3, ASR #4
- RSB r3, r3, r9, LSL #1 @r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
- RSB r10,r10,r9, LSL #1 @r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
- RSB r12,r12,r9, LSL #1 @r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
- RSB r11,r11,r9, LSL #1 @r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
- MOV r3, r3, ASR #4
- MOV r10,r10,ASR #4
- MOV r12,r12,ASR #4
- MOV r11,r11,ASR #4
- STRH r4, [r0], #2 @ y[0] = t[0]+t[7]
- STRH r5, [r0, #14] @ y[1] = t[0]+t[6]
- STRH r6, [r0, #30] @ y[2] = t[0]+t[5]
- STRH r7, [r0, #46] @ y[3] = t[0]+t[4]
- STRH r3, [r0, #62] @ y[4] = t[0]-t[4]
- STRH r10,[r0, #78] @ y[5] = t[0]-t[5]
- STRH r12,[r0, #94] @ y[6] = t[0]-t[6]
- STRH r11,[r0, #110] @ y[7] = t[0]-t[7]
- MOV PC,r14
- @ .size idct2core_down_arm, .-idct2core_down_arm @ ENDP
- @ .type idct3core_arm, %function; idct3core_arm: @ PROC
- idct3core_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ 3-term transform: only x[0], x[1] and x[2] are nonzero. Advances r1
- @ by 16 bytes and r0 by 2 bytes; outputs are stored with a 16-byte
- @ stride (one destination column per call). Clobbers r3-r6, r9-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r12,OC_C4S4 @ r12= OC_C4S4
- LDRSH r3, [r1, #-12] @ r3 = x[2]
- LDR r10,OC_C6S2 @ r10= OC_C6S2
- MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r4, OC_C2S6 @ r4 = OC_C2S6
- MUL r10,r3, r10 @ r10= t[2]<<16 = OC_C6S2*x[2]
- LDRSH r11,[r1, #-14] @ r11= x[1]
- MUL r3, r4, r3 @ r3 = t[3]<<16 = OC_C2S6*x[2]
- LDR r4, OC_C7S1 @ r4 = OC_C7S1
- LDR r5, OC_C1S7 @ r5 = OC_C1S7
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r4, r11,r4 @ r4 = t[4]<<16 = OC_C7S1*x[1]
- ADD r3, r9, r3, ASR #16 @ r3 = t[0]+t[3]
- MUL r11,r5, r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
- MOV r4, r4, ASR #16 @ r4 = t[4]
- MUL r5, r12,r4 @ r5 = t[5]<<16 = OC_C4S4*t[4]
- MOV r11,r11,ASR #16 @ r11= t[7]
- MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
- ADD r10,r9, r10,ASR #16 @ r10= t[1] = t[0]+t[2]
- RSB r6, r10,r9, LSL #1 @ r6 = t[2] = t[0]-t[2]
- @ r3 = t2[0] = t[0]+t[3]
- RSB r9, r3, r9, LSL #1 @ r9 = t2[3] = t[0]-t[3]
- MOV r12,r12,ASR #16 @ r12= t[6]
- ADD r5, r12,r5, ASR #16 @ r5 = t2[6] = t[6]+t[5]
- RSB r12,r5, r12,LSL #1 @ r12= t2[5] = t[6]-t[5]
- ADD r11,r3, r11 @ r11= t2[0]+t[7]
- ADD r5, r10,r5 @ r5 = t[1]+t2[6]
- ADD r12,r6, r12 @ r12= t[2]+t2[5]
- ADD r4, r9, r4 @ r4 = t2[3]+t[4]
- STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
- STRH r5, [r0, #14] @ y[1] = t[1]+t2[6]
- STRH r12,[r0, #30] @ y[2] = t[2]+t2[5]
- STRH r4, [r0, #46] @ y[3] = t2[3]+t[4]
- RSB r11,r11,r3, LSL #1 @ r11= t2[0] - t[7]
- RSB r5, r5, r10,LSL #1 @ r5 = t[1] - t2[6]
- RSB r12,r12,r6, LSL #1 @ r6 = t[2] - t2[5]
- RSB r4, r4, r9, LSL #1 @ r4 = t2[3] - t[4]
- STRH r4, [r0, #62] @ y[4] = t2[3]-t[4]
- STRH r12,[r0, #78] @ y[5] = t[2]-t2[5]
- STRH r5, [r0, #94] @ y[6] = t[1]-t2[6]
- STRH r11,[r0, #110] @ y[7] = t2[0]-t[7]
- MOV PC,R14
- @ .size idct3core_arm, .-idct3core_arm @ ENDP
- @ .type idct3core_down_arm, %function; idct3core_down_arm: @ PROC
- idct3core_down_arm:
- @ Column-pass variant of idct3core_arm: identical butterfly, but folds
- @ in the rounding bias (+8, added to t[0] so it propagates to every
- @ output) and the final arithmetic shift right by 4 before storing.
- @ Clobbers r3-r6 and r9-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r12,OC_C4S4 @ r12= OC_C4S4
- LDRSH r3, [r1, #-12] @ r3 = x[2]
- LDR r10,OC_C6S2 @ r10= OC_C6S2
- MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r4, OC_C2S6 @ r4 = OC_C2S6
- MUL r10,r3, r10 @ r10= t[2]<<16 = OC_C6S2*x[2]
- LDRSH r11,[r1, #-14] @ r11= x[1]
- MUL r3, r4, r3 @ r3 = t[3]<<16 = OC_C2S6*x[2]
- LDR r4, OC_C7S1 @ r4 = OC_C7S1
- LDR r5, OC_C1S7 @ r5 = OC_C1S7
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r4, r11,r4 @ r4 = t[4]<<16 = OC_C7S1*x[1]
- ADD r9, r9, #8 @ r9 = t[0]+8
- MUL r11,r5, r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
- ADD r3, r9, r3, ASR #16 @ r3 = t[0]+t[3]+8
- MOV r4, r4, ASR #16 @ r4 = t[4]
- MUL r5, r12,r4 @ r5 = t[5]<<16 = OC_C4S4*t[4]
- MOV r11,r11,ASR #16 @ r11= t[7]
- MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
- ADD r10,r9, r10,ASR #16 @ r10= t[1]+8 = t[0]+t[2]+8
- RSB r6, r10,r9, LSL #1 @ r6 = t[2]+8 = t[0]-t[2]+8
- @ r3 = t2[0]+8 = t[0]+t[3]+8
- RSB r9, r3, r9, LSL #1 @ r9 = t2[3]+8 = t[0]-t[3]+8
- MOV r12,r12,ASR #16 @ r12= t[6]
- ADD r5, r12,r5, ASR #16 @ r5 = t2[6] = t[6]+t[5]
- RSB r12,r5, r12,LSL #1 @ r12= t2[5] = t[6]-t[5]
- ADD r11,r3, r11 @ r11= t2[0]+t[7] +8
- ADD r5, r10,r5 @ r5 = t[1] +t2[6]+8
- ADD r12,r6, r12 @ r12= t[2] +t2[5]+8
- ADD r4, r9, r4 @ r4 = t2[3]+t[4] +8
- RSB r3, r11,r3, LSL #1 @ r11= t2[0] - t[7] + 8
- RSB r10,r5, r10,LSL #1 @ r5 = t[1] - t2[6] + 8
- RSB r6, r12,r6, LSL #1 @ r6 = t[2] - t2[5] + 8
- RSB r9, r4, r9, LSL #1 @ r4 = t2[3] - t[4] + 8
- @ TODO: This is wrong.
- @ The C code truncates to 16 bits by storing to RAM and doing the
- @ shifts later; we've got an extra 4 bits here.
- MOV r11,r11,ASR #4
- MOV r5, r5, ASR #4
- MOV r12,r12,ASR #4
- MOV r4, r4, ASR #4
- MOV r9, r9, ASR #4
- MOV r6, r6, ASR #4
- MOV r10,r10,ASR #4
- MOV r3, r3, ASR #4
- STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
- STRH r5, [r0, #14] @ y[1] = t[1]+t2[6]
- STRH r12,[r0, #30] @ y[2] = t[2]+t2[5]
- STRH r4, [r0, #46] @ y[3] = t2[3]+t[4]
- STRH r9, [r0, #62] @ y[4] = t2[3]-t[4]
- STRH r6, [r0, #78] @ y[5] = t[2]-t2[5]
- STRH r10,[r0, #94] @ y[6] = t[1]-t2[6]
- STRH r3, [r0, #110] @ y[7] = t2[0]-t[7]
- MOV PC,R14
- @ .size idct3core_down_arm, .-idct3core_down_arm @ ENDP
- @ .type idct4core_arm, %function; idct4core_arm: @ PROC
- idct4core_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ 4-term transform: only x[0]..x[3] are nonzero. Advances r1 by 16
- @ bytes and r0 by 2 bytes; outputs are stored with a 16-byte stride
- @ (one destination column per call). Clobbers r3-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r10,OC_C4S4 @ r10= OC_C4S4
- LDRSH r12,[r1, #-12] @ r12= x[2]
- LDR r4, OC_C6S2 @ r4 = OC_C6S2
- MUL r9, r10,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r5, OC_C2S6 @ r5 = OC_C2S6
- MUL r4, r12,r4 @ r4 = t[2]<<16 = OC_C6S2*x[2]
- LDRSH r3, [r1, #-14] @ r3 = x[1]
- MUL r5, r12,r5 @ r5 = t[3]<<16 = OC_C2S6*x[2]
- LDR r6, OC_C7S1 @ r6 = OC_C7S1
- LDR r12,OC_C1S7 @ r12= OC_C1S7
- LDRSH r11,[r1, #-10] @ r11= x[3]
- MUL r6, r3, r6 @ r6 = t[4]<<16 = OC_C7S1*x[1]
- LDR r7, OC_C5S3 @ r7 = OC_C5S3
- MUL r3, r12,r3 @ r3 = t[7]<<16 = OC_C1S7*x[1]
- LDR r8, OC_C3S5 @ r8 = OC_C3S5
- MUL r7, r11,r7 @ r7 = -t[5]<<16 = OC_C5S3*x[3]
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r11,r8, r11 @ r11= t[6]<<16 = OC_C3S5*x[3]
- MOV r6, r6, ASR #16 @ r6 = t[4]
- @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
- @ before multiplying, not after (this is not equivalent)
- SUB r7, r6, r7, ASR #16 @ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
- RSB r6, r7, r6, LSL #1 @ r6 = t[4]-t[5]
- MUL r6, r10,r6 @ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
- MOV r3, r3, ASR #16 @ r3 = t[7]
- ADD r11,r3, r11,ASR #16 @ r11= t2[7]=t[7]+t[6]
- RSB r3, r11,r3, LSL #1 @ r3 = t[7]-t[6]
- MUL r3, r10,r3 @ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
- ADD r4, r9, r4, ASR #16 @ r4 = t[1] = t[0] + t[2]
- RSB r10,r4, r9, LSL #1 @ r10= t[2] = t[0] - t[2]
- ADD r5, r9, r5, ASR #16 @ r5 = t[0] = t[0] + t[3]
- RSB r9, r5, r9, LSL #1 @ r9 = t[3] = t[0] - t[3]
- MOV r3, r3, ASR #16 @ r3 = t2[6]
- ADD r6, r3, r6, ASR #16 @ r6 = t3[6] = t2[6]+t2[5]
- RSB r3, r6, r3, LSL #1 @ r3 = t3[5] = t2[6]-t2[5]
- ADD r11,r5, r11 @ r11= t[0]+t2[7]
- ADD r6, r4, r6 @ r6 = t[1]+t3[6]
- ADD r3, r10,r3 @ r3 = t[2]+t3[5]
- ADD r7, r9, r7 @ r7 = t[3]+t2[4]
- STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
- STRH r6, [r0, #14] @ y[1] = t[1]+t2[6]
- STRH r3, [r0, #30] @ y[2] = t[2]+t2[5]
- STRH r7, [r0, #46] @ y[3] = t2[3]+t[4]
- RSB r11,r11,r5, LSL #1 @ r11= t[0]-t2[7]
- RSB r6, r6, r4, LSL #1 @ r6 = t[1]-t3[6]
- RSB r3, r3, r10,LSL #1 @ r3 = t[2]-t3[5]
- RSB r7, r7, r9, LSL #1 @ r7 = t[3]-t2[4]
- STRH r7, [r0, #62] @ y[4] = t2[3]-t[4]
- STRH r3, [r0, #78] @ y[5] = t[2]-t2[5]
- STRH r6, [r0, #94] @ y[6] = t[1]-t2[6]
- STRH r11, [r0, #110] @ y[7] = t2[0]-t[7]
- MOV PC,r14
- @ .size idct4core_arm, .-idct4core_arm @ ENDP
- @ .type idct4core_down_arm, %function; idct4core_down_arm: @ PROC
- idct4core_down_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column-pass variant of idct4core_arm: identical butterfly, but folds
- @ in the rounding bias (+8, added to t[0] so it propagates to every
- @ output) and the final arithmetic shift right by 4 before storing.
- @ Clobbers r3-r12.
- LDRSH r9, [r1], #16 @ r9 = x[0]
- LDR r10,OC_C4S4 @ r10= OC_C4S4
- LDRSH r12,[r1, #-12] @ r12= x[2]
- LDR r4, OC_C6S2 @ r4 = OC_C6S2
- MUL r9, r10,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
- LDR r5, OC_C2S6 @ r5 = OC_C2S6
- MUL r4, r12,r4 @ r4 = t[2]<<16 = OC_C6S2*x[2]
- LDRSH r3, [r1, #-14] @ r3 = x[1]
- MUL r5, r12,r5 @ r5 = t[3]<<16 = OC_C2S6*x[2]
- LDR r6, OC_C7S1 @ r6 = OC_C7S1
- LDR r12,OC_C1S7 @ r12= OC_C1S7
- LDRSH r11,[r1, #-10] @ r11= x[3]
- MUL r6, r3, r6 @ r6 = t[4]<<16 = OC_C7S1*x[1]
- LDR r7, OC_C5S3 @ r7 = OC_C5S3
- MUL r3, r12,r3 @ r3 = t[7]<<16 = OC_C1S7*x[1]
- LDR r8, OC_C3S5 @ r8 = OC_C3S5
- MUL r7, r11,r7 @ r7 = -t[5]<<16 = OC_C5S3*x[3]
- MOV r9, r9, ASR #16 @ r9 = t[0]
- MUL r11,r8, r11 @ r11= t[6]<<16 = OC_C3S5*x[3]
- MOV r6, r6, ASR #16 @ r6 = t[4]
- @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
- @ before multiplying, not after (this is not equivalent)
- SUB r7, r6, r7, ASR #16 @ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
- RSB r6, r7, r6, LSL #1 @ r6 = t[4]-t[5]
- MUL r6, r10,r6 @ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
- MOV r3, r3, ASR #16 @ r3 = t[7]
- ADD r11,r3, r11,ASR #16 @ r11= t2[7]=t[7]+t[6]
- RSB r3, r11,r3, LSL #1 @ r3 = t[7]-t[6]
- ADD r9, r9, #8 @ r9 = t[0]+8
- MUL r3, r10,r3 @ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
- ADD r4, r9, r4, ASR #16 @ r4 = t[1] = t[0] + t[2] + 8
- RSB r10,r4, r9, LSL #1 @ r10= t[2] = t[0] - t[2] + 8
- ADD r5, r9, r5, ASR #16 @ r5 = t[0] = t[0] + t[3] + 8
- RSB r9, r5, r9, LSL #1 @ r9 = t[3] = t[0] - t[3] + 8
- MOV r3, r3, ASR #16 @ r3 = t2[6]
- ADD r6, r3, r6, ASR #16 @ r6 = t3[6] = t2[6]+t2[5]
- RSB r3, r6, r3, LSL #1 @ r3 = t3[5] = t2[6]-t2[5]
- ADD r5, r5, r11 @ r5 = t[0]+t2[7]+8
- ADD r4, r4, r6 @ r4 = t[1]+t3[6]+8
- ADD r10,r10,r3 @ r10= t[2]+t3[5]+8
- ADD r9, r9, r7 @ r9 = t[3]+t2[4]+8
- SUB r11,r5, r11,LSL #1 @ r11= t[0]-t2[7]+8
- SUB r6, r4, r6, LSL #1 @ r6 = t[1]-t3[6]+8
- SUB r3, r10,r3, LSL #1 @ r3 = t[2]-t3[5]+8
- SUB r7, r9, r7, LSL #1 @ r7 = t[3]-t2[4]+8
- @ TODO: This is wrong.
- @ The C code truncates to 16 bits by storing to RAM and doing the
- @ shifts later; we've got an extra 4 bits here.
- MOV r11,r11,ASR #4
- MOV r6, r6, ASR #4
- MOV r3, r3, ASR #4
- MOV r7, r7, ASR #4
- MOV r9, r9, ASR #4
- MOV r10,r10,ASR #4
- MOV r4, r4, ASR #4
- MOV r5, r5, ASR #4
- STRH r5,[r0], #2 @ y[0] = t[0]+t[7]
- STRH r4, [r0, #14] @ y[1] = t[1]+t2[6]
- STRH r10,[r0, #30] @ y[2] = t[2]+t2[5]
- STRH r9, [r0, #46] @ y[3] = t2[3]+t[4]
- STRH r7, [r0, #62] @ y[4] = t2[3]-t[4]
- STRH r3, [r0, #78] @ y[5] = t[2]-t2[5]
- STRH r6, [r0, #94] @ y[6] = t[1]-t2[6]
- STRH r11,[r0, #110] @ y[7] = t2[0]-t[7]
- MOV PC,r14
- @ .size idct4core_down_arm, .-idct4core_down_arm @ ENDP
- @ .type idct8core_arm, %function; idct8core_arm: @ PROC
- idct8core_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Full 8-point transform of one row (all of x[0]..x[7] may be
- @ nonzero). r1 is advanced to the next row (+16 bytes) up front, saved
- @ across the call, and restored (still advanced) on return; r0 is
- @ advanced by 2 bytes and outputs are stored with a 16-byte stride
- @ (one destination column per call). Clobbers r2-r12.
- LDRSH r2, [r1],#16 @ r2 = x[0]
- STMFD r13!,{r1,r14}
- LDRSH r6, [r1, #-8] @ r6 = x[4]
- LDR r12,OC_C4S4 @ r12= C4S4
- LDRSH r4, [r1, #-12] @ r4 = x[2]
- ADD r2, r2, r6 @ r2 = x[0] + x[4]
- SUB r6, r2, r6, LSL #1 @ r6 = x[0] - x[4]
- @ For spec compliance, these sums must be truncated to 16-bit precision
- @ _before_ the multiply (not after).
- @ Sadly, ARMv4 provides no simple way to do that.
- MOV r2, r2, LSL #16
- MOV r6, r6, LSL #16
- MOV r2, r2, ASR #16
- MOV r6, r6, ASR #16
- MUL r2, r12,r2 @ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
- LDRSH r8, [r1, #-4] @ r8 = x[6]
- LDR r7, OC_C6S2 @ r7 = OC_C6S2
- MUL r6, r12,r6 @ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
- LDR r14,OC_C2S6 @ r14= OC_C2S6
- MUL r3, r4, r7 @ r3 = OC_C6S2*x[2]
- LDR r5, OC_C7S1 @ r5 = OC_C7S1
- MUL r4, r14,r4 @ r4 = OC_C2S6*x[2]
- MOV r3, r3, ASR #16 @ r3 = OC_C6S2*x[2]>>16
- MUL r14,r8, r14 @ r14= OC_C2S6*x[6]
- MOV r4, r4, ASR #16 @ r4 = OC_C2S6*x[2]>>16
- MUL r8, r7, r8 @ r8 = OC_C6S2*x[6]
- LDR r7, OC_C1S7 @ r7 = OC_C1S7
- SUB r3, r3, r14,ASR #16 @ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
- LDRSH r14,[r1, #-14] @ r14= x[1]
- ADD r4, r4, r8, ASR #16 @ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
- LDRSH r8, [r1, #-2] @ r8 = x[7]
- MUL r9, r5, r14 @ r9 = OC_C7S1*x[1]
- LDRSH r10,[r1, #-6] @ r10= x[5]
- MUL r14,r7, r14 @ r14= OC_C1S7*x[1]
- MOV r9, r9, ASR #16 @ r9 = OC_C7S1*x[1]>>16
- MUL r7, r8, r7 @ r7 = OC_C1S7*x[7]
- MOV r14,r14,ASR #16 @ r14= OC_C1S7*x[1]>>16
- MUL r8, r5, r8 @ r8 = OC_C7S1*x[7]
- LDRSH r1, [r1, #-10] @ r1 = x[3]
- LDR r5, OC_C3S5 @ r5 = OC_C3S5
- LDR r11,OC_C5S3 @ r11= OC_C5S3
- ADD r8, r14,r8, ASR #16 @ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
- MUL r14,r5, r10 @ r14= OC_C3S5*x[5]
- SUB r9, r9, r7, ASR #16 @ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
- MUL r10,r11,r10 @ r10= OC_C5S3*x[5]
- MOV r14,r14,ASR #16 @ r14= OC_C3S5*x[5]>>16
- MUL r11,r1, r11 @ r11= OC_C5S3*x[3]
- MOV r10,r10,ASR #16 @ r10= OC_C5S3*x[5]>>16
- MUL r1, r5, r1 @ r1 = OC_C3S5*x[3]
- SUB r14,r14,r11,ASR #16 @r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
- ADD r10,r10,r1, ASR #16 @r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
- @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
- @ r10=t[6] r12=C4S4 r14=t[5]
- @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
- @ before multiplying, not after (this is not equivalent)
- @ Stage 2
- @ 4-5 butterfly
- ADD r9, r9, r14 @ r9 = t2[4] = t[4]+t[5]
- SUB r14,r9, r14, LSL #1 @ r14= t[4]-t[5]
- MUL r14,r12,r14 @ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
- @ 7-6 butterfly
- ADD r8, r8, r10 @ r8 = t2[7] = t[7]+t[6]
- SUB r10,r8, r10, LSL #1 @ r10= t[7]-t[6]
- MUL r10,r12,r10 @ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
- @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
- @ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
- @ Stage 3
- @ 0-3 butterfly
- ADD r2, r4, r2, ASR #16 @ r2 = t2[0] = t[0] + t[3]
- SUB r4, r2, r4, LSL #1 @ r4 = t2[3] = t[0] - t[3]
- @ 1-2 butterfly
- ADD r6, r3, r6, ASR #16 @ r6 = t2[1] = t[1] + t[2]
- SUB r3, r6, r3, LSL #1 @ r3 = t2[2] = t[1] - t[2]
- @ 6-5 butterfly
- MOV r14,r14,ASR #16 @ r14= t2[5]
- ADD r10,r14,r10,ASR #16 @ r10= t3[6] = t[6] + t[5]
- SUB r14,r10,r14,LSL #1 @ r14= t3[5] = t[6] - t[5]
- @ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
- @ r10=t3[6] r14=t3[5]
- @ Stage 4
- ADD r2, r2, r8 @ r2 = t[0] + t[7]
- ADD r6, r6, r10 @ r6 = t[1] + t[6]
- ADD r3, r3, r14 @ r3 = t[2] + t[5]
- ADD r4, r4, r9 @ r4 = t[3] + t[4]
- SUB r8, r2, r8, LSL #1 @ r8 = t[0] - t[7]
- SUB r10,r6, r10,LSL #1 @ r10= t[1] - t[6]
- SUB r14,r3, r14,LSL #1 @ r14= t[2] - t[5]
- SUB r9, r4, r9, LSL #1 @ r9 = t[3] - t[4]
- STRH r2, [r0], #2 @ y[0] = t[0]+t[7]
- STRH r6, [r0, #14] @ y[1] = t[1]+t[6]
- STRH r3, [r0, #30] @ y[2] = t[2]+t[5]
- STRH r4, [r0, #46] @ y[3] = t[3]+t[4]
- STRH r9, [r0, #62] @ y[4] = t[3]-t[4]
- STRH r14,[r0, #78] @ y[5] = t[2]-t[5]
- STRH r10,[r0, #94] @ y[6] = t[1]-t[6]
- STRH r8, [r0, #110] @ y[7] = t[0]-t[7]
- LDMFD r13!,{r1,PC}
- @ .size idct8core_arm, .-idct8core_arm @ ENDP
- @ .type idct8core_down_arm, %function; idct8core_down_arm: @ PROC
- idct8core_down_arm:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column-pass variant of idct8core_arm: identical transform, but folds
- @ in the rounding bias (+8, added to t[0] and t[1] while still in
- @ <<16 form so it propagates to every output) and applies the final
- @ arithmetic shift right by 4 before storing. Clobbers r2-r12.
- LDRSH r2, [r1],#16 @ r2 = x[0]
- STMFD r13!,{r1,r14}
- LDRSH r6, [r1, #-8] @ r6 = x[4]
- LDR r12,OC_C4S4 @ r12= C4S4
- LDRSH r4, [r1, #-12] @ r4 = x[2]
- ADD r2, r2, r6 @ r2 = x[0] + x[4]
- SUB r6, r2, r6, LSL #1 @ r6 = x[0] - x[4]
- @ For spec compliance, these sums must be truncated to 16-bit precision
- @ _before_ the multiply (not after).
- @ Sadly, ARMv4 provides no simple way to do that.
- MOV r2, r2, LSL #16
- MOV r6, r6, LSL #16
- MOV r2, r2, ASR #16
- MOV r6, r6, ASR #16
- MUL r2, r12,r2 @ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
- LDRSH r8, [r1, #-4] @ r8 = x[6]
- LDR r7, OC_C6S2 @ r7 = OC_C6S2
- MUL r6, r12,r6 @ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
- LDR r14,OC_C2S6 @ r14= OC_C2S6
- MUL r3, r4, r7 @ r3 = OC_C6S2*x[2]
- LDR r5, OC_C7S1 @ r5 = OC_C7S1
- MUL r4, r14,r4 @ r4 = OC_C2S6*x[2]
- MOV r3, r3, ASR #16 @ r3 = OC_C6S2*x[2]>>16
- MUL r14,r8, r14 @ r14= OC_C2S6*x[6]
- MOV r4, r4, ASR #16 @ r4 = OC_C2S6*x[2]>>16
- MUL r8, r7, r8 @ r8 = OC_C6S2*x[6]
- LDR r7, OC_C1S7 @ r7 = OC_C1S7
- SUB r3, r3, r14,ASR #16 @ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
- LDRSH r14,[r1, #-14] @ r14= x[1]
- ADD r4, r4, r8, ASR #16 @ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
- LDRSH r8, [r1, #-2] @ r8 = x[7]
- MUL r9, r5, r14 @ r9 = OC_C7S1*x[1]
- LDRSH r10,[r1, #-6] @ r10= x[5]
- MUL r14,r7, r14 @ r14= OC_C1S7*x[1]
- MOV r9, r9, ASR #16 @ r9 = OC_C7S1*x[1]>>16
- MUL r7, r8, r7 @ r7 = OC_C1S7*x[7]
- MOV r14,r14,ASR #16 @ r14= OC_C1S7*x[1]>>16
- MUL r8, r5, r8 @ r8 = OC_C7S1*x[7]
- LDRSH r1, [r1, #-10] @ r1 = x[3]
- LDR r5, OC_C3S5 @ r5 = OC_C3S5
- LDR r11,OC_C5S3 @ r11= OC_C5S3
- ADD r8, r14,r8, ASR #16 @ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
- MUL r14,r5, r10 @ r14= OC_C3S5*x[5]
- SUB r9, r9, r7, ASR #16 @ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
- MUL r10,r11,r10 @ r10= OC_C5S3*x[5]
- MOV r14,r14,ASR #16 @ r14= OC_C3S5*x[5]>>16
- MUL r11,r1, r11 @ r11= OC_C5S3*x[3]
- MOV r10,r10,ASR #16 @ r10= OC_C5S3*x[5]>>16
- MUL r1, r5, r1 @ r1 = OC_C3S5*x[3]
- SUB r14,r14,r11,ASR #16 @r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
- ADD r10,r10,r1, ASR #16 @r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
- @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
- @ r10=t[6] r12=C4S4 r14=t[5]
- @ Stage 2
- @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
- @ before multiplying, not after (this is not equivalent)
- @ 4-5 butterfly
- ADD r9, r9, r14 @ r9 = t2[4] = t[4]+t[5]
- SUB r14,r9, r14, LSL #1 @ r14= t[4]-t[5]
- MUL r14,r12,r14 @ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
- @ 7-6 butterfly
- ADD r8, r8, r10 @ r8 = t2[7] = t[7]+t[6]
- SUB r10,r8, r10, LSL #1 @ r10= t[7]-t[6]
- MUL r10,r12,r10 @ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
- @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
- @ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
- @ Stage 3
- ADD r2, r2, #8<<16 @ r2 = t[0]+8<<16
- ADD r6, r6, #8<<16 @ r6 = t[1]+8<<16
- @ 0-3 butterfly
- ADD r2, r4, r2, ASR #16 @ r2 = t2[0] = t[0] + t[3] + 8
- SUB r4, r2, r4, LSL #1 @ r4 = t2[3] = t[0] - t[3] + 8
- @ 1-2 butterfly
- ADD r6, r3, r6, ASR #16 @ r6 = t2[1] = t[1] + t[2] + 8
- SUB r3, r6, r3, LSL #1 @ r3 = t2[2] = t[1] - t[2] + 8
- @ 6-5 butterfly
- MOV r14,r14,ASR #16 @ r14= t2[5]
- ADD r10,r14,r10,ASR #16 @ r10= t3[6] = t[6] + t[5]
- SUB r14,r10,r14,LSL #1 @ r14= t3[5] = t[6] - t[5]
- @ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
- @ r10=t3[6] r14=t3[5]
- @ Stage 4
- ADD r2, r2, r8 @ r2 = t[0] + t[7] + 8
- ADD r6, r6, r10 @ r6 = t[1] + t[6] + 8
- ADD r3, r3, r14 @ r3 = t[2] + t[5] + 8
- ADD r4, r4, r9 @ r4 = t[3] + t[4] + 8
- SUB r8, r2, r8, LSL #1 @ r8 = t[0] - t[7] + 8
- SUB r10,r6, r10,LSL #1 @ r10= t[1] - t[6] + 8
- SUB r14,r3, r14,LSL #1 @ r14= t[2] - t[5] + 8
- SUB r9, r4, r9, LSL #1 @ r9 = t[3] - t[4] + 8
- @ TODO: This is wrong.
- @ The C code truncates to 16 bits by storing to RAM and doing the
- @ shifts later; we've got an extra 4 bits here.
- MOV r2, r2, ASR #4
- MOV r6, r6, ASR #4
- MOV r3, r3, ASR #4
- MOV r4, r4, ASR #4
- MOV r8, r8, ASR #4
- MOV r10,r10,ASR #4
- MOV r14,r14,ASR #4
- MOV r9, r9, ASR #4
- STRH r2, [r0], #2 @ y[0] = t[0]+t[7]
- STRH r6, [r0, #14] @ y[1] = t[1]+t[6]
- STRH r3, [r0, #30] @ y[2] = t[2]+t[5]
- STRH r4, [r0, #46] @ y[3] = t[3]+t[4]
- STRH r9, [r0, #62] @ y[4] = t[3]-t[4]
- STRH r14,[r0, #78] @ y[5] = t[2]-t[5]
- STRH r10,[r0, #94] @ y[6] = t[1]-t[6]
- STRH r8, [r0, #110] @ y[7] = t[0]-t[7]
- LDMFD r13!,{r1,PC}
- @ .size idct8core_down_arm, .-idct8core_down_arm @ ENDP
- .if OC_ARM_ASM_MEDIA
- .global _oc_idct8x8_1_v6
- .global _oc_idct8x8_v6
- @ .type oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
- _oc_idct8x8_1_v6:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_uint16_t _dc
- @ Fill the whole 8x8 (64-entry) output block with the DC value: pack
- @ <_dc|_dc> into two registers and store 16 doublewords (16*8=128 bytes).
- 	ORR	r2, r1, r1, LSL #16
- 	ORR	r3, r1, r1, LSL #16
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	STRD	r2, r3, [r0], #8
- 	MOV	PC, r14
- @ .size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6 @ ENDP
- @ .type oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
- _oc_idct8x8_v6:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_int16_t *_x
- @ r2 = int _last_zzi
- @ Dispatch on the number of potentially non-zero (zig-zag) coefficients:
- @ small counts take the cheaper specialized transforms below.
- 	CMP	r2, #3
- 	BLE	oc_idct8x8_3_v6
- 	@CMP	r2, #6
- 	@BLE	oc_idct8x8_6_v6
- 	CMP	r2, #10
- 	BLE	oc_idct8x8_10_v6
- oc_idct8x8_slow_v6:
- 	STMFD	r13!,{r4-r11,r14}
- 	SUB	r13,r13,#64*2	@ 64 int16 temp buffer on the stack.
- @ Row transforms
- 	STR	r0, [r13,#-4]!	@ Save the final destination pointer.
- 	ADD	r0, r13, #4	@ Write to temp storage.
- 	BL	idct8_8core_v6	@ Each call handles two rows.
- 	BL	idct8_8core_v6
- 	BL	idct8_8core_v6
- 	BL	idct8_8core_v6
- 	LDR	r0, [r13], #4	@ Write to the final destination.
- 	@ Clear input data for next block.
- 	MOV	r4, #0
- 	MOV	r5, #0
- 	STRD	r4, r5, [r1,#-8*16]!	@ Rewind r1 past the 8 rows consumed.
- 	STRD	r4, r5, [r1,#8]
- 	STRD	r4, r5, [r1,#16]
- 	STRD	r4, r5, [r1,#24]
- 	STRD	r4, r5, [r1,#32]
- 	STRD	r4, r5, [r1,#40]
- 	STRD	r4, r5, [r1,#48]
- 	STRD	r4, r5, [r1,#56]
- 	STRD	r4, r5, [r1,#64]
- 	STRD	r4, r5, [r1,#72]
- 	STRD	r4, r5, [r1,#80]
- 	STRD	r4, r5, [r1,#88]
- 	STRD	r4, r5, [r1,#96]
- 	STRD	r4, r5, [r1,#104]
- 	STRD	r4, r5, [r1,#112]
- 	STRD	r4, r5, [r1,#120]
- 	MOV	r1, r13	@ And read from temp storage.
- @ Column transforms
- 	BL	idct8_8core_down_v6	@ Each call handles two columns.
- 	BL	idct8_8core_down_v6
- 	BL	idct8_8core_down_v6
- 	BL	idct8_8core_down_v6
- 	ADD	r13,r13,#64*2
- 	LDMFD	r13!,{r4-r11,PC}
- @ .size oc_idct8x8_v6, .-oc_idct8x8_v6 @ ENDP
- @ .type oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
- oc_idct8x8_10_v6:
- @ iDCT specialized for <= 10 non-zero coefficients (upper-left 4x4 region
- @ of the zig-zag scan, minus the corners only rows 0-1 can reach).
- 	STMFD	r13!,{r4-r11,r14}
- 	SUB	r13,r13,#64*2+4
- @ Row transforms
- 	MOV	r2, r13
- 	STR	r0, [r13,#-4]!	@ Save the final destination pointer.
- 	AND	r0, r2, #4	@ Align the stack.
- 	ADD	r0, r0, r2	@ Write to temp storage.
- 	BL	idct4_3core_v6	@ Rows 0-1: up to 4 coefficients each.
- 	BL	idct2_1core_v6	@ Rows 2-3: up to 2 and 1 coefficients.
- 	LDR	r0, [r13], #4	@ Write to the final destination.
- 	@ Clear input data for next block.
- 	MOV	r4, #0
- 	MOV	r5, #0
- 	STRD	r4, r5, [r1,#-4*16]!	@ Rewind r1 past the 4 rows consumed.
- 	STRD	r4, r5, [r1,#16]
- 	STR	r4, [r1,#32]
- 	STR	r4, [r1,#48]
- 	AND	r1, r13,#4	@ Align the stack.
- 	ADD	r1, r1, r13	@ And read from temp storage.
- @ Column transforms
- 	BL	idct4_4core_down_v6	@ Each call handles two columns.
- 	BL	idct4_4core_down_v6
- 	BL	idct4_4core_down_v6
- 	BL	idct4_4core_down_v6
- 	ADD	r13,r13,#64*2+4
- 	LDMFD	r13!,{r4-r11,PC}
- @ .size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6 @ ENDP
- @ .type oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
- oc_idct8x8_3_v6:
- @ iDCT specialized for <= 3 non-zero coefficients (only rows 0-1, and at
- @ most 2 coefficients in row 0 and 1 in row 1).
- 	STMFD	r13!,{r4-r8,r14}
- 	SUB	r13,r13,#64*2
- @ Row transforms
- 	MOV	r8, r0	@ Save the final destination pointer.
- 	MOV	r0, r13	@ Write to temp storage.
- 	BL	idct2_1core_v6	@ Rows 0-1: up to 2 and 1 coefficients.
- 	@ Clear input data for next block.
- 	MOV	r4, #0
- 	STR	r4, [r1,#-2*16]!	@ Rewind r1 past the 2 rows consumed.
- 	STR	r4, [r1,#16]
- 	MOV	r1, r13	@ Read from temp storage.
- 	MOV	r0, r8	@ Write to the final destination.
- @ Column transforms
- 	BL	idct2_2core_down_v6	@ Each call handles two columns.
- 	BL	idct2_2core_down_v6
- 	BL	idct2_2core_down_v6
- 	BL	idct2_2core_down_v6
- 	ADD	r13,r13,#64*2
- 	LDMFD	r13!,{r4-r8,PC}
- @ .size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6 @ ENDP
- @ .type idct2_1core_v6, %function; idct2_1core_v6: @ PROC
- idct2_1core_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Row transform for two rows: row 0 has up to 2 coefficients (x[0,0],
- @ x[0,1]) and row 1 only a DC (x[1,0]).  Results are written with a
- @ stride of 16 bytes (one output row) per y index.
- @ Stage 1:
- 	LDR	r2, [r1], #16	@ r2 = <x[0,1]|x[0,0]>
- 	LDR	r3, OC_C4S4
- 	LDRSH	r6, [r1], #16	@ r6 = x[1,0]
- 	SMULWB	r12,r3, r2	@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
- 	LDRD	r4, r5, OC_C7S1	@ r4 = OC_C7S1; r5 = OC_C1S7
- 	SMULWB	r6, r3, r6	@ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
- 	SMULWT	r4, r4, r2	@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
- 	SMULWT	r7, r5, r2	@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
- @ Stage 2:
- 	SMULWB	r5, r3, r4	@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
- 	PKHBT	r12,r12,r6, LSL #16	@ r12= <t[1,0]|t[0,0]>
- 	SMULWB	r6, r3, r7	@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
- 	PKHBT	r7, r7, r3	@ r7 = <0|t[0,7]> (top half of OC_C4S4 is 0)
- @ Stage 3:
- 	PKHBT	r5, r6, r5, LSL #16	@ r5 = <t[0,5]|t[0,6]>
- 	PKHBT	r4, r4, r3	@ r4 = <0|t[0,4]>
- 	SADDSUBX	r5, r5, r5	@ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
- @ Stage 4:
- 	PKHTB	r6, r3, r5, ASR #16	@ r6 = <0|t[0,6]>
- 	PKHBT	r5, r5, r3	@ r5 = <0|t[0,5]>
- 	SADD16	r3, r12,r7	@ r3 = t[0]+t[7]
- 	STR	r3, [r0], #4	@ y[0<<3] = t[0]+t[7]
- 	SADD16	r3, r12,r6	@ r3 = t[0]+t[6]
- 	STR	r3, [r0, #12]	@ y[1<<3] = t[0]+t[6]
- 	SADD16	r3, r12,r5	@ r3 = t[0]+t[5]
- 	STR	r3, [r0, #28]	@ y[2<<3] = t[0]+t[5]
- 	SADD16	r3, r12,r4	@ r3 = t[0]+t[4]
- 	STR	r3, [r0, #44]	@ y[3<<3] = t[0]+t[4]
- 	SSUB16	r4, r12,r4	@ r4 = t[0]-t[4]
- 	STR	r4, [r0, #60]	@ y[4<<3] = t[0]-t[4]
- 	SSUB16	r5, r12,r5	@ r5 = t[0]-t[5]
- 	STR	r5, [r0, #76]	@ y[5<<3] = t[0]-t[5]
- 	SSUB16	r6, r12,r6	@ r6 = t[0]-t[6]
- 	STR	r6, [r0, #92]	@ y[6<<3] = t[0]-t[6]
- 	SSUB16	r7, r12,r7	@ r7 = t[0]-t[7]
- 	STR	r7, [r0, #108]	@ y[7<<3] = t[0]-t[7]
- 	MOV	PC,r14
- @ .size idct2_1core_v6, .-idct2_1core_v6 @ ENDP
- .endif
- @ iDCT cosine constants: OC_CkSj = round(65536*cos(k*pi/16))
- @ (= round(65536*sin(j*pi/16))).  8-byte aligned so pairs can be
- @ fetched with a single LDRD.
- 	.balign 8
- OC_C7S1:
- 	.word	12785 @ 31F1
- OC_C1S7:
- 	.word	64277 @ FB15
- OC_C6S2:
- 	.word	25080 @ 61F8
- OC_C2S6:
- 	.word	60547 @ EC83
- OC_C5S3:
- 	.word	36410 @ 8E3A
- OC_C3S5:
- 	.word	54491 @ D4DB
- OC_C4S4:
- 	.word	46341 @ B505
- .if OC_ARM_ASM_MEDIA
- @ .type idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
- idct2_2core_down_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column transform for two columns with only 2 coefficient rows, with the
- @ final (x+8)>>4 descaling folded in.  Two columns are processed at once,
- @ packed into halfword pairs.
- @ Stage 1:
- 	LDR	r2, [r1], #16	@ r2 = <x[0,1]|x[0,0]>
- 	LDR	r3, OC_C4S4
- 	MOV	r7 ,#8	@ r7 = 8 (rounding bias for the >>4)
- 	LDR	r6, [r1], #16	@ r6 = <x[1,1]|x[1,0]>
- 	SMLAWB	r12,r3, r2, r7	@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
- 	LDRD	r4, r5, OC_C7S1	@ r4 = OC_C7S1; r5 = OC_C1S7
- 	SMLAWB	r7, r3, r6, r7	@ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
- 	SMULWT	r5, r5, r2	@ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
- 	PKHBT	r12,r12,r7, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
- 	SMULWT	r4, r4, r2	@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
- @ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
- 	PKHBT	r7, r5, r5, LSL #16	@ r7 = <t[0,7]|t[0,7]>
- @ Stage 2:
- 	SMULWB	r6, r3, r7	@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
- 	PKHBT	r4, r4, r4, LSL #16	@ r4 = <t[0,4]|t[0,4]>
- 	SMULWT	r2, r3, r7	@ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
- 	SMULWB	r5, r3, r4	@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
- 	PKHBT	r6, r6, r2, LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWT	r2, r3, r4	@ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
- 	PKHBT	r2, r5, r2, LSL #16	@ r2 = <t[1,5]|t[0,5]>
- @ Stage 3:
- 	SSUB16	r5, r6, r2	@ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
- 	SADD16	r6, r6, r2	@ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
- @ Stage 4:
- 	SADD16	r2, r12,r7	@ r2 = t[0]+t[7]+8
- 	MOV	r3, r2, ASR #4
- 	MOV	r2, r2, LSL #16
- 	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[7]+8>>4
- 	STR	r3, [r0], #4	@ y[0<<3] = t[0]+t[7]+8>>4
- 	SADD16	r2, r12,r6	@ r2 = t[0]+t[6]+8
- 	MOV	r3, r2, ASR #4
- 	MOV	r2, r2, LSL #16
- 	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[6]+8>>4
- 	STR	r3, [r0, #12]	@ y[1<<3] = t[0]+t[6]+8>>4
- 	SADD16	r2, r12,r5	@ r2 = t[0]+t[5]+8
- 	MOV	r3, r2, ASR #4
- 	MOV	r2, r2, LSL #16
- 	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[5]+8>>4
- 	STR	r3, [r0, #28]	@ y[2<<3] = t[0]+t[5]+8>>4
- 	SADD16	r2, r12,r4	@ r2 = t[0]+t[4]+8
- 	MOV	r3, r2, ASR #4
- 	MOV	r2, r2, LSL #16
- 	PKHTB	r3, r3, r2, ASR #20	@ r3 = t[0]+t[4]+8>>4
- 	STR	r3, [r0, #44]	@ y[3<<3] = t[0]+t[4]+8>>4
- 	SSUB16	r4, r12,r4	@ r4 = t[0]-t[4]+8
- 	MOV	r3, r4, ASR #4
- 	MOV	r4, r4, LSL #16
- 	PKHTB	r3, r3, r4, ASR #20	@ r3 = t[0]-t[4]+8>>4
- 	STR	r3, [r0, #60]	@ y[4<<3] = t[0]-t[4]+8>>4
- 	SSUB16	r5, r12,r5	@ r5 = t[0]-t[5]+8
- 	MOV	r3, r5, ASR #4
- 	MOV	r5, r5, LSL #16
- 	PKHTB	r3, r3, r5, ASR #20	@ r3 = t[0]-t[5]+8>>4
- 	STR	r3, [r0, #76]	@ y[5<<3] = t[0]-t[5]+8>>4
- 	SSUB16	r6, r12,r6	@ r6 = t[0]-t[6]+8
- 	MOV	r3, r6, ASR #4
- 	MOV	r6, r6, LSL #16
- 	PKHTB	r3, r3, r6, ASR #20	@ r3 = t[0]-t[6]+8>>4
- 	STR	r3, [r0, #92]	@ y[6<<3] = t[0]-t[6]+8>>4
- 	SSUB16	r7, r12,r7	@ r7 = t[0]-t[7]+8
- 	MOV	r3, r7, ASR #4
- 	MOV	r7, r7, LSL #16
- 	PKHTB	r3, r3, r7, ASR #20	@ r3 = t[0]-t[7]+8>>4
- 	STR	r3, [r0, #108]	@ y[7<<3] = t[0]-t[7]+8>>4
- 	MOV	PC,r14
- @ .size idct2_2core_down_v6, .-idct2_2core_down_v6 @ ENDP
- @ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
- @ pay for increased branch mis-prediction to get here, but in practice it
- @ doesn't seem to slow anything down to take it out, and it's less code this
- @ way.
- .if 0
- @ .type oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
- @ Label fixed: it was defined as _oc_idct8x8_6_v6, but the (disabled)
- @ call site `BLE oc_idct8x8_6_v6` and every other internal label in this
- @ file use no underscore prefix.
- oc_idct8x8_6_v6:
- @ iDCT specialized for <= 6 non-zero coefficients (currently compiled
- @ out; see the note above the enclosing .if 0).
- 	STMFD	r13!,{r4-r8,r10,r11,r14}
- 	SUB	r13,r13,#64*2+4
- @ Row transforms
- 	MOV	r8, r0	@ Save the final destination pointer.
- 	AND	r0, r13,#4	@ Align the stack.
- 	ADD	r0, r0, r13	@ Write to temp storage.
- 	BL	idct3_2core_v6	@ Rows 0-1: up to 3 and 2 coefficients.
- 	BL	idct1core_v6	@ Row 2: DC only.
- 	@ Clear input data for next block.
- 	MOV	r4, #0
- 	MOV	r5, #0
- 	STRD	r4, r5, [r1,#-3*16]!	@ Rewind r1 past the 3 rows consumed.
- 	STR	r4, [r1,#16]
- 	STR	r4, [r1,#32]
- 	AND	r1, r13,#4	@ Align the stack.
- 	MOV	r0, r8	@ Write to the final destination.
- 	ADD	r1, r1, r13	@ And read from temp storage.
- @ Column transforms
- 	BL	idct3_3core_down_v6	@ Each call handles two columns.
- 	BL	idct3_3core_down_v6
- 	BL	idct3_3core_down_v6
- 	BL	idct3_3core_down_v6
- 	ADD	r13,r13,#64*2+4
- 	LDMFD	r13!,{r4-r8,r10,r11,PC}
- @ .size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6 @ ENDP
- @ .type idct1core_v6, %function; idct1core_v6: @ PROC
- @ Label fixed: it was defined as _idct1core_v6, but the call site
- @ `BL idct1core_v6` in oc_idct8x8_6_v6 uses no underscore prefix.
- idct1core_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Row transform for a DC-only row: y[i] = OC_C4S4*x[0]>>16 for the even
- @ output positions.
- 	LDRSH	r3, [r1], #16
- 	MOV	r12,#0x05	@ Build OC_C4S4 (0xB505) inline.
- 	ORR	r12,r12,#0xB500
- 	MUL	r3, r12, r3
- 	@ Stall ?
- 	MOV	r3, r3, ASR #16
- 	@ Don't need to actually store the odd lines; they won't be read.
- 	STRH	r3, [r0], #2
- 	STRH	r3, [r0, #30]
- 	STRH	r3, [r0, #62]
- 	STRH	r3, [r0, #94]
- 	MOV	PC,R14
- @ .size idct1core_v6, .-idct1core_v6 @ ENDP
- @ .type idct3_2core_v6, %function; idct3_2core_v6: @ PROC
- @ Label fixed: it was defined as _idct3_2core_v6, but the call site
- @ `BL idct3_2core_v6` in oc_idct8x8_6_v6 uses no underscore prefix.
- idct3_2core_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Row transform for two rows: row 0 has up to 3 coefficients, row 1 up
- @ to 2.  Falls through to the shared stage-3/4 code in idct4_3core_v6.
- @ Stage 1:
- 	LDRD	r4, r5, [r1], #16	@ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
- 	LDRD	r10,r11,OC_C6S2_3_v6	@ r10= OC_C6S2; r11= OC_C2S6
- 	@ Stall
- 	SMULWB	r3, r11,r5	@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
- 	LDR	r11,OC_C4S4
- 	SMULWB	r2, r10,r5	@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
- 	LDR	r5, [r1], #16	@ r5 = <x[1,1]|x[1,0]>
- 	SMULWB	r12,r11,r4	@ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
- 	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
- 	SMULWB	r10,r11,r5	@ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
- 	PKHBT	r12,r12,r10,LSL #16	@ r12= <t[1,0]|t[0,0]>
- 	SMULWT	r10,r7, r5	@ r10= t[1,7]=OC_C1S7*x[1,1]>>16
- 	PKHBT	r2, r2, r11	@ r2 = <0|t[0,2]>
- 	SMULWT	r7, r7, r4	@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
- 	PKHBT	r3, r3, r11	@ r3 = <0|t[0,3]>
- 	SMULWT	r5, r6, r5	@ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
- 	PKHBT	r7, r7, r10,LSL #16	@ r7 = <t[1,7]|t[0,7]>
- 	SMULWT	r4, r6, r4	@ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
- @ Stage 2:
- 	SMULWB	r6, r11,r7	@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
- 	PKHBT	r4, r4, r5, LSL #16	@ r4 = <t[1,4]|t[0,4]>
- 	SMULWT	r10,r11,r7	@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
- 	SMULWB	r5, r11,r4	@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
- 	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWT	r10,r11,r4	@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
- @ Stage 3:
- 	B	idct4_3core_stage3_v6
- @ .size idct3_2core_v6, .-idct3_2core_v6 @ ENDP
- @ Another copy so the LDRD offsets are less than +/- 255.
- @ iDCT cosine constants (duplicate of the main table above): having a
- @ nearby copy keeps the LDRD literal offsets within +/- 255.
- 	.balign 8
- OC_C7S1_3_v6:
- 	.word	12785 @ 31F1
- OC_C1S7_3_v6:
- 	.word	64277 @ FB15
- OC_C6S2_3_v6:
- 	.word	25080 @ 61F8
- OC_C2S6_3_v6:
- 	.word	60547 @ EC83
- @ .type idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
- @ Label fixed: it was defined as _idct3_3core_down_v6, but the call site
- @ `BL idct3_3core_down_v6` in oc_idct8x8_6_v6 uses no underscore prefix.
- idct3_3core_down_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column transform for two columns with 3 coefficient rows, with the +8
- @ rounding bias folded in.  Falls through to the shared stage-3/4 code in
- @ idct4_4core_down_v6.
- @ Stage 1:
- 	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
- 	LDRD	r6, r7, OC_C6S2_3_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
- 	LDR	r4, [r1], #16	@ r4 = <x[1,1]|x[1,0]>
- 	SMULWB	r3, r7, r11	@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
- 	MOV	r7,#8
- 	SMULWB	r2, r6, r11	@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
- 	LDR	r11,OC_C4S4
- 	SMLAWB	r12,r11,r10,r7	@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
- @ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
- 	PKHBT	r3, r3, r3, LSL #16	@ r3 = <t[0,3]|t[0,3]>
- 	SMLAWB	r5, r11,r4, r7	@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
- 	PKHBT	r2, r2, r2, LSL #16	@ r2 = <t[0,2]|t[0,2]>
- 	LDRD	r6, r7, OC_C7S1_3_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
- 	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
- 	SMULWT	r5, r7, r4	@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
- 	SMULWT	r7, r7, r10	@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
- 	SMULWT	r10,r6, r10	@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
- 	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
- 	SMULWT	r4, r6, r4	@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
- @ Stage 2:
- 	SMULWB	r6, r11,r7	@ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
- 	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
- 	SMULWT	r10,r11,r7	@ r10= t[1,6]=OC_C4S4*t[1,7]>>16
- 	SMULWB	r5, r11,r4	@ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
- 	PKHBT	r6, r6, r10,LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWT	r10,r11,r4	@ r10= t[1,5]=OC_C4S4*t[1,4]>>16
- @ Stage 3:
- 	B	idct4_4core_down_stage3_v6
- @ .size idct3_3core_down_v6, .-idct3_3core_down_v6 @ ENDP
- .endif
- @ .type idct4_3core_v6, %function; idct4_3core_v6: @ PROC
- idct4_3core_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Row transform for two rows: row 0 has up to 4 coefficients, row 1 up
- @ to 3.  Also provides the stage-3/4 tail shared with idct3_2core_v6 and
- @ idct8_8core_v6.
- @ Stage 1:
- 	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
- 	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
- 	LDRD	r4, r5, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
- 	SMULWT	r9, r3, r11	@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
- 	SMULWT	r8, r2, r11	@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
- 	PKHBT	r9, r9, r2	@ r9 = <0|t[0,6]>
- 	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
- 	PKHBT	r8, r8, r2	@ r8 = <0|-t[0,5]>
- 	SMULWB	r3, r7, r11	@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
- 	SMULWB	r2, r6, r11	@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
- 	LDR	r11,OC_C4S4
- 	SMULWB	r12,r7, r5	@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
- 	SMULWB	r5, r6, r5	@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
- 	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
- 	SMULWB	r12,r11,r10	@ r12= t[0,0]=OC_C4S4*x[0,0]>>16
- 	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
- 	SMULWB	r5, r11,r4	@ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
- 	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
- 	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]|t[0,0]>
- 	SMULWT	r5, r7, r4	@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
- 	SMULWT	r7, r7, r10	@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
- 	SMULWT	r10,r6, r10	@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
- 	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
- 	SMULWT	r4, r6, r4	@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
- @ Stage 2:
- 	SSUB16	r6, r7, r9	@ r6 = t[7]-t[6]
- 	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
- 	SADD16	r7, r7, r9	@ r7 = t[7]=t[7]+t[6]
- 	SMULWT	r9, r11,r6	@ r9 = t[1,6]=OC_C4S4*r6T>>16
- 	SADD16	r5, r4, r8	@ r5 = t[4]-t[5] (r8 holds -t[5])
- 	SMULWB	r6, r11,r6	@ r6 = t[0,6]=OC_C4S4*r6B>>16
- 	SSUB16	r4, r4, r8	@ r4 = t[4]=t[4]+t[5]
- 	SMULWT	r10,r11,r5	@ r10= t[1,5]=OC_C4S4*r5T>>16
- 	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWB	r5, r11,r5	@ r5 = t[0,5]=OC_C4S4*r5B>>16
- @ Stage 3:
- idct4_3core_stage3_v6:
- 	SADD16	r11,r12,r2	@ r11= t[1]=t[0]+t[2]
- 	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
- 	SSUB16	r2, r12,r2	@ r2 = t[2]=t[0]-t[2]
- idct4_3core_stage3_5_v6:
- 	SSUB16	r5, r6, r10	@ r5 = t[5]=t[6]-t[5]
- 	SADD16	r6, r6, r10	@ r6 = t[6]=t[6]+t[5]
- 	SADD16	r10,r12,r3	@ r10= t[0]=t[0]+t[3]
- 	SSUB16	r3, r12,r3	@ r3 = t[3]=t[0]-t[3]
- @ Stage 4:
- 	SADD16	r12,r10,r7	@ r12= t[0]+t[7]
- 	STR	r12,[r0], #4	@ y[0<<3] = t[0]+t[7]
- 	SADD16	r12,r11,r6	@ r12= t[1]+t[6]
- 	STR	r12,[r0, #12]	@ y[1<<3] = t[1]+t[6]
- 	SADD16	r12,r2, r5	@ r12= t[2]+t[5]
- 	STR	r12,[r0, #28]	@ y[2<<3] = t[2]+t[5]
- 	SADD16	r12,r3, r4	@ r12= t[3]+t[4]
- 	STR	r12,[r0, #44]	@ y[3<<3] = t[3]+t[4]
- 	SSUB16	r4, r3, r4	@ r4 = t[3]-t[4]
- 	STR	r4, [r0, #60]	@ y[4<<3] = t[3]-t[4]
- 	SSUB16	r5, r2, r5	@ r5 = t[2]-t[5]
- 	STR	r5, [r0, #76]	@ y[5<<3] = t[2]-t[5]
- 	SSUB16	r6, r11,r6	@ r6 = t[1]-t[6]
- 	STR	r6, [r0, #92]	@ y[6<<3] = t[1]-t[6]
- 	SSUB16	r7, r10,r7	@ r7 = t[0]-t[7]
- 	STR	r7, [r0, #108]	@ y[7<<3] = t[0]-t[7]
- 	MOV	PC,r14
- @ .size idct4_3core_v6, .-idct4_3core_v6 @ ENDP
- @ Another copy so the LDRD offsets are less than +/- 255.
- @ iDCT cosine constants (another duplicate table; see note above).
- 	.balign 8
- OC_C7S1_4_v6:
- 	.word	12785 @ 31F1
- OC_C1S7_4_v6:
- 	.word	64277 @ FB15
- OC_C6S2_4_v6:
- 	.word	25080 @ 61F8
- OC_C2S6_4_v6:
- 	.word	60547 @ EC83
- OC_C5S3_4_v6:
- 	.word	36410 @ 8E3A
- OC_C3S5_4_v6:
- 	.word	54491 @ D4DB
- @ .type idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
- idct4_4core_down_v6:
- @ r0 = ogg_int16_t *_y (destination)
- @ r1 = const ogg_int16_t *_x (source)
- @ Column transform for two columns with 4 coefficient rows, with the +8
- @ rounding bias folded in.  Jumps to the shared stage-3/4 tail in
- @ idct8_8core_down_v6, which applies the final >>4.
- @ Stage 1:
- 	LDRD	r10,r11,[r1], #16	@ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
- 	LDRD	r2, r3, OC_C5S3_4_v6	@ r2 = OC_C5S3; r3 = OC_C3S5
- 	LDRD	r4, r5, [r1], #16	@ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
- 	SMULWT	r9, r3, r11	@ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
- 	LDRD	r6, r7, OC_C6S2_4_v6	@ r6 = OC_C6S2; r7 = OC_C2S6
- 	SMULWT	r8, r2, r11	@ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
- @ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
- 	PKHBT	r9, r9, r9, LSL #16	@ r9 = <t[0,6]|t[0,6]>
- 	SMULWB	r3, r7, r11	@ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
- 	PKHBT	r8, r8, r8, LSL #16	@ r8 = <-t[0,5]|-t[0,5]>
- 	SMULWB	r2, r6, r11	@ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
- 	LDR	r11,OC_C4S4
- 	SMULWB	r12,r7, r5	@ r12= t[1,3]=OC_C2S6*x[1,2]>>16
- 	MOV	r7,#8
- 	SMULWB	r5, r6, r5	@ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
- 	PKHBT	r3, r3, r12,LSL #16	@ r3 = <t[1,3]|t[0,3]>
- 	SMLAWB	r12,r11,r10,r7	@ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
- 	PKHBT	r2, r2, r5, LSL #16	@ r2 = <t[1,2]|t[0,2]>
- 	SMLAWB	r5, r11,r4 ,r7	@ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
- 	LDRD	r6, r7, OC_C7S1_4_v6	@ r6 = OC_C7S1; r7 = OC_C1S7
- 	PKHBT	r12,r12,r5, LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
- 	SMULWT	r5, r7, r4	@ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
- 	SMULWT	r7, r7, r10	@ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
- 	SMULWT	r10,r6, r10	@ r10= t[0,4]=OC_C7S1*x[0,1]>>16
- 	PKHBT	r7, r7, r5, LSL #16	@ r7 = <t[1,7]|t[0,7]>
- 	SMULWT	r4, r6, r4	@ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
- @ Stage 2:
- 	SSUB16	r6, r7, r9	@ r6 = t[7]-t[6]
- 	PKHBT	r4, r10,r4, LSL #16	@ r4 = <t[1,4]|t[0,4]>
- 	SADD16	r7, r7, r9	@ r7 = t[7]=t[7]+t[6]
- 	SMULWT	r9, r11,r6	@ r9 = t[1,6]=OC_C4S4*r6T>>16
- 	SADD16	r5, r4, r8	@ r5 = t[4]-t[5] (r8 holds -t[5])
- 	SMULWB	r6, r11,r6	@ r6 = t[0,6]=OC_C4S4*r6B>>16
- 	SSUB16	r4, r4, r8	@ r4 = t[4]=t[4]+t[5]
- 	SMULWT	r10,r11,r5	@ r10= t[1,5]=OC_C4S4*r5T>>16
- 	PKHBT	r6, r6, r9, LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWB	r5, r11,r5	@ r5 = t[0,5]=OC_C4S4*r5B>>16
- @ Stage 3:
- idct4_4core_down_stage3_v6:
- 	SADD16	r11,r12,r2	@ r11= t[1]+8=t[0]+t[2]+8
- 	PKHBT	r10,r5, r10,LSL #16	@ r10= <t[1,5]|t[0,5]>
- 	SSUB16	r2, r12,r2	@ r2 = t[2]+8=t[0]-t[2]+8
- 	B	idct8_8core_down_stage3_5_v6
- @ .size idct4_4core_down_v6, .-idct4_4core_down_v6 @ ENDP
- @ .type idct8_8core_v6, %function; idct8_8core_v6: @ PROC
- idct8_8core_v6:
- @ Full 8-coefficient row transform for two rows at a time, two columns
- @ packed per register (<row1|row0> halfword pairs).  Shares its stage-3/4
- @ tail with idct4_3core_v6.
- 	STMFD	r13!,{r0,r14}
- @ Stage 1:
- @5-6 rotation by 3pi/16
- 	LDRD	r10,r11,OC_C5S3_4_v6	@ r10= OC_C5S3, r11= OC_C3S5
- 	LDR	r4, [r1,#8]	@ r4 = <x[0,5]|x[0,4]>
- 	LDR	r7, [r1,#24]	@ r7 = <x[1,5]|x[1,4]>
- 	SMULWT	r5, r11,r4	@ r5 = OC_C3S5*x[0,5]>>16
- 	LDR	r0, [r1,#4]	@ r0 = <x[0,3]|x[0,2]>
- 	SMULWT	r3, r11,r7	@ r3 = OC_C3S5*x[1,5]>>16
- 	LDR	r12,[r1,#20]	@ r12= <x[1,3]|x[1,2]>
- 	SMULWT	r6, r11,r0	@ r6 = OC_C3S5*x[0,3]>>16
- 	SMULWT	r11,r11,r12	@ r11= OC_C3S5*x[1,3]>>16
- 	SMLAWT	r6, r10,r4, r6	@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
- 	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
- 	SMLAWT	r11,r10,r7, r11	@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
- 	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
- 	SMULWT	r3, r10,r0	@ r3 = OC_C5S3*x[0,3]>>16
- 	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWT	r8, r10,r12	@ r8 = OC_C5S3*x[1,3]>>16
- @2-3 rotation by 6pi/16
- 	LDRD	r10,r11,OC_C6S2_4_v6	@ r10= OC_C6S2, r11= OC_C2S6
- 	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
- 	LDR	r8, [r1,#12]	@ r8 = <x[0,7]|x[0,6]>
- 	SMULWB	r2, r10,r0	@ r2 = OC_C6S2*x[0,2]>>16
- 	SSUB16	r5, r5, r3	@ r5 = <t[1,5]|t[0,5]>
- 	SMULWB	r9, r10,r12	@ r9 = OC_C6S2*x[1,2]>>16
- 	LDR	r7, [r1,#28]	@ r7 = <x[1,7]|x[1,6]>
- 	SMULWB	r3, r10,r8	@ r3 = OC_C6S2*x[0,6]>>16
- 	SMULWB	r10,r10,r7	@ r10= OC_C6S2*x[1,6]>>16
- 	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
- 	SMLAWB	r3, r11,r0, r3	@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
- 	SMLAWB	r10,r11,r12,r10	@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
- 	SMULWB	r9, r11,r8	@ r9 = OC_C2S6*x[0,6]>>16
- 	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
- 	SMULWB	r12,r11,r7	@ r12= OC_C2S6*x[1,6]>>16
- @4-7 rotation by 7pi/16
- 	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
- 	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
- 	LDR	r0, [r1],#16	@ r0 = <x[0,1]|x[0,0]>
- 	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
- 	SSUB16	r2, r2, r9	@ r2 = <t[1,2]|t[0,2]>
- 	SMULWB	r9, r10,r7	@ r9 = OC_C7S1*x[0,7]>>16
- 	LDR	r14,[r1],#16	@ r14= <x[1,1]|x[1,0]>
- 	SMULWT	r12,r10,r7	@ r12= OC_C7S1*x[1,7]>>16
- 	SMULWT	r8, r10,r0	@ r8 = OC_C7S1*x[0,1]>>16
- 	SMULWT	r10,r10,r14	@ r10= OC_C7S1*x[1,1]>>16
- 	SMLAWT	r9, r11,r0, r9	@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
- 	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
- 	SMLAWT	r12,r11,r14,r12	@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
- 	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
- 	SMULWB	r10,r11,r7	@ r10= OC_C1S7*x[0,7]>>16
- 	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
- 	SMULWT	r12,r11,r7	@ r12= OC_C1S7*x[1,7]>>16
- @0-1 butterfly
- 	LDR	r11,OC_C4S4
- 	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
- 	SADD16	r7, r0, r4	@ r7 = x[0]+x[4]
- 	SSUB16	r10,r8, r10	@ r10= <t[1,4]|t[0,4]>
- 	SSUB16	r4, r0, r4	@ r4 = x[0]-x[4]
- 	SMULWB	r8, r11,r7	@ r8 = t[0,0]=OC_C4S4*r7B>>16
- 	SMULWT	r12,r11,r7	@ r12= t[1,0]=OC_C4S4*r7T>>16
- 	SMULWB	r7, r11,r4	@ r7 = t[0,1]=OC_C4S4*r4B>>16
- 	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]|t[0,0]>
- 	SMULWT	r8, r11,r4	@ r8 = t[1,1]=OC_C4S4*r4T>>16
- @ Stage 2:
- 	SADD16	r4, r10,r5	@ r4 = t[4]=t[4]+t[5]
- 	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]|t[0,1]>
- 	SSUB16	r5, r10,r5	@ r5 = t[4]-t[5]
- 	SMULWB	r10,r11,r5	@ r10= t[0,5]=OC_C4S4*r5B>>16
- 	SADD16	r7, r9, r6	@ r7 = t[7]=t[7]+t[6]
- 	SMULWT	r5, r11,r5	@ r5 = t[1,5]=OC_C4S4*r5T>>16
- 	SSUB16	r6, r9, r6	@ r6 = t[7]-t[6]
- 	SMULWB	r9, r11,r6	@ r9 = t[0,6]=OC_C4S4*r6B>>16
- 	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
- 	SMULWT	r6, r11,r6	@ r6 = t[1,6]=OC_C4S4*r6T>>16
- @ Stage 3:
- 	SADD16	r11,r8, r2	@ r11= t[1]=t[1]+t[2]
- 	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SSUB16	r2, r8, r2	@ r2 = t[2]=t[1]-t[2]
- 	LDMFD	r13!,{r0,r14}
- 	B	idct4_3core_stage3_5_v6
- @ .size idct8_8core_v6, .-idct8_8core_v6 @ ENDP
- @ Another copy so the LDRD offsets are less than +/- 255.
- @ iDCT cosine constants (another duplicate table; see note above).
- 	.balign 8
- OC_C7S1_8_v6:
- 	.word	12785 @ 31F1
- OC_C1S7_8_v6:
- 	.word	64277 @ FB15
- OC_C6S2_8_v6:
- 	.word	25080 @ 61F8
- OC_C2S6_8_v6:
- 	.word	60547 @ EC83
- OC_C5S3_8_v6:
- 	.word	36410 @ 8E3A
- OC_C3S5_8_v6:
- 	.word	54491 @ D4DB
- @ .type idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
- idct8_8core_down_v6:
- @ Full 8-coefficient column transform for two columns at a time, with the
- @ +8 rounding bias folded into stage 1 and the final >>4 applied in
- @ stage 4.  Also provides the stage-3/4 tail shared with
- @ idct4_4core_down_v6.
- 	STMFD	r13!,{r0,r14}
- @ Stage 1:
- @5-6 rotation by 3pi/16
- 	LDRD	r10,r11,OC_C5S3_8_v6	@ r10= OC_C5S3, r11= OC_C3S5
- 	LDR	r4, [r1,#8]	@ r4 = <x[0,5]|x[0,4]>
- 	LDR	r7, [r1,#24]	@ r7 = <x[1,5]|x[1,4]>
- 	SMULWT	r5, r11,r4	@ r5 = OC_C3S5*x[0,5]>>16
- 	LDR	r0, [r1,#4]	@ r0 = <x[0,3]|x[0,2]>
- 	SMULWT	r3, r11,r7	@ r3 = OC_C3S5*x[1,5]>>16
- 	LDR	r12,[r1,#20]	@ r12= <x[1,3]|x[1,2]>
- 	SMULWT	r6, r11,r0	@ r6 = OC_C3S5*x[0,3]>>16
- 	SMULWT	r11,r11,r12	@ r11= OC_C3S5*x[1,3]>>16
- 	SMLAWT	r6, r10,r4, r6	@ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
- 	PKHBT	r5, r5, r3, LSL #16	@ r5 = <r3|r5>
- 	SMLAWT	r11,r10,r7, r11	@ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
- 	PKHBT	r4, r4, r7, LSL #16	@ r4 = <x[1,4]|x[0,4]>
- 	SMULWT	r3, r10,r0	@ r3 = OC_C5S3*x[0,3]>>16
- 	PKHBT	r6, r6, r11,LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SMULWT	r8, r10,r12	@ r8 = OC_C5S3*x[1,3]>>16
- @2-3 rotation by 6pi/16
- 	LDRD	r10,r11,OC_C6S2_8_v6	@ r10= OC_C6S2, r11= OC_C2S6
- 	PKHBT	r3, r3, r8, LSL #16	@ r3 = <r8|r3>
- 	LDR	r8, [r1,#12]	@ r8 = <x[0,7]|x[0,6]>
- 	SMULWB	r2, r10,r0	@ r2 = OC_C6S2*x[0,2]>>16
- 	SSUB16	r5, r5, r3	@ r5 = <t[1,5]|t[0,5]>
- 	SMULWB	r9, r10,r12	@ r9 = OC_C6S2*x[1,2]>>16
- 	LDR	r7, [r1,#28]	@ r7 = <x[1,7]|x[1,6]>
- 	SMULWB	r3, r10,r8	@ r3 = OC_C6S2*x[0,6]>>16
- 	SMULWB	r10,r10,r7	@ r10= OC_C6S2*x[1,6]>>16
- 	PKHBT	r2, r2, r9, LSL #16	@ r2 = <r9|r2>
- 	SMLAWB	r3, r11,r0, r3	@ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
- 	SMLAWB	r10,r11,r12,r10	@ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
- 	SMULWB	r9, r11,r8	@ r9 = OC_C2S6*x[0,6]>>16
- 	PKHBT	r3, r3, r10,LSL #16	@ r3 = <t[1,3]|t[0,3]>
- 	SMULWB	r12,r11,r7	@ r12= OC_C2S6*x[1,6]>>16
- @4-7 rotation by 7pi/16
- 	LDRD	r10,r11,OC_C7S1_8_v6	@ r10= OC_C7S1, r11= OC_C1S7
- 	PKHBT	r9, r9, r12,LSL #16	@ r9 = <r12|r9>
- 	LDR	r0, [r1],#16	@ r0 = <x[0,1]|x[0,0]>
- 	PKHTB	r7, r7, r8, ASR #16	@ r7 = <x[1,7]|x[0,7]>
- 	SSUB16	r2, r2, r9	@ r2 = <t[1,2]|t[0,2]>
- 	SMULWB	r9, r10,r7	@ r9 = OC_C7S1*x[0,7]>>16
- 	LDR	r14,[r1],#16	@ r14= <x[1,1]|x[1,0]>
- 	SMULWT	r12,r10,r7	@ r12= OC_C7S1*x[1,7]>>16
- 	SMULWT	r8, r10,r0	@ r8 = OC_C7S1*x[0,1]>>16
- 	SMULWT	r10,r10,r14	@ r10= OC_C7S1*x[1,1]>>16
- 	SMLAWT	r9, r11,r0, r9	@ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
- 	PKHBT	r8, r8, r10,LSL #16	@ r8 = <r10|r8>
- 	SMLAWT	r12,r11,r14,r12	@ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
- 	PKHBT	r0, r0, r14,LSL #16	@ r0 = <x[1,0]|x[0,0]>
- 	SMULWB	r10,r11,r7	@ r10= OC_C1S7*x[0,7]>>16
- 	PKHBT	r9, r9, r12,LSL #16	@ r9 = <t[1,7]|t[0,7]>
- 	SMULWT	r12,r11,r7	@ r12= OC_C1S7*x[1,7]>>16
- @0-1 butterfly
- 	LDR	r11,OC_C4S4
- 	MOV	r14,#8	@ r14= 8 (rounding bias for the >>4)
- 	PKHBT	r10,r10,r12,LSL #16	@ r10= <r12|r10>
- 	SADD16	r7, r0, r4	@ r7 = x[0]+x[4]
- 	SSUB16	r10,r8, r10	@ r10= <t[1,4]|t[0,4]>
- 	SMLAWB	r8, r11,r7, r14	@ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
- 	SSUB16	r4, r0, r4	@ r4 = x[0]-x[4]
- 	SMLAWT	r12,r11,r7, r14	@ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
- 	SMLAWB	r7, r11,r4, r14	@ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
- 	PKHBT	r12,r8, r12,LSL #16	@ r12= <t[1,0]+8|t[0,0]+8>
- 	SMLAWT	r8, r11,r4, r14	@ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
- @ Stage 2:
- 	SADD16	r4, r10,r5	@ r4 = t[4]=t[4]+t[5]
- 	PKHBT	r8, r7, r8, LSL #16	@ r8 = <t[1,1]+8|t[0,1]+8>
- 	SSUB16	r5, r10,r5	@ r5 = t[4]-t[5]
- 	SMULWB	r10,r11,r5	@ r10= t[0,5]=OC_C4S4*r5B>>16
- 	SADD16	r7, r9, r6	@ r7 = t[7]=t[7]+t[6]
- 	SMULWT	r5, r11,r5	@ r5 = t[1,5]=OC_C4S4*r5T>>16
- 	SSUB16	r6, r9, r6	@ r6 = t[7]-t[6]
- 	SMULWB	r9, r11,r6	@ r9 = t[0,6]=OC_C4S4*r6B>>16
- 	PKHBT	r10,r10,r5, LSL #16	@ r10= <t[1,5]|t[0,5]>
- 	SMULWT	r6, r11,r6	@ r6 = t[1,6]=OC_C4S4*r6T>>16
- @ Stage 3:
- 	SADD16	r11,r8, r2	@ r11= t[1]+8=t[1]+t[2]+8
- 	PKHBT	r6, r9, r6, LSL #16	@ r6 = <t[1,6]|t[0,6]>
- 	SSUB16	r2, r8, r2	@ r2 = t[2]+8=t[1]-t[2]+8
- 	LDMFD	r13!,{r0,r14}
- idct8_8core_down_stage3_5_v6:
- 	SSUB16	r5, r6, r10	@ r5 = t[5]=t[6]-t[5]
- 	SADD16	r6, r6, r10	@ r6 = t[6]=t[6]+t[5]
- 	SADD16	r10,r12,r3	@ r10= t[0]+8=t[0]+t[3]+8
- 	SSUB16	r3, r12,r3	@ r3 = t[3]+8=t[0]-t[3]+8
- @ Stage 4:
- 	SADD16	r12,r10,r7	@ r12= t[0]+t[7]+8
- 	SSUB16	r7, r10,r7	@ r7 = t[0]-t[7]+8
- 	MOV	r10,r12,ASR #4
- 	MOV	r12,r12,LSL #16
- 	PKHTB	r10,r10,r12,ASR #20	@ r10= t[0]+t[7]+8>>4
- 	STR	r10,[r0], #4	@ y[0<<3] = t[0]+t[7]+8>>4
- 	SADD16	r12,r11,r6	@ r12= t[1]+t[6]+8
- 	SSUB16	r6, r11,r6	@ r6 = t[1]-t[6]+8
- 	MOV	r10,r12,ASR #4
- 	MOV	r12,r12,LSL #16
- 	PKHTB	r10,r10,r12,ASR #20	@ r10= t[1]+t[6]+8>>4
- 	STR	r10,[r0, #12]	@ y[1<<3] = t[1]+t[6]+8>>4
- 	SADD16	r12,r2, r5	@ r12= t[2]+t[5]+8
- 	SSUB16	r5, r2, r5	@ r5 = t[2]-t[5]+8
- 	MOV	r10,r12,ASR #4
- 	MOV	r12,r12,LSL #16
- 	PKHTB	r10,r10,r12,ASR #20	@ r10= t[2]+t[5]+8>>4
- 	STR	r10,[r0, #28]	@ y[2<<3] = t[2]+t[5]+8>>4
- 	SADD16	r12,r3, r4	@ r12= t[3]+t[4]+8
- 	SSUB16	r4, r3, r4	@ r4 = t[3]-t[4]+8
- 	MOV	r10,r12,ASR #4
- 	MOV	r12,r12,LSL #16
- 	PKHTB	r10,r10,r12,ASR #20	@ r10= t[3]+t[4]+8>>4
- 	STR	r10,[r0, #44]	@ y[3<<3] = t[3]+t[4]+8>>4
- 	MOV	r10,r4, ASR #4
- 	MOV	r4, r4, LSL #16
- 	PKHTB	r10,r10,r4, ASR #20	@ r10= t[3]-t[4]+8>>4
- 	STR	r10,[r0, #60]	@ y[4<<3] = t[3]-t[4]+8>>4
- 	MOV	r10,r5, ASR #4
- 	MOV	r5, r5, LSL #16
- 	PKHTB	r10,r10,r5, ASR #20	@ r10= t[2]-t[5]+8>>4
- 	STR	r10,[r0, #76]	@ y[5<<3] = t[2]-t[5]+8>>4
- 	MOV	r10,r6, ASR #4
- 	MOV	r6, r6, LSL #16
- 	PKHTB	r10,r10,r6, ASR #20	@ r10= t[1]-t[6]+8>>4
- 	STR	r10,[r0, #92]	@ y[6<<3] = t[1]-t[6]+8>>4
- 	MOV	r10,r7, ASR #4
- 	MOV	r7, r7, LSL #16
- 	PKHTB	r10,r10,r7, ASR #20	@ r10= t[0]-t[7]+8>>4
- 	STR	r10,[r0, #108]	@ y[7<<3] = t[0]-t[7]+8>>4
- 	MOV	PC,r14
- @ .size idct8_8core_down_v6, .-idct8_8core_down_v6 @ ENDP
- .endif
- .if OC_ARM_ASM_NEON
- .global _oc_idct8x8_1_neon
- .global _oc_idct8x8_neon
- .balign 16
- @ NEON constant vector: rounding bias (8) followed by the seven scaled
- @ cosines OC_C1S7..OC_C7S1 (same values as the word tables above).
- OC_IDCT_CONSTS_NEON:
- 	.short	8
- 	.short	64277 @ FB15 (C1S7)
- 	.short	60547 @ EC83 (C2S6)
- 	.short	54491 @ D4DB (C3S5)
- 	.short	46341 @ B505 (C4S4)
- 	.short	36410 @ 8E3A (C5S3)
- 	.short	25080 @ 61F8 (C6S2)
- 	.short	12785 @ 31F1 (C7S1)
- @ .type oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
- _oc_idct8x8_1_neon:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_uint16_t _dc
- @ Fill all 64 output coefficients with the DC value using four 32-byte
- @ vector stores.
- 	VDUP.S16	Q0, r1
- 	VMOV	Q1, Q0
- 	VST1.64	{D0, D1, D2, D3}, [r0,:128]!
- 	VST1.64	{D0, D1, D2, D3}, [r0,:128]!
- 	VST1.64	{D0, D1, D2, D3}, [r0,:128]!
- 	VST1.64	{D0, D1, D2, D3}, [r0,:128]
- 	MOV	PC, r14
- @ .size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon @ ENDP
- @ .type oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
- _oc_idct8x8_neon:
- @ r0 = ogg_int16_t *_y
- @ r1 = ogg_int16_t *_x
- @ r2 = int _last_zzi
- 	CMP	r2, #10
- 	BLE	oc_idct8x8_10_neon
- oc_idct8x8_slow_neon:
- 	VPUSH	{D8-D15}
- 	MOV	r2, r1
- 	ADR	r3, OC_IDCT_CONSTS_NEON
- @ Row transforms (input is pre-transposed)
- 	VLD1.64	{D16,D17,D18,D19}, [r2,:128]!
- 	VLD1.64	{D20,D21,D22,D23}, [r2,:128]!
- 	VLD1.64	{D24,D25,D26,D27}, [r2,:128]!
- 	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
- 	VLD1.64	{D28,D29,D30,D31}, [r2,:128]
- 	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
- 	VLD1.64	{D0,D1},           [r3,:128]
- 	MOV	r12, r14	@ Preserve the LR across both core calls.
- 	BL	oc_idct8x8_stage123_neon
- @ Stage 4
- 	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
- 	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
- 	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
- 	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
- 	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
- 	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
- 	VTRN.16	Q14,Q15
- 	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
- 	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
- @ 8x8 Transpose
- 	VTRN.16	Q8, Q9
- 	VTRN.16	Q10,Q11
- 	VTRN.16	Q12,Q13
- 	VTRN.32	Q8, Q10
- 	VTRN.32	Q9, Q11
- 	VTRN.32	Q12,Q14
- 	VTRN.32	Q13,Q15
- 	VSWP	D17,D24
- 	VSUB.S16	Q1, Q8, Q12	@ Q1 = x[0]-x[4]
- 	VSWP	D19,D26
- 	VADD.S16	Q8, Q8, Q12	@ Q8 = x[0]+x[4]
- 	VSWP	D21,D28
- 	VSWP	D23,D30
- @ Column transforms
- 	BL	oc_idct8x8_stage123_neon
- @ We have to put the return address back in the LR, or the branch
- @ predictor will not recognize the function return and mis-predict the
- @ entire call stack.
- 	MOV	r14, r12
- @ Stage 4
- 	VSUB.S16	Q15,Q8, Q7	@ Q15 = y[7]=t[0]-t[7]
- 	VADD.S16	Q8, Q8, Q7	@ Q8  = y[0]=t[0]+t[7]
- 	VSUB.S16	Q14,Q9, Q3	@ Q14 = y[6]=t[1]-t[6]
- 	VADD.S16	Q9, Q9, Q3	@ Q9  = y[1]=t[1]+t[6]
- 	VSUB.S16	Q13,Q10,Q5	@ Q13 = y[5]=t[2]-t[5]
- 	VADD.S16	Q10,Q10,Q5	@ Q10 = y[2]=t[2]+t[5]
- 	VSUB.S16	Q12,Q11,Q4	@ Q12 = y[4]=t[3]-t[4]
- 	VADD.S16	Q11,Q11,Q4	@ Q11 = y[3]=t[3]+t[4]
- 	VMOV.I8	Q2,#0
- 	VPOP	{D8-D15}
- 	VMOV.I8	Q3,#0
- @ Clear input data for next block while descaling with rounding (+8>>4).
- 	VRSHR.S16	Q8, Q8, #4	@ Q8  = y[0]+8>>4
- 	VST1.64	{D4, D5, D6, D7}, [r1,:128]!
- 	VRSHR.S16	Q9, Q9, #4	@ Q9  = y[1]+8>>4
- 	VRSHR.S16	Q10,Q10,#4	@ Q10 = y[2]+8>>4
- 	VST1.64	{D4, D5, D6, D7}, [r1,:128]!
- 	VRSHR.S16	Q11,Q11,#4	@ Q11 = y[3]+8>>4
- 	VRSHR.S16	Q12,Q12,#4	@ Q12 = y[4]+8>>4
- 	VST1.64	{D4, D5, D6, D7}, [r1,:128]!
- 	VRSHR.S16	Q13,Q13,#4	@ Q13 = y[5]+8>>4
- 	VRSHR.S16	Q14,Q14,#4	@ Q14 = y[6]+8>>4
- 	VST1.64	{D4, D5, D6, D7}, [r1,:128]
- 	VRSHR.S16	Q15,Q15,#4	@ Q15 = y[7]+8>>4
- 	VSTMIA	r0, {D16-D31}
- 	MOV	PC, r14
- @ .size oc_idct8x8_neon, .-oc_idct8x8_neon @ ENDP
- @ .type oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
- oc_idct8x8_stage123_neon:
- @ Shared stages 1-3 of the 1-D transform, applied to all 8 vectors at once.
- @ On entry: Q8..Q15 hold x[0]..x[7], except that the caller has already
- @ computed Q1 = x[0]-x[4] and Q8 = x[0]+x[4]; D0/D1 hold the
- @ OC_IDCT_CONSTS_NEON table.
- @ On exit: Q8 = t[0], Q9 = t[1], Q10 = t[2], Q11 = t[3],
- @ Q4 = t[4], Q5 = t[5], Q3 = t[6], Q7 = t[7]
- @ (the caller's stage 4 reads exactly these registers).
- @ Constants larger than 32767 wrap to negative 16-bit values, so a
- @ VMULL/VSHRN pair by such a constant yields (OC_CxSy*x>>16)-x; the
- @ missing x is added back where required (see the VADDs below).
- @ Stages 1 & 2
- VMULL.S16 Q4, D18,D1[3]
- VMULL.S16 Q5, D19,D1[3]
- VMULL.S16 Q7, D30,D1[3]
- VMULL.S16 Q6, D31,D1[3]
- VMULL.S16 Q2, D30,D0[1]
- VMULL.S16 Q3, D31,D0[1]
- VSHRN.S32 D8, Q4, #16
- VSHRN.S32 D9, Q5, #16 @ Q4 = (OC_C7S1*x[1]>>16)
- VSHRN.S32 D14,Q7, #16
- VSHRN.S32 D15,Q6, #16 @ Q7 = (OC_C7S1*x[7]>>16)
- VSHRN.S32 D4, Q2, #16
- VSHRN.S32 D5, Q3, #16 @ Q2 = (OC_C1S7*x[7]>>16)-x[7]
- VSUB.S16 Q4, Q4, Q15
- VADD.S16 Q7, Q7, Q9
- VSUB.S16 Q4, Q4, Q2 @ Q4 = t[4]
- VMULL.S16 Q2, D18,D0[1]
- VMULL.S16 Q9, D19,D0[1]
- VMULL.S16 Q5, D26,D0[3]
- VMULL.S16 Q3, D27,D0[3]
- VMULL.S16 Q6, D22,D0[3]
- VMULL.S16 Q12,D23,D0[3]
- VSHRN.S32 D4, Q2, #16
- VSHRN.S32 D5, Q9, #16 @ Q2 = (OC_C1S7*x[1]>>16)-x[1]
- VSHRN.S32 D10,Q5, #16
- VSHRN.S32 D11,Q3, #16 @ Q5 = (OC_C3S5*x[5]>>16)-x[5]
- VSHRN.S32 D12,Q6, #16
- VSHRN.S32 D13,Q12,#16 @ Q6 = (OC_C3S5*x[3]>>16)-x[3]
- VADD.S16 Q7, Q7, Q2 @ Q7 = t[7]
- VSUB.S16 Q5, Q5, Q11
- VADD.S16 Q6, Q6, Q11
- VADD.S16 Q5, Q5, Q13
- VADD.S16 Q6, Q6, Q13
- VMULL.S16 Q9, D22,D1[1]
- VMULL.S16 Q11,D23,D1[1]
- VMULL.S16 Q15,D26,D1[1]
- VMULL.S16 Q13,D27,D1[1]
- VMULL.S16 Q2, D20,D1[2]
- VMULL.S16 Q12,D21,D1[2]
- VSHRN.S32 D18,Q9, #16
- VSHRN.S32 D19,Q11,#16 @ Q9 = (OC_C5S3*x[3]>>16)-x[3]
- VSHRN.S32 D30,Q15,#16
- VSHRN.S32 D31,Q13,#16 @ Q15= (OC_C5S3*x[5]>>16)-x[5]
- VSHRN.S32 D4, Q2, #16
- VSHRN.S32 D5, Q12,#16 @ Q2 = (OC_C6S2*x[2]>>16)
- VSUB.S16 Q5, Q5, Q9 @ Q5 = t[5]
- VADD.S16 Q6, Q6, Q15 @ Q6 = t[6]
- VSUB.S16 Q2, Q2, Q14
- VMULL.S16 Q3, D28,D1[2]
- VMULL.S16 Q11,D29,D1[2]
- VMULL.S16 Q12,D28,D0[2]
- VMULL.S16 Q9, D29,D0[2]
- VMULL.S16 Q13,D20,D0[2]
- VMULL.S16 Q15,D21,D0[2]
- VSHRN.S32 D6, Q3, #16
- VSHRN.S32 D7, Q11,#16 @ Q3 = (OC_C6S2*x[6]>>16)
- VSHRN.S32 D24,Q12,#16
- VSHRN.S32 D25,Q9, #16 @ Q12= (OC_C2S6*x[6]>>16)-x[6]
- VSHRN.S32 D26,Q13,#16
- VSHRN.S32 D27,Q15,#16 @ Q13= (OC_C2S6*x[2]>>16)-x[2]
- VSUB.S16 Q9, Q4, Q5 @ Q9 = t[4]-t[5]
- VSUB.S16 Q11,Q7, Q6 @ Q11= t[7]-t[6]
- VADD.S16 Q3, Q3, Q10
- VADD.S16 Q4, Q4, Q5 @ Q4 = t[4]=t[4]+t[5]
- VADD.S16 Q7, Q7, Q6 @ Q7 = t[7]=t[7]+t[6]
- VSUB.S16 Q2, Q2, Q12 @ Q2 = t[2]
- VADD.S16 Q3, Q3, Q13 @ Q3 = t[3]
- VMULL.S16 Q12,D16,D1[0]
- VMULL.S16 Q13,D17,D1[0]
- VMULL.S16 Q14,D2, D1[0]
- VMULL.S16 Q15,D3, D1[0]
- VMULL.S16 Q5, D18,D1[0]
- VMULL.S16 Q6, D22,D1[0]
- VSHRN.S32 D24,Q12,#16
- VSHRN.S32 D25,Q13,#16
- VSHRN.S32 D28,Q14,#16
- VSHRN.S32 D29,Q15,#16
- VMULL.S16 Q13,D19,D1[0]
- VMULL.S16 Q15,D23,D1[0]
- VADD.S16 Q8, Q8, Q12 @ Q8 = t[0]
- VADD.S16 Q1, Q1, Q14 @ Q1 = t[1]
- VSHRN.S32 D10,Q5, #16
- VSHRN.S32 D12,Q6, #16
- VSHRN.S32 D11,Q13,#16
- VSHRN.S32 D13,Q15,#16
- VADD.S16 Q5, Q5, Q9 @ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
- VADD.S16 Q6, Q6, Q11 @ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
- @ Stage 3
- VSUB.S16 Q11,Q8, Q3 @ Q11 = t[3]=t[0]-t[3]
- VADD.S16 Q8, Q8, Q3 @ Q8 = t[0]=t[0]+t[3]
- VADD.S16 Q9, Q1, Q2 @ Q9 = t[1]=t[1]+t[2]
- VADD.S16 Q3, Q6, Q5 @ Q3 = t[6]=t[6]+t[5]
- VSUB.S16 Q10,Q1, Q2 @ Q10 = t[2]=t[1]-t[2]
- VSUB.S16 Q5, Q6, Q5 @ Q5 = t[5]=t[6]-t[5]
- MOV PC, r14
- @ .size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon @ ENDP
- @ .type oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
- oc_idct8x8_10_neon:
- @ Reduced 8x8 inverse DCT for blocks where only a small prefix of the
- @ coefficients can be nonzero (presumably the first 10 in zig-zag
- @ order -- verify against the dispatching CMP in oc_idct8x8_neon).
- @ Only the 4x4 low-frequency corner of _x is loaded; the row pass is
- @ done on 4 columns and the column pass on all 8.
- @ r0 = ogg_int16_t *_y (output, stored via VSTMIA at the end)
- @ r1 = ogg_int16_t *_x (input; the loaded rows are zeroed on the way out)
- ADR r3, OC_IDCT_CONSTS_NEON
- VLD1.64 {D0,D1}, [r3,:128]
- MOV r2, r1
- @ Row transforms (input is pre-transposed)
- @ Stage 1
- VLD1.64 {D16,D17,D18,D19},[r2,:128]!
- @ r12 = 16 bytes = one input row; reused as the store stride below.
- MOV r12, #16
- VMULL.S16 Q15,D16,D1[0] @ Q15= OC_C4S4*x[0]-(x[0]<<16)
- VLD1.64 {D17}, [r2,:64], r12
- VMULL.S16 Q2, D18,D0[1] @ Q2 = OC_C1S7*x[1]-(x[1]<<16)
- VLD1.64 {D19}, [r2,:64]
- VMULL.S16 Q14,D17,D0[2] @ Q14= OC_C2S6*x[2]-(x[2]<<16)
- VMULL.S16 Q3, D19,D0[3] @ Q3 = OC_C3S5*x[3]-(x[3]<<16)
- VMULL.S16 Q13,D19,D1[1] @ Q13= OC_C5S3*x[3]-(x[3]<<16)
- VMULL.S16 Q12,D18,D1[3] @ Q12= OC_C7S1*x[1]
- VMULL.S16 Q1, D17,D1[2] @ Q1 = OC_C6S2*x[2]
- VSHRN.S32 D30,Q15,#16 @ D30= t[0]-x[0]
- VSHRN.S32 D4, Q2, #16 @ D4 = t[7]-x[1]
- VSHRN.S32 D31,Q14,#16 @ D31= t[3]-x[2]
- VSHRN.S32 D6, Q3, #16 @ D6 = t[6]-x[3]
- VSHRN.S32 D7, Q13,#16 @ D7 = -t[5]-x[3]
- VSHRN.S32 D5, Q12,#16 @ D5 = t[4]
- VSHRN.S32 D2, Q1, #16 @ D2 = t[2]
- VADD.S16 D4, D4, D18 @ D4 = t[7]
- VADD.S16 D6, D6, D19 @ D6 = t[6]
- VADD.S16 D7, D7, D19 @ D7 = -t[5]
- VADD.S16 Q15,Q15,Q8 @ D30= t[0]
- @ D31= t[3]
- @ Stages 2 & 3
- VSUB.S16 Q12,Q2, Q3 @ D24= t[7]-t[6]
- @ D25= t[4]'=t[4]+t[5]
- VADD.S16 Q13,Q2, Q3 @ D26= t[7]=t[7]+t[6]
- @ D27= t[4]-t[5]
- VMULL.S16 Q11,D24,D1[0] @ Q11= OC_C4S4*(t[7]-t[6])
- @ -(t[7]-t[6]<<16)
- VMULL.S16 Q14,D27,D1[0] @ Q14= OC_C4S4*(t[4]-t[5])
- @ -(t[4]-t[5]<<16)
- VADD.S16 D16,D30,D31 @ D16= t[0]=t[0]+t[3]
- VSUB.S16 D17,D30,D2 @ D17= t[2]=t[0]-t[2]
- VADD.S16 D18,D30,D2 @ D18= t[1]=t[0]+t[2]
- VSHRN.S32 D22,Q11,#16 @ D22= (OC_C4S4*(t[7]-t[6])>>16)
- @ -(t[7]-t[6])
- VSHRN.S32 D23,Q14,#16 @ D23= (OC_C4S4*(t[4]-t[5])>>16)
- @ -(t[4]-t[5])
- VSUB.S16 D19,D30,D31 @ D19= t[3]=t[0]-t[3]
- VADD.S16 D22,D22,D24 @ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
- VADD.S16 D23,D23,D27 @ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
- VSUB.S16 D27,D22,D23 @ D27= t[5]=t[6]-t[5]
- VADD.S16 D24,D22,D23 @ D24= t[6]=t[6]+t[5]
- @ Stage 4
- VSUB.S16 Q11,Q8, Q13 @ D22= y[7]=t[0]-t[7]
- @ D23= y[5]=t[2]'-t[5]''
- VSUB.S16 Q10,Q9, Q12 @ D20= y[6]=t[1]-t[6]
- @ D21= y[4]=t[3]'-t[4]''
- VADD.S16 Q8, Q8, Q13 @ D16= y[0]=t[0]+t[7]
- @ D17= y[2]=t[2]'+t[5]''
- VADD.S16 Q9, Q9, Q12 @ D18= y[1]=t[1]+t[6]
- @ D19= y[3]=t[3]'+t[4]''
- @ 8x4 transpose
- VTRN.16 Q10,Q11 @ Q10= c5c4a5a4 c7c6a7a6
- @ Q11= d5d4b5b4 d7d6b7b6
- VTRN.16 Q8, Q9 @ Q8 = c3c2a3a2 c1c0a1a0
- @ Q9 = d3d2b3b2 d1d0b1b0
- VSWP D20,D21 @ Q10= c7c6a7a6 c5c4a5a4
- VSWP D22,D23 @ Q11= d7d6b7b6 d5d4b5b4
- VUZP.32 Q9, Q11 @ Q9 = b7b6b5b4 b3b2b1b0
- @ Q11= d7d6d5d4 d3d2d1d0
- @ These two multiplies are hoisted out of the column pass to overlap
- @ with the remaining transpose work.
- VMULL.S16 Q15,D18,D0[1]
- VMULL.S16 Q13,D22,D1[1]
- VUZP.32 Q8, Q10 @ Q8 = a7a6a5a4 a3a2a1a0
- @ Q10= c7c6c5c4 c3c2c1c0
- @ Column transforms
- @ Stages 1, 2, & 3
- VMULL.S16 Q14,D19,D0[1] @ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
- VMULL.S16 Q12,D23,D1[1] @ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
- VMULL.S16 Q3, D22,D0[3]
- VMULL.S16 Q2, D23,D0[3] @ Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
- VSHRN.S32 D30,Q15,#16
- VSHRN.S32 D31,Q14,#16 @ Q15= (OC_C1S7*x[1]>>16)-x[1]
- VSHRN.S32 D26,Q13,#16
- VSHRN.S32 D27,Q12,#16 @ Q13= (OC_C5S3*x[3]>>16)-x[3]
- VSHRN.S32 D28,Q3, #16
- VSHRN.S32 D29,Q2, #16 @ Q14= (OC_C3S5*x[3]>>16)-x[3]
- VADD.S16 Q15,Q15,Q9 @ Q15= t[7]
- VADD.S16 Q13,Q13,Q11 @ Q13= -t[5]
- VADD.S16 Q14,Q14,Q11 @ Q14= t[6]
- VMULL.S16 Q12,D18,D1[3]
- VMULL.S16 Q2, D19,D1[3] @ Q2:Q12= OC_C7S1*x[1]
- VMULL.S16 Q1, D16,D1[0]
- VMULL.S16 Q11,D17,D1[0] @ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
- VMULL.S16 Q3, D20,D0[2]
- VMULL.S16 Q9, D21,D0[2] @ Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
- VSHRN.S32 D24,Q12,#16
- VSHRN.S32 D25,Q2, #16 @ Q12= t[4]
- VMULL.S16 Q2, D20,D1[2]
- VSHRN.S32 D2, Q1, #16
- VSHRN.S32 D3, Q11,#16 @ Q1 = (OC_C4S4*x[0]>>16)-x[0]
- VMULL.S16 Q11,D21,D1[2] @ Q2:Q11= OC_C6S2*x[2]
- VSHRN.S32 D6, Q3, #16
- VSHRN.S32 D7, Q9, #16 @ Q3 = (OC_C2S6*x[2]>>16)-x[2]
- VSUB.S16 Q9, Q15,Q14 @ Q9 = t[7]-t[6]
- VADD.S16 Q15,Q15,Q14 @ Q15= t[7]=t[7]+t[6]
- VSHRN.S32 D4, Q2, #16
- VSHRN.S32 D5, Q11,#16 @ Q2 = t[2]
- VADD.S16 Q1, Q1, Q8 @ Q1 = t[0]
- VADD.S16 Q8, Q12,Q13 @ Q8 = t[4]-t[5]
- VADD.S16 Q3, Q3, Q10 @ Q3 = t[3]
- VMULL.S16 Q10,D16,D1[0]
- VMULL.S16 Q11,D17,D1[0] @ Q11:Q10= OC_C4S4*(t[4]-t[5])
- @ -(t[4]-t[5]<<16)
- VSUB.S16 Q12,Q12,Q13 @ Q12= t[4]=t[4]+t[5]
- VMULL.S16 Q14,D18,D1[0]
- VMULL.S16 Q13,D19,D1[0] @ Q13:Q14= OC_C4S4*(t[7]-t[6])
- @ -(t[7]-t[6]<<16)
- VSHRN.S32 D20,Q10,#16
- VSHRN.S32 D21,Q11,#16 @ Q10= (OC_C4S4*(t[4]-t[5])>>16)
- @ -(t[4]-t[5])
- VADD.S16 Q11,Q1, Q3 @ Q11= t[0]=t[0]+t[3]
- VSUB.S16 Q3, Q1, Q3 @ Q3 = t[3]=t[0]-t[3]
- VSHRN.S32 D28,Q14,#16
- VSHRN.S32 D29,Q13,#16 @ Q14= (OC_C4S4*(t[7]-t[6])>>16)
- @ -(t[7]-t[6])
- VADD.S16 Q10,Q10,Q8 @ Q10=t[5]
- VADD.S16 Q14,Q14,Q9 @ Q14=t[6]
- VSUB.S16 Q13,Q14,Q10 @ Q13=t[5]=t[6]-t[5]
- VADD.S16 Q14,Q14,Q10 @ Q14=t[6]=t[6]+t[5]
- VADD.S16 Q10,Q1, Q2 @ Q10= t[1]=t[0]+t[2]
- VSUB.S16 Q2, Q1, Q2 @ Q2 = t[2]=t[0]-t[2]
- @ Stage 4
- VADD.S16 Q8, Q11,Q15 @ Q8 = y[0]=t[0]+t[7]
- VADD.S16 Q9, Q10,Q14 @ Q9 = y[1]=t[1]+t[6]
- VSUB.S16 Q15,Q11,Q15 @ Q15 = y[7]=t[0]-t[7]
- VSUB.S16 Q14,Q10,Q14 @ Q14 = y[6]=t[1]-t[6]
- VADD.S16 Q10,Q2, Q13 @ Q10 = y[2]=t[2]+t[5]
- VADD.S16 Q11,Q3, Q12 @ Q11 = y[3]=t[3]+t[4]
- VSUB.S16 Q12,Q3, Q12 @ Q12 = y[4]=t[3]-t[4]
- VSUB.S16 Q13,Q2, Q13 @ Q13 = y[5]=t[2]-t[5]
- @ Zero the loaded halves of the input rows at _x (stride r12=16),
- @ interleaved with the final rounding shifts.
- VMOV.I8 D2, #0
- VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
- VST1.64 {D2}, [r1,:64], r12
- VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
- VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
- VST1.64 {D2}, [r1,:64], r12
- VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
- VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
- VST1.64 {D2}, [r1,:64], r12
- VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
- VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
- VST1.64 {D2}, [r1,:64]
- VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
- @ Write all 8 output rows to _y in one burst.
- VSTMIA r0, {D16-D31}
- MOV PC, r14
- @ .size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon @ ENDP
- .endif
- @ END
- @ .section .note.GNU-stack,"",%progbits
- #endif
|